1
1
mirror of https://github.com/Kozea/WeasyPrint.git synced 2024-09-11 20:47:56 +03:00

Only add AF values for PDF/A-2+

This commit is contained in:
Guillaume Ayoub 2024-02-02 16:42:10 +01:00
parent 05a34e7d9b
commit cec6e5ec3e
4 changed files with 74 additions and 57 deletions

View File

@@ -6,8 +6,8 @@ importing sub-modules.
"""
import contextlib
import datetime
import os
from datetime import datetime
from os.path import getctime, getmtime
from pathlib import Path
from urllib.parse import urljoin
@@ -310,46 +310,46 @@ class Attachment:
An instance is created in the same way as :class:`HTML`, except that the
HTML specific arguments (``encoding`` and ``media_type``) are not
supported. An optional description can be provided with the ``description``
argument.
supported.
:param description:
:param str description:
A description of the attachment to be included in the PDF document.
May be :obj:`None`.
:type created: :obj:`datetime.datetime`
:param created:
Creation date and time. Default is current date and time.
:type modified: :obj:`datetime.datetime`
:param modified:
Modification date and time. Default is current date and time.
:param str relationship:
A string that represents the relationship between the attachment and
the PDF it is embedded in. Default is 'Unspecified', other common
values are defined in ISO-32000-2:2020, 7.11.3.
"""
def __init__(self, guess=None, filename=None, url=None, file_obj=None,
string=None, base_url=None, url_fetcher=default_url_fetcher,
description=None, created=None, modified=None,
af_relationship="Source"):
relationship='Unspecified'):
self.source = _select_source(
guess, filename, url, file_obj, string, base_url=base_url,
url_fetcher=url_fetcher)
self.description = description
self.af_relationship = af_relationship
self.relationship = relationship
self.md5 = None
def epoch_to_pdf(epoch):
dt_object = datetime.datetime.fromtimestamp(epoch)
return datetime_to_pdf(dt_object)
def datetime_to_pdf(dt_object):
return dt_object.strftime("D:%Y%m%d%H%M%SZ")
if created:
self.created = created
else:
if created is None:
if filename:
self.created = epoch_to_pdf(os.path.getctime(filename))
created = datetime.fromtimestamp(getctime(filename))
else:
self.created = datetime_to_pdf(datetime.datetime.now())
if modified:
self.modified = modified
else:
created = datetime.now()
if modified is None:
if filename:
self.modified = epoch_to_pdf(os.path.getmtime(filename))
modified = datetime.fromtimestamp(getmtime(filename))
else:
self.modified = datetime_to_pdf(datetime.datetime.now())
modified = datetime.now()
self.created = created.strftime('D:%Y%m%d%H%M%SZ')
self.modified = modified.strftime('D:%Y%m%d%H%M%SZ')
@contextlib.contextmanager

View File

@@ -254,12 +254,9 @@ def generate_pdf(document, target, zoom, **options):
pdf_attachments.append(pdf_attachment)
if pdf_attachments:
content = pydyf.Dictionary({'Names': pydyf.Array()})
if 'AF' not in pdf.catalog:
pdf.catalog['AF'] = pydyf.Array()
for i, pdf_attachment in enumerate(pdf_attachments):
content['Names'].append(pydyf.String(f'attachment{i}'))
content['Names'].append(pdf_attachment.reference)
pdf.catalog['AF'].append(pdf_attachment.reference)
pdf.add_object(content)
if 'Names' not in pdf.catalog:
pdf.catalog['Names'] = pydyf.Dictionary()

View File

@@ -293,58 +293,54 @@ def write_pdf_attachment(pdf, attachment, url_fetcher, compress):
elif not isinstance(attachment, Attachment):
attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)
uncompressed_length = 0
stream = b''
md5 = hashlib.md5()
try:
with attachment.source as (source_type, source, url, _):
with attachment.source as (_, source, url, _):
if isinstance(source, bytes):
source = io.BytesIO(source)
uncompressed_length = 0
stream = b''
md5 = hashlib.md5()
for data in iter(lambda: source.read(4096), b''):
uncompressed_length += len(data)
md5.update(data)
stream += data
mime_type, _ = mimetypes.guess_type(url, strict=False)
if not mime_type:
mime_type = 'application/octet-stream'
mime_type = '/' + mime_type.replace('/', '#2f')
file_extra = pydyf.Dictionary({
'Type': '/EmbeddedFile',
"Subtype": mime_type,
'Params': pydyf.Dictionary({
'CheckSum': f'<{md5.hexdigest()}>',
'Size': uncompressed_length,
'CreationDate': attachment.created,
'ModDate': attachment.modified,
})
})
file_stream = pydyf.Stream([stream], file_extra, compress=compress)
pdf.add_object(file_stream)
except URLFetchingError as exception:
LOGGER.error('Failed to load attachment: %s', exception)
return
attachment.md5 = md5.hexdigest()
# TODO: Use the result object from a URL fetch operation to provide more
# details on the possible filename.
# details on the possible filename and MIME type.
if url and urlsplit(url).path:
filename = basename(unquote(urlsplit(url).path))
else:
filename = 'attachment.bin'
mime_type = mimetypes.guess_type(filename, strict=False)[0]
if not mime_type:
mime_type = 'application/octet-stream'
attachment = pydyf.Dictionary({
file_extra = pydyf.Dictionary({
'Type': '/EmbeddedFile',
'Subtype': f'/{mime_type.replace("/", "#2f")}',
'Params': pydyf.Dictionary({
'CheckSum': f'<{attachment.md5}>',
'Size': uncompressed_length,
'CreationDate': attachment.created,
'ModDate': attachment.modified,
})
})
file_stream = pydyf.Stream([stream], file_extra, compress=compress)
pdf.add_object(file_stream)
pdf_attachment = pydyf.Dictionary({
'Type': '/Filespec',
'F': pydyf.String(),
'UF': pydyf.String(filename),
"AFRelationship": "/"+attachment.af_relationship,
'EF': pydyf.Dictionary({'F': file_stream.reference}),
'Desc': pydyf.String(attachment.description or ''),
})
pdf.add_object(attachment)
if "AF" not in pdf.catalog:
pdf.catalog["AF"] = pydyf.Array()
pdf.catalog["AF"].append(attachment.reference)
return attachment
pdf.add_object(pdf_attachment)
return pdf_attachment
def resolve_links(pages):

View File

@@ -34,6 +34,30 @@ def pdfa(pdf, metadata, document, page_streams, compress, version):
}),
])
# Add AF for attachments
if version >= 2:
attachments = []
if 'Names' in pdf.catalog and 'EmbeddedFiles' in pdf.catalog['Names']:
reference = int(pdf.catalog['Names']['EmbeddedFiles'].split()[0])
names = pdf.objects[reference]
for name in names[1::2]:
attachments.append(name)
relationships = {
attachment.md5: attachment.relationship
for attachment in document.metadata.attachments
if attachment.md5}
for pdf_object in pdf.objects:
if isinstance(pdf_object, dict):
if pdf_object.get('Type') == '/Filespec':
checksum = pdf_object['CheckSum']
relationship = relationships.get(checksum, 'Unspecified')
pdf_object['AFRelationship'] = f'/{relationship}'
attachments.append(pdf_object.reference)
if attachments:
if 'AF' not in pdf.catalog:
pdf.catalog['AF'] = pydyf.Array()
pdf.catalog['AF'].extend(attachments)
# Print annotations
for pdf_object in pdf.objects:
if isinstance(pdf_object, dict) and pdf_object.get('Type') == '/Annot':