"""Test PDF-related code, including metadata, bookmarks and hyperlinks."""
import hashlib
import io
import os
import re
from codecs import BOM_UTF16_BE
import pytest
from weasyprint import Attachment
from weasyprint.document import Document, DocumentMetadata
from weasyprint.text.fonts import FontConfiguration
from weasyprint.urls import path2url
from .testing_utils import (
FakeHTML, assert_no_logs, capture_logs, resource_filename)
# Top and right positions in points, rounded to the default float precision of
# 6 digits, as rendered by pydyf
# 297 mm converted to points (72 pt per 25.4 mm); presumably the height of the
# default page -- confirm against the default page size.
TOP = round(297 * 72 / 25.4, 6)
# 210 mm converted to points the same way; presumably the default page width.
RIGHT = round(210 * 72 / 25.4, 6)
# NOTE(review): this block looks truncated. The string literal opened on the
# ``FakeHTML(string='`` line is not closed on that line, the ``zoom`` parameter
# is never used, and the assertions inspect ``/Count`` and ``/Title`` entries
# rather than a page size. Presumably the end of this test and the start of
# another one are missing -- restore from the upstream file before relying on
# it.
@assert_no_logs
@pytest.mark.parametrize('zoom', (1, 1.5, 0.5))
def test_page_size_zoom(zoom):
pdf = FakeHTML(string='
''').write_pdf()
# The last ``/Count`` value found in the output must be 4.
assert re.findall(b'/Count ([0-9-]*)', pdf)[-1] == b'4'
# Every ``/Title (...)`` entry, in the order it appears in the output.
assert re.findall(b'/Title \\((.*)\\)', pdf) == [
b'a', b'b c d', b'e f', b'g h i']
@assert_no_logs
def test_links_none():
    """Render an empty document and check it has no ``Annots`` entry."""
    rendered = FakeHTML(string='').write_pdf()
    has_annotations = b'Annots' in rendered
    assert not has_annotations
@assert_no_logs
def test_links():
    """Check URIs, named destinations and rectangles of ``/Link`` entries.

    The generated PDF is scanned with regular expressions; the matches are
    then compared, in output order, with the expected values.
    """
    def as_floats(raw):
        # Space-separated numbers (as bytes) -> list of floats.
        return [float(number) for number in raw.split()]

    document = FakeHTML(string='''
Hello, World
a
''', base_url=resource_filename(''))
    pdf = document.write_pdf()

    found_uris = re.findall(b'/URI \\((.*)\\)', pdf)
    action_types = re.findall(b'/S (.*)', pdf)
    annotation_subtypes = re.findall(b'/Subtype (.*)', pdf)
    raw_rectangles = re.findall(
        b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]', pdf)
    rectangles = [as_floats(raw) for raw in raw_rectangles]

    # 30pt wide (like the image), 20pt high (like line-height)
    assert found_uris[0] == b'http://weasyprint.org'
    assert annotation_subtypes[0] == b'/Link'
    assert action_types[0] == b'/URI'
    assert rectangles[0] == [0, TOP, 30, TOP - 20]

    # The image itself: 30*30pt
    assert found_uris[1] == b'http://weasyprint.org'
    assert annotation_subtypes[1] == b'/Link'
    assert action_types[1] == b'/URI'
    assert rectangles[1] == [0, TOP, 30, TOP - 30]

    # 32pt wide (image + 2 * 1pt of border), 20pt high
    assert annotation_subtypes[2] == b'/Link'
    assert b'/Dest (lipsum)' in pdf
    lipsum_match = re.search(
        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
        pdf)
    lipsum_target = lipsum_match.group(1)
    assert as_floats(lipsum_target) == [0, TOP, 0]
    assert rectangles[2] == [10, TOP - 100, 10 + 32, TOP - 100 - 20]

    # The image itself: 32*32pt
    assert annotation_subtypes[3] == b'/Link'
    assert rectangles[3] == [10, TOP - 100, 10 + 32, TOP - 100 - 32]

    # 100% wide (block), 30pt high
    assert annotation_subtypes[4] == b'/Link'
    assert b'/Dest (hello)' in pdf
    hello_match = re.search(
        b'\\(hello\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
        pdf)
    hello_target = hello_match.group(1)
    assert as_floats(hello_target) == [0, TOP - 200, 0]
    assert rectangles[4] == [0, TOP, RIGHT, TOP - 30]
@assert_no_logs
def test_sorted_links():
    """Check that a ``(zzz) [`` entry follows the last ``(aaa) [`` entry."""
    # Regression test for https://github.com/Kozea/WeasyPrint/issues/1352
    source = '''
zzz
aaa
za
'''
    document = FakeHTML(string=source, base_url=resource_filename(''))
    rendered = document.write_pdf()
    # Everything after the last "(aaa) [" (the whole output if it is absent).
    tail = rendered.rpartition(b'(aaa) [')[2]
    assert b'(zzz) [' in tail
@assert_no_logs
def test_relative_links_no_height():
    """Check the URI action and rectangle written for a relative link.

    The document is rendered with ``http://weasyprint.org/foo/bar/`` as base
    URL; the output must contain a URI action pointing to
    ``http://weasyprint.org/foo/lipsum`` and a zero-height rectangle spanning
    from 0 to ``RIGHT`` at ``TOP``.
    """
    # 100% wide (block), 0pt high
    pdf = FakeHTML(
        string='a',
        base_url='http://weasyprint.org/foo/bar/').write_pdf()
    # The expected bytes must be searched for in the output: asserting the
    # bare (non-empty) literal is always true and verifies nothing.
    assert b'/S /URI\n/URI (http://weasyprint.org/foo/lipsum)' in pdf
    assert f'/Rect [ 0 {TOP} {RIGHT} {TOP} ]'.encode() in pdf
@assert_no_logs
def test_relative_links_missing_base():
    """Check the URI action written for a relative link without a base URL.

    With ``base_url=None`` the output must contain a URI action whose target
    is the unresolved ``../lipsum`` reference, plus a zero-height rectangle
    spanning from 0 to ``RIGHT`` at ``TOP``.
    """
    # Relative URI reference without a base URI
    pdf = FakeHTML(
        string='a',
        base_url=None).write_pdf()
    # The expected bytes must be searched for in the output: asserting the
    # bare (non-empty) literal is always true and verifies nothing.
    assert b'/S /URI\n/URI (../lipsum)' in pdf
    assert f'/Rect [ 0 {TOP} {RIGHT} {TOP} ]'.encode() in pdf
@assert_no_logs
def test_relative_links_missing_base_link():
    """Check that a relative ``-weasy-link`` without base URL is ignored.

    Rendering must log exactly one warning about the ignored declaration and
    the output must not contain any ``/Annots`` entry.
    """
    # Relative URI reference without a base URI: not supported for -weasy-link
    with capture_logs() as logs:
        pdf = FakeHTML(
            # The literal used to be split across two lines (a syntax error)
            # and was empty, so the warning asserted below could never be
            # emitted.
            # NOTE(review): markup reconstructed from the expected warning
            # text -- confirm against the upstream test.
            string='<div style="-weasy-link: url(../lipsum)">',
            base_url=None).write_pdf()
    assert b'/Annots' not in pdf
    assert len(logs) == 1
    assert 'WARNING: Ignored `-weasy-link: url(../lipsum)`' in logs[0]
    assert 'Relative URI reference without a base URI' in logs[0]
@assert_no_logs
def test_relative_links_internal():
    """Check the ``lipsum`` destination and the first ``/Rect`` entry.

    The document is rendered without a base URL.
    """
    # Internal URI reference without a base URI: OK
    rendered = FakeHTML(string='a', base_url=None).write_pdf()
    assert b'/Dest (lipsum)' in rendered

    destination = re.search(
        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
        rendered)
    coordinates = list(map(float, destination.group(1).split()))
    assert coordinates == [0, TOP, 0]

    annotation = re.search(
        b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
        rendered)
    box = list(map(float, annotation.group(1).split()))
    assert box == [0, TOP, RIGHT, TOP]
@assert_no_logs
def test_relative_links_anchors():
    """Check the ``lipsum`` destination and the first ``/Rect`` entry."""
    def parse_numbers(match):
        # First captured group of ``match`` -> list of floats.
        return [float(value) for value in match.group(1).split()]

    html = FakeHTML(string='a', base_url=None)
    output = html.write_pdf()
    assert b'/Dest (lipsum)' in output

    target = parse_numbers(re.search(
        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
        output))
    assert target == [0, TOP, 0]

    rectangle = parse_numbers(re.search(
        b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
        output))
    assert rectangle == [0, TOP, RIGHT, TOP]
@assert_no_logs
def test_relative_links_different_base():
    """Check that ``http://weasyprint.org/test/lipsum`` is in the output.

    The document is rendered with ``http://weasyprint.org/foo/bar/`` as base
    URL.
    """
    document = FakeHTML(
        string='a', base_url='http://weasyprint.org/foo/bar/')
    rendered = document.write_pdf()
    expected_url = b'http://weasyprint.org/test/lipsum'
    assert expected_url in rendered
@assert_no_logs
def test_relative_links_same_base():
    """Check that a ``/Dest (test)`` entry is in the output.

    The document is rendered with ``http://weasyprint.org/foo/bar/`` as base
    URL.
    """
    document = FakeHTML(
        string='a', base_url='http://weasyprint.org/foo/bar/')
    rendered = document.write_pdf()
    expected_destination = b'/Dest (test)'
    assert expected_destination in rendered
@assert_no_logs
def test_missing_links():
    """Check output and log when an internal link targets a missing anchor.

    Exactly one message must be logged, reporting the ``#missing`` anchor;
    the ``lipsum`` destination and the first ``/Rect`` entry are still
    checked against their expected coordinates.
    """
    def parse_numbers(raw):
        # Space-separated numbers (as bytes) -> list of floats.
        return [float(number) for number in raw.split()]

    with capture_logs() as captured:
        document = FakeHTML(string='''
a
''', base_url=None)
        rendered = document.write_pdf()

    assert b'/Dest (lipsum)' in rendered
    assert len(captured) == 1

    destination = re.search(
        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
        rendered)
    assert parse_numbers(destination.group(1)) == [0, TOP - 15, 0]

    annotation = re.search(
        b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
        rendered)
    expected_box = [0, TOP, RIGHT, TOP - 15]
    assert parse_numbers(annotation.group(1)) == expected_box

    first_message = captured[0]
    assert 'ERROR: No anchor #missing for internal URI reference' in (
        first_message)
# NOTE(review): the name refers to anchors, but the body renders a document
# whose markup is a single newline and then checks /Author, /Title, /Creator,
# /Keywords, /Subject and date entries. Presumably the end of this test and
# the start of a metadata test were lost -- confirm against the upstream file
# before relying on it.
@assert_no_logs
def test_anchor_multiple_pages():
pdf = FakeHTML(string='''
''').write_pdf()
# Entries that must appear verbatim in the output.
assert b'/Author (I Me & Myself)' in pdf
assert b'/Title (Test document)' in pdf
# Only the key is checked for /Creator and /Subject, not their values.
assert (
b'/Creator ') in pdf
assert b'/Keywords (html, css, pdf)' in pdf
assert b'/Subject ' in pdf
# Dates are expected in the ``YYYYMMDDHHMMSS`` form followed by ``Z`` or an
# offset such as ``+01'00``.
assert b'/CreationDate (20110421230000Z)' in pdf
assert b"/ModDate (20130721234600+01'00)" in pdf
@assert_no_logs
def test_embedded_files_attachments(tmpdir):
    """Check checksums, names and descriptions of files embedded in the PDF.

    Two files are written into ``tmpdir``; three more attachments (an
    ``Attachment`` object, a raw data URL and a file-like object) are passed
    to ``write_pdf``. The MD5 digest of each payload is then looked up in the
    output.
    """
    def md5_hex(payload):
        # Hexadecimal MD5 digest of ``payload``, as text.
        return hashlib.md5(payload).hexdigest()

    absolute_path = tmpdir.join('some_file.txt').strpath
    absolute_payload = b'12345678'
    with open(absolute_path, 'wb') as stream:
        stream.write(absolute_payload)
    absolute_url = path2url(absolute_path)
    assert absolute_url.startswith('file://')

    relative_path = tmpdir.join('äöü.txt').strpath
    relative_payload = b'abcdefgh'
    with open(relative_path, 'wb') as stream:
        stream.write(relative_payload)

    markup = '''
Test document
Heading 1
Heading 2
'''.format(absolute_url, os.path.basename(relative_path))
    document = FakeHTML(string=markup, base_url=tmpdir.strpath)
    extra_attachments = [
        Attachment('data:,oob attachment', description='Hello'),
        'data:,raw URL',
        io.BytesIO(b'file like obj')
    ]
    pdf = document.write_pdf(attachments=extra_attachments)

    # Description written as a hexadecimal UTF-16-BE string with its BOM.
    encoded_name = (
        BOM_UTF16_BE + 'some file attachment äöü'.encode('utf-16-be'))
    description_entry = b'/Desc <' + encoded_name.hex().encode() + b'>'

    assert '<{}>'.format(md5_hex(b'hi there')).encode() in pdf
    assert b'/F ()' in pdf
    assert b'/UF (attachment.bin)' in pdf
    assert description_entry in pdf

    assert md5_hex(absolute_payload).encode() in pdf
    assert os.path.basename(absolute_path).encode() in pdf

    assert md5_hex(relative_payload).encode() in pdf
    assert description_entry in pdf

    assert md5_hex(b'oob attachment').encode() in pdf
    assert b'/Desc (Hello)' in pdf
    assert md5_hex(b'raw URL').encode() in pdf
    assert md5_hex(b'file like obj').encode() in pdf

    assert b'/EmbeddedFiles' in pdf
    assert b'/Outlines' in pdf
@assert_no_logs
def test_attachments_data():
    """Check that the bracketed MD5 digest of ``b'some data'`` is written."""
    source = '''
Test document 2
'''
    rendered = FakeHTML(string=source).write_pdf()
    digest = hashlib.md5(b'some data').hexdigest()
    expected_entry = '<{}>'.format(digest).encode()
    assert expected_entry in rendered
@assert_no_logs
def test_attachments_none():
    """Check that the output has ``Outlines`` but no ``Names``."""
    source = '''
Test document 3
Heading
'''
    rendered = FakeHTML(string=source).write_pdf()
    assert b'Names' not in rendered
    assert b'Outlines' in rendered
@assert_no_logs
def test_attachments_none_empty():
    """Check that the output has neither ``Names`` nor ``Outlines``."""
    source = '''
Test document 3
'''
    rendered = FakeHTML(string=source).write_pdf()
    for forbidden in (b'Names', b'Outlines'):
        assert forbidden not in rendered
@assert_no_logs
def test_annotations():
    """Check that the payload is attached through a ``/FileAttachment``.

    The MD5 digest of ``b'some data'`` must be in the output and no
    ``/EmbeddedFiles`` entry may be written.
    """
    source = '''
Test documentA link that lets you download an attachment
'''
    rendered = FakeHTML(string=source).write_pdf()
    payload_digest = hashlib.md5(b'some data').hexdigest().encode()
    assert payload_digest in rendered
    assert b'/FileAttachment' in rendered
    assert b'/EmbeddedFiles' not in rendered
@pytest.mark.parametrize('style, media, bleed, trim', (
    ('bleed: 30pt; size: 10pt',
     [-30, -30, 40, 40],
     [-10, -10, 20, 20],
     [0, 0, 10, 10]),
    ('bleed: 15pt 3pt 6pt 18pt; size: 12pt 15pt',
     [-18, -15, 15, 21],
     [-10, -10, 15, 21],
     [0, 0, 12, 15]),
))
@assert_no_logs
def test_bleed(style, media, bleed, trim):
    """Check the media, bleed and trim boxes written for ``@page`` rules.

    ``style`` holds the declarations injected into the page rule; ``media``,
    ``bleed`` and ``trim`` are the four numbers expected in the
    ``/MediaBox``, ``/BleedBox`` and ``/TrimBox`` entries respectively.
    """
    # The template needs a ``%s`` conversion: without one, ``% style`` raises
    # ``TypeError`` and the parametrized declarations never reach the page.
    # NOTE(review): markup reconstructed (title, ``@page`` rule, body) --
    # confirm against the upstream test.
    pdf = FakeHTML(string='''
<title>Test document</title>
<style>@page { %s }</style>
<body>test
''' % style).write_pdf()
    assert '/MediaBox [ {} {} {} {} ]'.format(*media).encode() in pdf
    assert '/BleedBox [ {} {} {} {} ]'.format(*bleed).encode() in pdf
    assert '/TrimBox [ {} {} {} {} ]'.format(*trim).encode() in pdf