"""Test PDF-related code, including metadata, bookmarks and hyperlinks."""

import hashlib
import io
import os
import re
from codecs import BOM_UTF16_BE

import pytest
from weasyprint import Attachment
from weasyprint.document import Document, DocumentMetadata
from weasyprint.text.fonts import FontConfiguration
from weasyprint.urls import path2url

from .testing_utils import (
    FakeHTML, assert_no_logs, capture_logs, resource_filename)

# Top and right positions in points, rounded to the default float precision of
# 6 digits, as rendered by pydyf
TOP = round(297 * 72 / 25.4, 6)
RIGHT = round(210 * 72 / 25.4, 6)


@assert_no_logs
@pytest.mark.parametrize('zoom', (1, 1.5, 0.5))
def test_page_size_zoom(zoom):
    # NOTE(review): the literal opened below is closed by a triple quote many
    # lines later, so the markup (and probably more code) looks lost in this
    # copy of the file -- confirm against version control.
    pdf = FakeHTML(string='

1

2

3

4

5

6

7

8

9

10

11

''').write_pdf()
    # Expected outline tree (nesting follows the /Count values asserted below)
    # 1
    # 2
    # |_ 3
    # |_ 4
    # |  L_ 5
    # L_ 6
    # 7
    # L_ 8
    #    L_ 9
    # 10
    # L_ 11
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [
        str(i).encode() for i in range(1, 12)]
    counts = re.findall(b'/Count ([0-9-]*)', pdf)
    counts.pop(0)  # Page count
    # The last /Count found is compared to the total number of entries
    outlines = counts.pop()
    assert outlines == b'11'
    assert counts == [
        b'0', b'4', b'0', b'1', b'0', b'0', b'2', b'1', b'0', b'1', b'0']


@assert_no_logs
def test_bookmarks_5():
    # Titles 1 to 5 and their /Count values are checked after rendering
    pdf = FakeHTML(string='''

1

level 1

2

level 2

3

level 1

4

level 2

5

level 3 ''').write_pdf()
    # Expected outline tree (nesting follows the /Count values asserted below)
    # 1
    # L_ 2
    # 3
    # L_ 4
    #    L_ 5
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [
        str(i).encode() for i in range(1, 6)]
    counts = re.findall(b'/Count ([0-9-]*)', pdf)
    counts.pop(0)  # Page count
    # The last /Count found is compared to the total number of entries
    outlines = counts.pop()
    assert outlines == b'5'
    assert counts == [b'1', b'0', b'2', b'1', b'0']


@assert_no_logs
def test_bookmarks_6():
    # Titles 1 to 9 and their /Count values are checked after rendering
    pdf = FakeHTML(string='''

1

h2 level 1

2

h4 level 2

3

h3 level 2
4
h5 level 3

5

h1 level 1

6

h2 level 2

7

h2 level 2

8

h4 level 3

9

h1 level 1 ''').write_pdf()
    # Expected outline tree (nesting follows the /Count values asserted below)
    # 1
    # |_ 2
    # L_ 3
    #    L_ 4
    # 5
    # |_ 6
    # L_ 7
    #    L_ 8
    # 9
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [
        str(i).encode() for i in range(1, 10)]
    counts = re.findall(b'/Count ([0-9-]*)', pdf)
    counts.pop(0)  # Page count
    # The last /Count found is compared to the total number of entries
    outlines = counts.pop()
    assert outlines == b'9'
    assert counts == [b'3', b'0', b'1', b'0', b'3', b'0', b'1', b'0', b'0']


@assert_no_logs
def test_bookmarks_7():
    # Reference for the next test. zoom=1
    pdf = FakeHTML(string='

a

').write_pdf()
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [b'a']
    # Exactly one /Dest array is expected (the unpacking fails otherwise)
    dest, = re.findall(b'/Dest \\[(.*)\\]', pdf)
    # Keep the rounded second-to-last number of the destination array as the
    # reference value for the zoomed rendering
    y = round(float(dest.strip().split()[-2]))

    pdf = FakeHTML(string='

a

').write_pdf(zoom=1.5)
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [b'a']
    dest, = re.findall(b'/Dest \\[(.*)\\]', pdf)
    # The same destination number must be 1.5 times the zoom=1 reference
    assert round(float(dest.strip().split()[-2])) == 1.5 * y


@assert_no_logs
def test_bookmarks_8():
    # Titles a to g and their /Count values (one of them negative) are
    # checked after rendering
    pdf = FakeHTML(string='''

a

b

c

d

e

f

g

''').write_pdf()
    # Expected outline tree (nesting follows the /Count values asserted below)
    # a
    # |_ b
    # |  |_ c
    # |_ d (closed)
    # |  |_ e
    # |     |_ f
    # g
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [
        b'a', b'b', b'c', b'd', b'e', b'f', b'g']
    counts = re.findall(b'/Count ([0-9-]*)', pdf)
    counts.pop(0)  # Page count
    # The last /Count found is expected to be 5 here, not the 7 titles
    outlines = counts.pop()
    assert outlines == b'5'
    # The entry marked "(closed)" above carries a negative count
    assert counts == [b'3', b'1', b'0', b'-2', b'1', b'0', b'0']


@assert_no_logs
def test_bookmarks_9():
    # A single outline entry titled "h1 on page 1" is expected
    pdf = FakeHTML(string='''

a

''').write_pdf()
    counts = re.findall(b'/Count ([0-9-]*)', pdf)
    # Only the last /Count found is checked
    outlines = counts.pop()
    assert outlines == b'1'
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [b'h1 on page 1']


@assert_no_logs
def test_bookmarks_10():
    # Two outline entries, both titled "x", are expected
    pdf = FakeHTML(string='''
a
''').write_pdf()
    # x
    # x
    counts = re.findall(b'/Count ([0-9-]*)', pdf)
    # Only the last /Count found is checked
    outlines = counts.pop()
    assert outlines == b'2'
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [b'x', b'x']


@assert_no_logs
def test_bookmarks_11():
    # Two outline entries titled "a" and "b" are expected
    pdf = FakeHTML(string='''
a a a
b
c
''').write_pdf()
    # a
    # b
    counts = re.findall(b'/Count ([0-9-]*)', pdf)
    # Only the last /Count found is checked
    outlines = counts.pop()
    assert outlines == b'2'
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [b'a', b'b']


@assert_no_logs
def test_bookmarks_12():
    # A single outline entry titled "a" is expected
    pdf = FakeHTML(string='''
a
''').write_pdf()
    # a
    counts = re.findall(b'/Count ([0-9-]*)', pdf)
    # Only the last /Count found is checked
    outlines = counts.pop()
    assert outlines == b'1'
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [b'a']


@assert_no_logs
def test_bookmarks_13():
    # A single outline entry titled "a" is expected
    pdf = FakeHTML(string='''
a
''').write_pdf()
    # a
    counts = re.findall(b'/Count ([0-9-]*)', pdf)
    # Only the last /Count found is checked
    outlines = counts.pop()
    assert outlines == b'1'
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [b'a']


@assert_no_logs
def test_bookmarks_14():
    # Four outline entries are expected, three with multi-word titles
    pdf = FakeHTML(string='''

a

b c d

e f

g h i

''').write_pdf()
    # Only the last /Count found is checked
    assert re.findall(b'/Count ([0-9-]*)', pdf)[-1] == b'4'
    assert re.findall(b'/Title \\((.*)\\)', pdf) == [
        b'a', b'b c d', b'e f', b'g h i']


@assert_no_logs
def test_links_none():
    # An empty document must not contain any "Annots" entry
    pdf = FakeHTML(string='').write_pdf()
    assert b'Annots' not in pdf


@assert_no_logs
def test_links():
    # URIs, action types, annotation subtypes, rectangles and named
    # destinations are checked after rendering
    pdf = FakeHTML(string='''

Hello, World

a

''', base_url=resource_filename('')).write_pdf()

    # Collect every URI, action type, annotation subtype and rectangle found
    # in the PDF bytes; the assertions below consume them front to back
    uris = re.findall(b'/URI \\((.*)\\)', pdf)
    types = re.findall(b'/S (.*)', pdf)
    subtypes = re.findall(b'/Subtype (.*)', pdf)
    rects = [
        [float(number) for number in match.split()] for match in re.findall(
            b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]', pdf)]

    # 30pt wide (like the image), 20pt high (like line-height)
    assert uris.pop(0) == b'http://weasyprint.org'
    assert subtypes.pop(0) == b'/Link'
    assert types.pop(0) == b'/URI'
    assert rects.pop(0) == [0, TOP, 30, TOP - 20]

    # The image itself: 30*30pt
    assert uris.pop(0) == b'http://weasyprint.org'
    assert subtypes.pop(0) == b'/Link'
    assert types.pop(0) == b'/URI'
    assert rects.pop(0) == [0, TOP, 30, TOP - 30]

    # 32pt wide (image + 2 * 1pt of border), 20pt high
    assert subtypes.pop(0) == b'/Link'
    assert b'/Dest (lipsum)' in pdf
    # The three numbers after /XYZ in the "lipsum" named destination
    link = re.search(
        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
        pdf).group(1)
    assert [float(number) for number in link.split()] == [0, TOP, 0]
    assert rects.pop(0) == [10, TOP - 100, 10 + 32, TOP - 100 - 20]

    # The image itself: 32*32pt
    assert subtypes.pop(0) == b'/Link'
    assert rects.pop(0) == [10, TOP - 100, 10 + 32, TOP - 100 - 32]

    # 100% wide (block), 30pt high
    assert subtypes.pop(0) == b'/Link'
    assert b'/Dest (hello)' in pdf
    # The three numbers after /XYZ in the "hello" named destination
    link = re.search(
        b'\\(hello\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
        pdf).group(1)
    assert [float(number) for number in link.split()] == [0, TOP - 200, 0]
    assert rects.pop(0) == [0, TOP, RIGHT, TOP - 30]


@assert_no_logs
def test_sorted_links():
    # Regression test for https://github.com/Kozea/WeasyPrint/issues/1352
    pdf = FakeHTML(string='''

zzz

aaa

z a ''', base_url=resource_filename('')).write_pdf() assert b'(zzz) [' in pdf.split(b'(aaa) [')[-1] @assert_no_logs def test_relative_links_no_height(): # 100% wide (block), 0pt high pdf = FakeHTML( string='a', base_url='http://weasyprint.org/foo/bar/').write_pdf() assert b'/S /URI\n/URI (http://weasyprint.org/foo/lipsum)' assert f'/Rect [ 0 {TOP} {RIGHT} {TOP} ]'.encode() in pdf @assert_no_logs def test_relative_links_missing_base(): # Relative URI reference without a base URI pdf = FakeHTML( string='a', base_url=None).write_pdf() assert b'/S /URI\n/URI (../lipsum)' assert f'/Rect [ 0 {TOP} {RIGHT} {TOP} ]'.encode() in pdf @assert_no_logs def test_relative_links_missing_base_link(): # Relative URI reference without a base URI: not supported for -weasy-link with capture_logs() as logs: pdf = FakeHTML( string='
', base_url=None).write_pdf()
    # No annotation is written and exactly one warning is logged
    assert b'/Annots' not in pdf
    assert len(logs) == 1
    assert 'WARNING: Ignored `-weasy-link: url(../lipsum)`' in logs[0]
    assert 'Relative URI reference without a base URI' in logs[0]


@assert_no_logs
def test_relative_links_internal():
    # Internal URI reference without a base URI: OK
    pdf = FakeHTML(
        string='a',
        base_url=None).write_pdf()
    assert b'/Dest (lipsum)' in pdf
    # The three numbers after /XYZ in the "lipsum" named destination
    link = re.search(
        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
        pdf).group(1)
    assert [float(number) for number in link.split()] == [0, TOP, 0]
    # First /Rect array found in the PDF
    rect = re.search(
        b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
        pdf).group(1)
    assert [float(number) for number in rect.split()] == [0, TOP, RIGHT, TOP]


@assert_no_logs
def test_relative_links_anchors():
    # Same checks as the internal-reference test: a "lipsum" destination and
    # a zero-height rectangle
    pdf = FakeHTML(
        string='
a', base_url=None).write_pdf()
    assert b'/Dest (lipsum)' in pdf
    # The three numbers after /XYZ in the "lipsum" named destination
    link = re.search(
        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
        pdf).group(1)
    assert [float(number) for number in link.split()] == [0, TOP, 0]
    # First /Rect array found in the PDF
    rect = re.search(
        b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
        pdf).group(1)
    assert [float(number) for number in rect.split()] == [0, TOP, RIGHT, TOP]


@assert_no_logs
def test_relative_links_different_base():
    # The resolved absolute URL must appear in the PDF
    pdf = FakeHTML(
        string='a',
        base_url='http://weasyprint.org/foo/bar/').write_pdf()
    assert b'http://weasyprint.org/test/lipsum' in pdf


@assert_no_logs
def test_relative_links_same_base():
    # A "test" destination must appear in the PDF
    pdf = FakeHTML(
        string='a',
        base_url='http://weasyprint.org/foo/bar/').write_pdf()
    assert b'/Dest (test)' in pdf


@assert_no_logs
def test_missing_links():
    # One error about a missing anchor is logged; the "lipsum" destination
    # and the link rectangle are still written
    with capture_logs() as logs:
        pdf = FakeHTML(string=''' a ''', base_url=None).write_pdf()
    assert b'/Dest (lipsum)' in pdf
    assert len(logs) == 1
    # The three numbers after /XYZ in the "lipsum" named destination
    link = re.search(
        b'\\(lipsum\\) \\[ \\d+ 0 R /XYZ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+) ]',
        pdf).group(1)
    assert [float(number) for number in link.split()] == [0, TOP - 15, 0]
    # First /Rect array found in the PDF
    rect = re.search(
        b'/Rect \\[ ([\\d\\.]+ [\\d\\.]+ [\\d\\.]+ [\\d\\.]+) \\]',
        pdf).group(1)
    assert [float(number) for number in rect.split()] == [
        0, TOP, RIGHT, TOP - 15]
    assert 'ERROR: No anchor #missing for internal URI reference' in logs[0]


@assert_no_logs
def test_anchor_multiple_pages():
    # The "lipsum" name must point at the first object listed in /Kids
    pdf = FakeHTML(string='''
''', base_url=None).write_pdf()
    # Object number of the first entry in the /Kids array (exactly one /Kids
    # match is expected, the unpacking fails otherwise)
    first_page, = re.findall(b'/Kids \\[ (\\d+) 0 R', pdf)
    assert b'/Names [ (lipsum) [ ' + first_page in pdf


@assert_no_logs
def test_embed_gif():
    # The rendered PDF must not contain a /DCTDecode filter
    assert b'/Filter /DCTDecode' not in FakeHTML(
        base_url=resource_filename('dummy.html'),
        string='').write_pdf()


@assert_no_logs
def test_embed_jpeg():
    # JPEG-encoded image, embedded in PDF:
    assert b'/Filter /DCTDecode' in FakeHTML(
        base_url=resource_filename('dummy.html'),
        string='').write_pdf()


@assert_no_logs
def test_embed_image_once():
    # Image repeated multiple times, embedded once
    assert FakeHTML(
        base_url=resource_filename('dummy.html'),
        string='''
''').write_pdf().count(b'/Filter /DCTDecode') == 1


@assert_no_logs
def test_embed_images_from_pages():
    # Two separately rendered single-page documents are combined into one
    # Document; two /DCTDecode filters are expected in the output
    page1, = FakeHTML(
        base_url=resource_filename('dummy.html'),
        string='').render().pages
    page2, = FakeHTML(
        base_url=resource_filename('dummy.html'),
        string='').render().pages
    document = Document(
        (page1, page2), metadata=DocumentMetadata(),
        font_config=FontConfiguration(), url_fetcher=None,
        optimize_size=()).write_pdf()
    assert document.count(b'/Filter /DCTDecode') == 2


@assert_no_logs
def test_document_info():
    # Author, title, creator, keywords, subject and dates are looked up in
    # the rendered PDF
    pdf = FakeHTML(string=''' Test document

Another title

''').write_pdf()
    assert b'/Author (I Me & Myself)' in pdf
    assert b'/Title (Test document)' in pdf
    assert (
        b'/Creator ') in pdf
    assert b'/Keywords (html, css, pdf)' in pdf
    assert b'/Subject ' in pdf
    assert b'/CreationDate (20110421230000Z)' in pdf
    assert b"/ModDate (20130721234600+01'00)" in pdf


@assert_no_logs
def test_embedded_files_attachments(tmpdir):
    # Write two small files under tmpdir: one is turned into an absolute
    # file:// URL, the other is passed to format() by base name only
    absolute_tmp_file = tmpdir.join('some_file.txt').strpath
    adata = b'12345678'
    with open(absolute_tmp_file, 'wb') as afile:
        afile.write(adata)
    absolute_url = path2url(absolute_tmp_file)
    assert absolute_url.startswith('file://')

    relative_tmp_file = tmpdir.join('äöü.txt').strpath
    rdata = b'abcdefgh'
    with open(relative_tmp_file, 'wb') as rfile:
        rfile.write(rdata)

    pdf = FakeHTML(
        string=''' Test document

Heading 1

Heading 2

'''.format(absolute_url, os.path.basename(relative_tmp_file)),
        base_url=tmpdir.strpath,
    ).write_pdf(
        # Attachments passed as an Attachment object, a raw URL string and a
        # file-like object
        attachments=[
            Attachment('data:,oob attachment', description='Hello'),
            'data:,raw URL',
            io.BytesIO(b'file like obj')
        ]
    )
    # MD5 hex digest of b'hi there', wrapped in angle brackets
    assert '<{}>'.format(hashlib.md5(b'hi there').hexdigest()).encode() in pdf
    assert b'/F ()' in pdf
    assert b'/UF (attachment.bin)' in pdf
    # Non-ASCII description: UTF-16-BE with BOM, written as a hex string
    name = BOM_UTF16_BE + 'some file attachment äöü'.encode('utf-16-be')
    assert b'/Desc <' + name.hex().encode() + b'>' in pdf

    # Digest and base name of the file referenced by absolute URL
    assert hashlib.md5(adata).hexdigest().encode() in pdf
    assert os.path.basename(absolute_tmp_file).encode() in pdf

    # Digest of the file referenced by base name only
    assert hashlib.md5(rdata).hexdigest().encode() in pdf
    name = BOM_UTF16_BE + 'some file attachment äöü'.encode('utf-16-be')
    assert b'/Desc <' + name.hex().encode() + b'>' in pdf

    # Digests of the three attachments passed to write_pdf()
    assert hashlib.md5(b'oob attachment').hexdigest().encode() in pdf
    assert b'/Desc (Hello)' in pdf
    assert hashlib.md5(b'raw URL').hexdigest().encode() in pdf
    assert hashlib.md5(b'file like obj').hexdigest().encode() in pdf

    assert b'/EmbeddedFiles' in pdf
    assert b'/Outlines' in pdf


@assert_no_logs
def test_attachments_data():
    # MD5 hex digest of b'some data', wrapped in angle brackets, must appear
    # in the rendered PDF
    pdf = FakeHTML(string=''' Test document 2 ''').write_pdf()
    md5 = '<{}>'.format(hashlib.md5(b'some data').hexdigest()).encode()
    assert md5 in pdf


@assert_no_logs
def test_attachments_none():
    # No "Names" entry is expected, but "Outlines" is
    pdf = FakeHTML(string=''' Test document 3

Heading

''').write_pdf()
    assert b'Names' not in pdf
    assert b'Outlines' in pdf


@assert_no_logs
def test_attachments_none_empty():
    # Neither "Names" nor "Outlines" is expected
    pdf = FakeHTML(string=''' Test document 3 ''').write_pdf()
    assert b'Names' not in pdf
    assert b'Outlines' not in pdf


@assert_no_logs
def test_annotations():
    # The digest of b'some data' and a /FileAttachment entry are expected,
    # without any /EmbeddedFiles entry
    pdf = FakeHTML(
        string=''' Test document A link that lets you download an attachment '''
    ).write_pdf()
    assert hashlib.md5(b'some data').hexdigest().encode() in pdf
    assert b'/FileAttachment' in pdf
    assert b'/EmbeddedFiles' not in pdf


# Each case: page style, then the expected MediaBox, BleedBox and TrimBox
@pytest.mark.parametrize('style, media, bleed, trim', (
    ('bleed: 30pt; size: 10pt',
     [-30, -30, 40, 40],
     [-10, -10, 20, 20],
     [0, 0, 10, 10]),
    ('bleed: 15pt 3pt 6pt 18pt; size: 12pt 15pt',
     [-18, -15, 15, 21],
     [-10, -10, 15, 21],
     [0, 0, 12, 15]),
))
@assert_no_logs
def test_bleed(style, media, bleed, trim):
    # NOTE(review): the literal below has no % placeholder in this copy, so
    # "% style" cannot substitute anything -- presumably the original markup
    # carried one; confirm against version control.
    pdf = FakeHTML(string=''' Test document test ''' % style).write_pdf()
    assert '/MediaBox [ {} {} {} {} ]'.format(*media).encode() in pdf
    assert '/BleedBox [ {} {} {} {} ]'.format(*bleed).encode() in pdf
    assert '/TrimBox [ {} {} {} {} ]'.format(*trim).encode() in pdf