1
1
mirror of https://github.com/Kozea/WeasyPrint.git synced 2024-10-05 00:21:15 +03:00

Refactor PDF dictionaries.

This commit is contained in:
Simon Sapin 2012-05-20 13:00:39 +02:00
parent 6e7ef6b62a
commit 6498c3fb00

View File

@ -38,13 +38,6 @@ from . import VERSION_STRING
from .compat import xrange, iteritems
TRAILER_RE = re.compile(b'\ntrailer\n(.+)\nstartxref\n(\d+)\n%%EOF\n$',
re.DOTALL)
# No end delimiter is needed: + is greedy, so \w+ matches the whole type name.
DICT_TYPE_RE = re.compile(b'/Type /(\w+)')
class PDFDictionary(object):
def __init__(self, object_number, byte_string):
self.object_number = object_number
@ -54,35 +47,23 @@ class PDFDictionary(object):
return self.__class__.__name__ + repr(
(self.object_number, self.byte_string))
def get_type(self):
return DICT_TYPE_RE.search(self.byte_string).group(1).decode('ascii')
_re_cache = {}
# __cache is a shared mutable default used to memoize compiled regexes
# by key; it is not an actual parameter and callers must not pass it.
def get_indirect_object_number(self, key, __cache={}):
"""Read the value for `key`, assuming it is an indirect object.
:return: (int) the object number
"""
regex = __cache.get(key)
def _get_value(self, key, value_re):
regex = self._re_cache.get((key, value_re))
if not regex:
regex = re.compile(('/%s (\d+) 0 R' % key).encode('ascii'))
__cache[key] = regex
return int(regex.search(self.byte_string).group(1))
# __cache is a shared mutable default used to memoize compiled regexes
# by key; it is not an actual parameter and callers must not pass it.
def get_array(self, key, __cache={}):
"""Read the value for `key`, assuming it is an array.
:return: (bytes) the unparsed array content.
"""
regex = __cache.get(key)
if not regex:
regex = re.compile(('/%s \[([^\]]+)\]' % key).encode('ascii'))
__cache[key] = regex
regex = re.compile('/{} {}'.format(key, value_re).encode('ascii'))
self._re_cache[key, value_re] = regex
return regex.search(self.byte_string).group(1)
def get_type(self):
"""
:return: the value for the /Type key, decoded from ASCII.
"""
# No end delimiter is needed: + is greedy, so \w+ matches the whole type name.
return self._get_value('Type', '/(\w+)').decode('ascii')
def get_indirect_dict(self, key, pdf_file):
"""Read the value for `key` and follow the reference, assuming
it is an indirect dictionary object.
@ -90,7 +71,7 @@ class PDFDictionary(object):
:return: a new PDFDictionary instance.
"""
object_number = self.get_indirect_object_number(key)
object_number = int(self._get_value(key, '(\d+) 0 R'))
return type(self)(object_number, pdf_file.read_object(object_number))
def get_indirect_dict_array(self, key, pdf_file):
@ -100,7 +81,7 @@ class PDFDictionary(object):
:return: a list of new PDFDictionary instances.
"""
parts = self.get_array(key).split(b' 0 R')
parts = self._get_value(key, '\[([^\]]+)\]').split(b' 0 R')
# The array looks like this: ' <a> 0 R <b> 0 R <c> 0 R '
# so `parts` ends up like this: [' <a>', ' <b>', ' <c>', ' '],
# with the trailing white space as the last item of the list.
@ -117,11 +98,14 @@ class PDFFile(object):
A seekable binary file-like object for a PDF generated by cairo.
"""
trailer_re = re.compile(
b'\ntrailer\n(.+)\nstartxref\n(\d+)\n%%EOF\n$', re.DOTALL)
def __init__(self, fileobj):
# cairo's trailer only has Size, Root and Info.
# The trailer + startxref + EOF is typically under 100 bytes,
# so reading the last 200 bytes of the file is enough to find it.
fileobj.seek(-200, os.SEEK_END)
trailer, startxref = TRAILER_RE.search(fileobj.read()).groups()
trailer, startxref = self.trailer_re.search(fileobj.read()).groups()
trailer = PDFDictionary(None, trailer)
startxref = int(startxref)