From 59e4fb5146832dc81fb45ed56c3ad4872ef3622d Mon Sep 17 00:00:00 2001 From: Guillaume Ayoub Date: Tue, 16 Aug 2016 22:15:18 +0200 Subject: [PATCH] Change the way filenames are managed --- weasyprint/compat.py | 17 ++++++++++++++++- weasyprint/document.py | 7 +------ weasyprint/urls.py | 21 +++++++-------------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/weasyprint/compat.py b/weasyprint/compat.py index f1d7de84..c5e59e7a 100644 --- a/weasyprint/compat.py +++ b/weasyprint/compat.py @@ -12,6 +12,7 @@ from __future__ import division, unicode_literals +import codecs import sys import email @@ -24,6 +25,15 @@ __all__ = ['Request', 'base64_decode', 'base64_encode', 'basestring', 'urlparse_uses_relative', 'urlsplit', 'xrange'] +# getfilesystemencoding() on Linux is sometimes stupid... +FILESYSTEM_ENCODING = sys.getfilesystemencoding() or 'utf-8' +try: + if codecs.lookup(FILESYSTEM_ENCODING).name == 'ascii': + FILESYSTEM_ENCODING = 'utf-8' +except LookupError: + FILESYSTEM_ENCODING = 'utf-8' + + if sys.version_info[0] >= 3: # Python 3 from urllib.parse import ( @@ -63,7 +73,7 @@ else: from urlparse import (urljoin, urlsplit, parse_qs, uses_relative as urlparse_uses_relative) from urllib2 import urlopen, Request - from urllib import pathname2url, quote, unquote, urlencode + from urllib import pathname2url as _pathname2url, quote, unquote, urlencode from array import array as _array from itertools import izip, imap from base64 import (decodestring as base64_decode, @@ -77,6 +87,11 @@ else: def array(typecode, initializer): return _array(typecode.encode('ascii'), initializer) + def pathname2url(path): + if isinstance(path, unicode): + path = path.encode(FILESYSTEM_ENCODING) + return _pathname2url(path) + def urllib_get_content_type(urlobj): return urlobj.info().gettype() diff --git a/weasyprint/document.py b/weasyprint/document.py index 5808bf6c..a576df49 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -11,7 +11,6 @@ from __future__ import division, unicode_literals import io -import sys import math import shutil import functools @@ -28,8 +27,7 @@ from .layout import layout_document from .layout.backgrounds import percentage from .draw import draw_page, stacked from .pdf import write_pdf_metadata -from .compat import izip, iteritems, unicode -from .urls import FILESYSTEM_ENCODING +from .compat import izip, iteritems def _get_matrix(box): @@ -551,9 +549,6 @@ class Document(object): surface.write_to_png(target) png_bytes = target.getvalue() else: - if sys.version_info[0] < 3 and isinstance(target, unicode): - # py2cairo 1.8 does not support unicode filenames. - target = target.encode(FILESYSTEM_ENCODING) surface.write_to_png(target) png_bytes = None return png_bytes, max_width, sum_heights diff --git a/weasyprint/urls.py b/weasyprint/urls.py index d4f48710..3cff11b8 100644 --- a/weasyprint/urls.py +++ b/weasyprint/urls.py @@ -15,7 +15,6 @@ from __future__ import division, unicode_literals import io import re import sys -import codecs import os.path import mimetypes import contextlib @@ -40,15 +39,6 @@ else: mimetypes.add_type(b'image/svg+xml', b'.svg') -# getfilesystemencoding() on Linux is sometimes stupid... -FILESYSTEM_ENCODING = sys.getfilesystemencoding() or 'utf-8' -try: - if codecs.lookup(FILESYSTEM_ENCODING).name == 'ascii': - FILESYSTEM_ENCODING = 'utf-8' -except LookupError: - FILESYSTEM_ENCODING = 'utf-8' - - # See http://stackoverflow.com/a/11687993/1162888 # Both are needed in Python 3 as the re module does not like to mix # http://tools.ietf.org/html/rfc3986#section-3.1 @@ -64,8 +54,7 @@ def iri_to_uri(url): # Data URIs can be huge, but don’t need this anyway. return url # Use UTF-8 as per RFC 3987 (IRI), except for file:// - url = url.encode(FILESYSTEM_ENCODING - if url.startswith('file:') else 'utf-8') + url = url.encode('utf-8') # This is a full URI, not just a component. Only %-encode characters # that are not allowed at all in URIs. Everthing else is "safe": # * Reserved characters: /:?#[]@!$&'()*+,;= @@ -82,8 +71,6 @@ def path2url(path): # Make sure directory names have a trailing slash. # Otherwise relative URIs are resolved from the parent directory. path += os.path.sep - if isinstance(path, unicode): - path = path.encode(FILESYSTEM_ENCODING) path = pathname2url(path) if path.startswith('///'): # On Windows pathname2url(r'C:\foo') is apparently '///C:/foo' @@ -271,6 +258,12 @@ def default_url_fetcher(url): """ if url.lower().startswith('data:'): return open_data_url(url) + elif url.lower().startswith('file:'): + filename = unquote(url[7:]) # len('file://') == 7 + mime_type = mimetypes.guess_type(filename)[0] + with open(filename, 'rb') as fd: + return dict(filename=filename, string=fd.read(), + mime_type=mime_type, redirected_url=url) elif UNICODE_SCHEME_RE.match(url): url = iri_to_uri(url) response = urlopen(Request(url, headers=HTTP_HEADERS))