Change the way filenames are managed

2024-10-04 16:07:57 +03:00 · 2016-08-16 22:15:18 +02:00 · 2016-08-16 22:15:18 +02:00 · 59e4fb5146
commit 59e4fb5146
parent b94546079d
3 changed files with 24 additions and 21 deletions
--- a/weasyprint/compat.py
+++ b/weasyprint/compat.py
@ -12,6 +12,7 @@

 from __future__ import division, unicode_literals

+import codecs
 import sys
 import email

@ -24,6 +25,15 @@ __all__ = ['Request', 'base64_decode', 'base64_encode', 'basestring',
           'urlparse_uses_relative', 'urlsplit', 'xrange']


+# getfilesystemencoding() on Linux is sometimes stupid...
+FILESYSTEM_ENCODING = sys.getfilesystemencoding() or 'utf-8'
+try:
+    if codecs.lookup(FILESYSTEM_ENCODING).name == 'ascii':
+        FILESYSTEM_ENCODING = 'utf-8'
+except LookupError:
+    FILESYSTEM_ENCODING = 'utf-8'
+
+
 if sys.version_info[0] >= 3:
    # Python 3
    from urllib.parse import (
@ -63,7 +73,7 @@ else:
    from urlparse import (urljoin, urlsplit, parse_qs,
                          uses_relative as urlparse_uses_relative)
    from urllib2 import urlopen, Request
-    from urllib import pathname2url, quote, unquote, urlencode
+    from urllib import pathname2url as _pathname2url, quote, unquote, urlencode
    from array import array as _array
    from itertools import izip, imap
    from base64 import (decodestring as base64_decode,
@ -77,6 +87,11 @@ else:
    def array(typecode, initializer):
        return _array(typecode.encode('ascii'), initializer)

+    def pathname2url(path):
+        if isinstance(path, unicode):
+            path = path.encode(FILESYSTEM_ENCODING)
+        return _pathname2url(path)
+
    def urllib_get_content_type(urlobj):
        return urlobj.info().gettype()

--- a/weasyprint/document.py
+++ b/weasyprint/document.py
@ -11,7 +11,6 @@
 from __future__ import division, unicode_literals

 import io
-import sys
 import math
 import shutil
 import functools
@ -28,8 +27,7 @@ from .layout import layout_document
 from .layout.backgrounds import percentage
 from .draw import draw_page, stacked
 from .pdf import write_pdf_metadata
-from .compat import izip, iteritems, unicode
-from .urls import FILESYSTEM_ENCODING
+from .compat import izip, iteritems


 def _get_matrix(box):
@ -551,9 +549,6 @@ class Document(object):
            surface.write_to_png(target)
            png_bytes = target.getvalue()
        else:
-            if sys.version_info[0] < 3 and isinstance(target, unicode):
-                # py2cairo 1.8 does not support unicode filenames.
-                target = target.encode(FILESYSTEM_ENCODING)
            surface.write_to_png(target)
            png_bytes = None
        return png_bytes, max_width, sum_heights
--- a/weasyprint/urls.py
+++ b/weasyprint/urls.py
@ -15,7 +15,6 @@ from __future__ import division, unicode_literals
 import io
 import re
 import sys
-import codecs
 import os.path
 import mimetypes
 import contextlib
@ -40,15 +39,6 @@ else:
    mimetypes.add_type(b'image/svg+xml', b'.svg')


-# getfilesystemencoding() on Linux is sometimes stupid...
-FILESYSTEM_ENCODING = sys.getfilesystemencoding() or 'utf-8'
-try:
-    if codecs.lookup(FILESYSTEM_ENCODING).name == 'ascii':
-        FILESYSTEM_ENCODING = 'utf-8'
-except LookupError:
-    FILESYSTEM_ENCODING = 'utf-8'
-
-
 # See http://stackoverflow.com/a/11687993/1162888
 # Both are needed in Python 3 as the re module does not like to mix
 # http://tools.ietf.org/html/rfc3986#section-3.1
@ -64,8 +54,7 @@ def iri_to_uri(url):
        # Data URIs can be huge, but don’t need this anyway.
        return url
    # Use UTF-8 as per RFC 3987 (IRI), except for file://
-    url = url.encode(FILESYSTEM_ENCODING
-                     if url.startswith('file:') else 'utf-8')
+    url = url.encode('utf-8')
    # This is a full URI, not just a component. Only %-encode characters
    # that are not allowed at all in URIs. Everthing else is "safe":
    # * Reserved characters: /:?#[]@!$&'()*+,;=
@ -82,8 +71,6 @@ def path2url(path):
        # Make sure directory names have a trailing slash.
        # Otherwise relative URIs are resolved from the parent directory.
        path += os.path.sep
-    if isinstance(path, unicode):
-        path = path.encode(FILESYSTEM_ENCODING)
    path = pathname2url(path)
    if path.startswith('///'):
        # On Windows pathname2url(r'C:\foo') is apparently '///C:/foo'
@ -271,6 +258,12 @@ def default_url_fetcher(url):
    """
    if url.lower().startswith('data:'):
        return open_data_url(url)
+    elif url.lower().startswith('file:'):
+        filename = unquote(url[7:])  # len('file://') == 7
+        mime_type = mimetypes.guess_type(filename)[0]
+        with open(filename, 'rb') as fd:
+            return dict(filename=filename, string=fd.read(),
+                        mime_type=mime_type, redirected_url=url)
    elif UNICODE_SCHEME_RE.match(url):
        url = iri_to_uri(url)
        response = urlopen(Request(url, headers=HTTP_HEADERS))