From 59e4fb5146832dc81fb45ed56c3ad4872ef3622d Mon Sep 17 00:00:00 2001
From: Guillaume Ayoub <guillaume.ayoub@kozea.fr>
Date: Tue, 16 Aug 2016 22:15:18 +0200
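Subject: [PATCH] Change the way filenames are managed

Move FILESYSTEM_ENCODING from urls.py to compat.py, and wrap pathname2url
on Python 2 so that unicode paths are encoded with it before being passed
to urllib. iri_to_uri now always encodes URLs as UTF-8, and
default_url_fetcher reads file: URLs directly with open(). The explicit
encoding of unicode targets before write_to_png is removed from
document.py.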
---
 weasyprint/compat.py   | 17 ++++++++++++++++-
 weasyprint/document.py |  7 +------
 weasyprint/urls.py     | 21 +++++++--------------
 3 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/weasyprint/compat.py b/weasyprint/compat.py
index f1d7de84..c5e59e7a 100644
--- a/weasyprint/compat.py
+++ b/weasyprint/compat.py
@@ -12,6 +12,7 @@
 
 from __future__ import division, unicode_literals
 
+import codecs
 import sys
 import email
 
@@ -24,6 +25,15 @@ __all__ = ['Request', 'base64_decode', 'base64_encode', 'basestring',
            'urlparse_uses_relative', 'urlsplit', 'xrange']
 
 
+# Use UTF-8 if getfilesystemencoding() is falsy, ASCII or an unknown codec.
+FILESYSTEM_ENCODING = sys.getfilesystemencoding() or 'utf-8'
+try:
+    if codecs.lookup(FILESYSTEM_ENCODING).name == 'ascii':
+        FILESYSTEM_ENCODING = 'utf-8'
+except LookupError:  # raised by codecs.lookup() for an unknown encoding
+    FILESYSTEM_ENCODING = 'utf-8'
+
+
 if sys.version_info[0] >= 3:
     # Python 3
     from urllib.parse import (
@@ -63,7 +73,7 @@ else:
     from urlparse import (urljoin, urlsplit, parse_qs,
                           uses_relative as urlparse_uses_relative)
     from urllib2 import urlopen, Request
-    from urllib import pathname2url, quote, unquote, urlencode
+    from urllib import pathname2url as _pathname2url, quote, unquote, urlencode
     from array import array as _array
     from itertools import izip, imap
     from base64 import (decodestring as base64_decode,
@@ -77,6 +87,11 @@ else:
     def array(typecode, initializer):
         return _array(typecode.encode('ascii'), initializer)
 
+    def pathname2url(path):  # Python 2 wrapper around urllib.pathname2url
+        if isinstance(path, unicode):
+            path = path.encode(FILESYSTEM_ENCODING)  # unicode -> byte str
+        return _pathname2url(path)
+
     def urllib_get_content_type(urlobj):
         return urlobj.info().gettype()
 
diff --git a/weasyprint/document.py b/weasyprint/document.py
index 5808bf6c..a576df49 100644
--- a/weasyprint/document.py
+++ b/weasyprint/document.py
@@ -11,7 +11,6 @@
 from __future__ import division, unicode_literals
 
 import io
-import sys
 import math
 import shutil
 import functools
@@ -28,8 +27,7 @@ from .layout import layout_document
 from .layout.backgrounds import percentage
 from .draw import draw_page, stacked
 from .pdf import write_pdf_metadata
-from .compat import izip, iteritems, unicode
-from .urls import FILESYSTEM_ENCODING
+from .compat import izip, iteritems
 
 
 def _get_matrix(box):
@@ -551,9 +549,6 @@ class Document(object):
             surface.write_to_png(target)
             png_bytes = target.getvalue()
         else:
-            if sys.version_info[0] < 3 and isinstance(target, unicode):
-                # py2cairo 1.8 does not support unicode filenames.
-                target = target.encode(FILESYSTEM_ENCODING)
             surface.write_to_png(target)
             png_bytes = None
         return png_bytes, max_width, sum_heights
diff --git a/weasyprint/urls.py b/weasyprint/urls.py
index d4f48710..3cff11b8 100644
--- a/weasyprint/urls.py
+++ b/weasyprint/urls.py
@@ -15,7 +15,6 @@ from __future__ import division, unicode_literals
 import io
 import re
 import sys
-import codecs
 import os.path
 import mimetypes
 import contextlib
@@ -40,15 +39,6 @@ else:
     mimetypes.add_type(b'image/svg+xml', b'.svg')
 
 
-# getfilesystemencoding() on Linux is sometimes stupid...
-FILESYSTEM_ENCODING = sys.getfilesystemencoding() or 'utf-8'
-try:
-    if codecs.lookup(FILESYSTEM_ENCODING).name == 'ascii':
-        FILESYSTEM_ENCODING = 'utf-8'
-except LookupError:
-    FILESYSTEM_ENCODING = 'utf-8'
-
-
 # See http://stackoverflow.com/a/11687993/1162888
 # Both are needed in Python 3 as the re module does not like to mix
 # http://tools.ietf.org/html/rfc3986#section-3.1
@@ -64,8 +54,7 @@ def iri_to_uri(url):
         # Data URIs can be huge, but don’t need this anyway.
         return url
     # Use UTF-8 as per RFC 3987 (IRI), except for file://
-    url = url.encode(FILESYSTEM_ENCODING
-                     if url.startswith('file:') else 'utf-8')
+    url = url.encode('utf-8')
     # This is a full URI, not just a component. Only %-encode characters
     # that are not allowed at all in URIs. Everthing else is "safe":
     # * Reserved characters: /:?#[]@!$&'()*+,;=
@@ -82,8 +71,6 @@ def path2url(path):
         # Make sure directory names have a trailing slash.
         # Otherwise relative URIs are resolved from the parent directory.
         path += os.path.sep
-    if isinstance(path, unicode):
-        path = path.encode(FILESYSTEM_ENCODING)
     path = pathname2url(path)
     if path.startswith('///'):
         # On Windows pathname2url(r'C:\foo') is apparently '///C:/foo'
@@ -271,6 +258,12 @@ def default_url_fetcher(url):
     """
     if url.lower().startswith('data:'):
         return open_data_url(url)
+    elif url.lower().startswith('file://'):  # 'file:/x' falls through below
+        filename = unquote(url[7:])  # len('file://') == 7
+        mime_type = mimetypes.guess_type(filename)[0]
+        with open(filename, 'rb') as fd:
+            return dict(filename=filename, string=fd.read(),
+                        mime_type=mime_type, redirected_url=url)
     elif UNICODE_SCHEME_RE.match(url):
         url = iri_to_uri(url)
         response = urlopen(Request(url, headers=HTTP_HEADERS))