mirror of
https://github.com/facebook/sapling.git
synced 2024-12-26 06:21:48 +03:00
py3: fix unicode characters in patches
Summary: When I added the surrogateescape patch for the email parser decoder used during patches, I incorrectly added a corresponding encoder on the other end when we get the data out of the parser. It turns out the parser is smart/dumb. When using get_payload() it attempts a few different decodings of the data and ends up replacing all the non-ascii characters with replacement bits (question marks). Instead we should use get_payload(decode=True), which bizarrely actually encodes the data into bytes, correctly detecting the presence of surrogates and using the correct ascii+surrogateescape encoding. Reviewed By: singhsrb Differential Revision: D23720111 fbshipit-source-id: ed40a15056c39730c91067b830f194fbe41e5788
This commit is contained in:
parent
f2e0da3af5
commit
9f80bd1d6f
@ -251,12 +251,6 @@ def extract(ui, fileobj):
|
||||
ui.debug("Content-Type: %s\n" % content_type)
|
||||
if content_type not in ok_types:
|
||||
continue
|
||||
if sys.version_info[0] >= 3:
|
||||
# The message was surrogateescape encoded, so we need to undo
|
||||
# that.
|
||||
payload = part.get_payload()
|
||||
payload = payload.encode("ascii", errors="surrogateescape")
|
||||
else:
|
||||
payload = part.get_payload(decode=True)
|
||||
m = diffre.search(payload)
|
||||
if m:
|
||||
|
@ -157,8 +157,10 @@ if sys.version_info[0] >= 3:
|
||||
|
||||
ep = email.parser.Parser()
|
||||
# disable the "universal newlines" mode, which isn't binary safe.
|
||||
# We'll have to use surrogateescape when encoding the string back to
|
||||
# bytes later.
|
||||
# Note, although we specify ascii+surrogateescape decoding here, we don't have
|
||||
# to specify it elsewhere for reencoding as the email.parser detects the
|
||||
# surrogates and automatically chooses the appropriate encoding.
|
||||
# See: https://github.com/python/cpython/blob/3.8/Lib/email/message.py::get_payload()
|
||||
fp = io.TextIOWrapper(
|
||||
fp, encoding=r"ascii", errors=r"surrogateescape", newline=chr(10)
|
||||
)
|
||||
|
@ -1,3 +1,4 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) Facebook, Inc. and its affiliates.
|
||||
# Copyright (c) Mercurial Contributors.
|
||||
#
|
||||
@ -1497,3 +1498,12 @@ sh % "printf 'diff --git a/a b/b\\nrename from a\\nrename to b'" | "hg import -"
|
||||
a not tracked!
|
||||
abort: source file 'a' does not exist
|
||||
[255]"""
|
||||
|
||||
# Verify that utf-8 characters in patches can be imported
|
||||
open("unicode.txt", "w").write("echo 🍺")
|
||||
sh % "hg commit -Aqm unicode"
|
||||
sh % "hg rm unicode.txt"
|
||||
sh % "hg commit -qm remove"
|
||||
sh % "hg export --rev 'desc(unicode)'" | "hg import -" == r"""
|
||||
applying patch from stdin
|
||||
"""
|
||||
|
Loading…
Reference in New Issue
Block a user