py3: fix revlog path encodings

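Summary: Make the store path-encoding helpers build their lookup tables from single bytes and do the mapping on UTF-8 encoded bytes, while still accepting and returning str paths. This is needed to move our hg servers to Python 3.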

Reviewed By: quark-zju

Differential Revision: D24204056

fbshipit-source-id: cbaf97893f8f77b535952ac290766f0fd5e14f0c
This commit is contained in:
Durham Goode 2020-10-09 13:30:10 -07:00 committed by Facebook GitHub Bot
parent 2207e27ce0
commit 43fe23f09d
4 changed files with 68 additions and 63 deletions

View File

@ -150,6 +150,9 @@ if sys.version_info[0] >= 3:
# type: (str) -> str
return value
def inttobyte(value):
    # type: (int) -> bytes
    """Return the one-byte ``bytes`` object whose single byte is ``value``.

    This is the variant shown under the ``sys.version_info[0] >= 3`` hunk.
    ``bytes()`` raises ``ValueError`` when ``value`` is outside ``range(256)``.
    """
    single = (value,)
    return bytes(single)
def parse_email(fp):
# Rarely used, so let's lazy load it
import email.parser
@ -253,6 +256,9 @@ else:
# type: (str) -> str
return value.decode("utf-8", "replace").encode("utf-8")
def inttobyte(value):
    # type: (int) -> str
    """Return the one-character string for code point ``value`` via ``chr``.

    This is the variant shown under the ``else:`` hunk, i.e. the
    non-Python-3 branch, where ``str`` is the byte-string type.
    """
    char = chr(value)
    return char
def parse_email(fp):
import email.parser

View File

@ -24,7 +24,7 @@ from edenscmnative import parsers
from . import encoding, error, hintutil, pycompat, util, vfs as vfsmod
from .i18n import _
from .node import bin
from .pycompat import decodeutf8, encodeutf8, range
from .pycompat import decodeutf8, encodeutf8, inttobyte, range
# This avoids a collision between a file named foo and a dir named
@ -144,14 +144,14 @@ def _buildencodefun(forfncache):
"""
e = "_"
xchr = pycompat.bytechr
asciistr = list(map(xchr, range(127)))
asciistr = list(map(inttobyte, range(127)))
capitals = list(range(ord("A"), ord("Z") + 1))
cmap = dict((x, x) for x in asciistr)
for x in _reserved():
cmap[xchr(x)] = "~%02x" % x
cmap[inttobyte(x)] = encodeutf8("~%02x" % x)
for x in capitals + [ord(e)]:
cmap[xchr(x)] = e + xchr(x).lower()
cmap[inttobyte(x)] = encodeutf8(e + xchr(x).lower())
dmap = {}
for k, v in cmap.items():
@ -161,28 +161,33 @@ def _buildencodefun(forfncache):
cmaplong = cmap.copy()
for i in capitals:
c = chr(i)
c = inttobyte(i)
cmaplong[c] = c
assert c not in dmap
dmap[c] = c
cmapverylong = cmaplong.copy()
cmapverylong["_"] = ":"
assert ":" not in dmap
dmap[":"] = "_"
cmapverylong[b"_"] = b":"
assert b":" not in dmap
dmap[b":"] = b"_"
# Encode a single path component. The component is mapped through cmap first;
# if that result is longer than 255 it is re-mapped through cmaplong, and if
# it is still longer than 255, through cmapverylong.
# NOTE(review): this span is a rendered diff with indentation stripped. Each
# `"".join(...)` line and the bare `return encoded` look like the pre-change
# text; the `b"".join(...)` / `decodeutf8(...)` line that follows each one
# looks like its replacement. Confirm against the real file before editing.
def encodecomp(comp):
encoded = "".join(cmap[c] for c in comp)
assert isinstance(comp, str), "encodecomp accepts str paths"
# Work on the UTF-8 encoded form; the result is decoded again on return.
comp = encodeutf8(comp)
# One-element slices, so every lookup key has the same type as the table
# keys, which are built with inttobyte().
comp = [comp[i : i + 1] for i in range(len(comp))]
encoded = b"".join(cmap[c] for c in comp)
if len(encoded) > 255:
encoded = "".join(cmaplong[c] for c in comp)
encoded = b"".join(cmaplong[c] for c in comp)
if len(encoded) > 255:
encoded = "".join(cmapverylong[c] for c in comp)
return encoded
encoded = b"".join(cmapverylong[c] for c in comp)
return decodeutf8(encoded)
def encodemaybelong(path):
    """Encode a "/"-separated path one component at a time.

    Every component is passed through ``encodecomp`` and the encoded
    components are joined back together with "/".
    """
    assert isinstance(path, str), "encodemaybelong accepts str paths"
    components = path.split("/")
    return "/".join([encodecomp(part) for part in components])
def decode(s):
assert isinstance(s, bytes), "decode accepts bytes paths"
i = 0
while i < len(s):
for l in range(1, 4):
@ -196,12 +201,18 @@ def _buildencodefun(forfncache):
raise KeyError
if forfncache:
return (
lambda s: "".join([cmap[s[c : c + 1]] for c in range(len(s))]),
lambda s: "".join(list(decode(s))),
)
def encode(s):
    """Map every UTF-8 byte of ``s`` through ``cmap`` and return a str.

    ``s`` must be a ``str``; it is encoded with ``encodeutf8``, translated one
    element at a time, and the joined result is passed to ``decodeutf8``.
    """
    assert isinstance(s, str), "encode accepts str paths"
    raw = encodeutf8(s)
    # One-element slices keep each lookup key the same type as the cmap keys,
    # which are built with inttobyte().
    pieces = [cmap[raw[pos : pos + 1]] for pos in range(len(raw))]
    return decodeutf8(b"".join(pieces))
return (encode, lambda s: decodeutf8(b"".join(list(decode(s)))))
else:
return (encodemaybelong, lambda s: "".join(list(decode(s))))
return (
encodemaybelong,
lambda s: decodeutf8(b"".join(list(decode(encodeutf8(s))))),
)
_encodefname, _decodefname = _buildencodefun(True)
@ -239,14 +250,15 @@ def _buildlowerencodefun():
'the~07quick~adshot'
"""
xchr = pycompat.bytechr
cmap = dict([(xchr(x), xchr(x)) for x in range(127)])
cmap = dict([(inttobyte(x), inttobyte(x)) for x in range(127)])
for x in _reserved():
cmap[xchr(x)] = "~%02x" % x
cmap[inttobyte(x)] = encodeutf8("~%02x" % x)
for x in range(ord("A"), ord("Z") + 1):
cmap[xchr(x)] = xchr(x).lower()
cmap[inttobyte(x)] = encodeutf8(xchr(x).lower())
def lowerencode(s):
    """Return ``s`` with each UTF-8 byte replaced by its ``cmap`` entry.

    ``s`` is encoded with ``encodeutf8``, every element is looked up in the
    ``cmap`` table built by the enclosing function, and the joined result is
    passed through ``decodeutf8``.
    """
    raw = encodeutf8(s)
    # cmap is keyed by the one-byte objects returned by inttobyte(). Iterating
    # a Python 3 bytes object yields ints, which would miss every key, so look
    # up one-element slices instead (the same form encode() uses).
    # Presumably encodeutf8() returns bytes on Python 3 - verify in pycompat.
    return decodeutf8(b"".join([cmap[raw[i : i + 1]] for i in range(len(raw))]))
return lowerencode

View File

@ -1,28 +1,40 @@
# encoding: utf-8
from __future__ import absolute_import, print_function
import sys
from edenscm.mercurial import store
from hghave import require
require(["py2"])
if sys.version_info[0] >= 3:

    # Bytes that get a dedicated backslash escape. Everything else is either
    # printable ASCII (kept as-is) or written as \xNN. The single quote is
    # always escaped, which stripping the delimiters off repr(bytes) does not
    # guarantee: repr() switches to double-quote delimiters, leaving "'"
    # unescaped, whenever the data contains "'" but no '"'.
    _STRING_ESCAPES = {
        0x09: "\\t",
        0x0A: "\\n",
        0x0D: "\\r",
        0x27: "\\'",
        0x5C: "\\\\",
    }

    def escape(s):
        """Return the UTF-8 bytes of ``s`` as a printable, backslash-escaped str.

        Intended to produce the same text as the ``string_escape`` codec used
        by the other branch, so the expected test output is the same on both
        Python versions.
        """
        out = []
        for byte in bytearray(s.encode("utf-8")):
            if byte in _STRING_ESCAPES:
                out.append(_STRING_ESCAPES[byte])
            elif 0x20 <= byte < 0x7F:
                out.append(chr(byte))
            else:
                out.append("\\x%02x" % byte)
        return "".join(out)


else:

    def escape(s):
        """Return ``s`` escaped with the ``string_escape`` codec."""
        return s.encode("string_escape")
# Print one test case: the input path as line "A", the store._pathencode()
# result as line "B" and, only when it differs from B, the
# store._hybridencode(s, True) result as line "R".
# NOTE(review): this span is a rendered diff with indentation stripped. Each
# print appears twice: the `.encode("string_escape")` form looks like the
# pre-change line and the `escape(...)` form like its replacement. Whether the
# final print() sits inside the `if` cannot be determined from this view.
def show(s):
# show test input
print("A = '%s'" % s.encode("string_escape"))
print("A = '%s'" % escape(s))
# show the result of the C implementation, if available
h = store._pathencode(s)
print("B = '%s'" % h.encode("string_escape"))
print("B = '%s'" % escape(h))
# compare it with reference implementation in Python
r = store._hybridencode(s, True)
if h != r:
print("R = '%s'" % r.encode("string_escape"))
print("R = '%s'" % escape(r))
print()
show("data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&'()+,-.;=[]^`{}")
show("data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&()+,-.;=[]^`{}")
print("uppercase char X is encoded as _x")
show("data/ABCDEFGHIJKLMNOPQRSTUVWXYZ")
@ -30,6 +42,9 @@ show("data/ABCDEFGHIJKLMNOPQRSTUVWXYZ")
print("underbar is doubled")
show("data/_")
print("unicode character")
show("data/🐐")
print("tilde is character-encoded")
show("data/~")
@ -39,25 +54,6 @@ show(
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
)
print("characters in ASCII code range 126..255")
show(
"data/\x7e\x7f"
"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
"\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
)
show(
"data/\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
"\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
)
show(
"data/\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
"\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
)
show(
"data/\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff"
)
print("Windows reserved characters")
show(
'data/less <, greater >, colon :, double-quote ", backslash \\'
@ -207,7 +203,7 @@ show(
print("not hitting limit with any of these")
show(
"data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&'()+,-.;="
"data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&()+,-.;="
"[]^`{}xxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-"
"123456789-12345"
)

View File

@ -1,5 +1,5 @@
A = 'data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&\'()+,-.;=[]^`{}'
B = 'data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&\'()+,-.;=[]^`{}'
A = 'data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&()+,-.;=[]^`{}'
B = 'data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&()+,-.;=[]^`{}'
uppercase char X is encoded as _x
A = 'data/ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@ -9,6 +9,10 @@ underbar is doubled
A = 'data/_'
B = 'data/__'
unicode character
A = 'data/\xf0\x9f\x90\x90'
B = 'data/~f0~9f~90~90'
tilde is character-encoded
A = 'data/~'
B = 'data/~7e'
@ -17,19 +21,6 @@ characters in ASCII code range 1..31
A = 'data/\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'
B = 'data/~01~02~03~04~05~06~07~08~09~0a~0b~0c~0d~0e~0f~10~11~12~13~14~15~16~17~18~19~1a~1b~1c~1d~1e~1f'
characters in ASCII code range 126..255
A = 'data/~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f'
B = 'data/~7e~7f~80~81~82~83~84~85~86~87~88~89~8a~8b~8c~8d~8e~8f~90~91~92~93~94~95~96~97~98~99~9a~9b~9c~9d~9e~9f'
A = 'data/\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf'
B = 'data/~a0~a1~a2~a3~a4~a5~a6~a7~a8~a9~aa~ab~ac~ad~ae~af~b0~b1~b2~b3~b4~b5~b6~b7~b8~b9~ba~bb~bc~bd~be~bf'
A = 'data/\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf'
B = 'data/~c0~c1~c2~c3~c4~c5~c6~c7~c8~c9~ca~cb~cc~cd~ce~cf~d0~d1~d2~d3~d4~d5~d6~d7~d8~d9~da~db~dc~dd~de~df'
A = 'data/\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'
B = 'data/~e0~e1~e2~e3~e4~e5~e6~e7~e8~e9~ea~eb~ec~ed~ee~ef~f0~f1~f2~f3~f4~f5~f6~f7~f8~f9~fa~fb~fc~fd~fe~ff'
Windows reserved characters
A = 'data/less <, greater >, colon :, double-quote ", backslash \\, pipe |, question-mark ?, asterisk *'
B = 'data/less ~3c, greater ~3e, colon ~3a, double-quote ~22, backslash ~5c, pipe ~7c, question-mark ~3f, asterisk ~2a'
@ -229,8 +220,8 @@ A = 'data/z23456789-123456789-123456789-123456789-123456789-xxxxxxxxx-xxxxxxxxx-
B = 'data/z23456789-123456789-123456789-123456789-123456789-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-123456789-12345'
not hitting limit with any of these
A = 'data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&\'()+,-.;=[]^`{}xxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-123456789-12345'
B = 'data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&\'()+,-.;=[]^`{}xxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-123456789-12345'
A = 'data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&()+,-.;=[]^`{}xxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-123456789-12345'
B = 'data/abcdefghijklmnopqrstuvwxyz0123456789 !#%&()+,-.;=[]^`{}xxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-123456789-12345'
underbar hitting length limit due to encoding
A = 'data/_23456789-123456789-123456789-123456789-123456789-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-xxxxxxxxx-123456789-12345'