mirror of
https://github.com/facebook/sapling.git
synced 2024-10-06 23:07:18 +03:00
hfs+: rewrite percent-escaper (issue3918)
The original code was a bit too clever and got confused by some cp949 Korean text. This rewrite bytes the bullet and manually decodes UTF-8 sequences, percent-encoding any bytes that are not valid UTF-8. It also adds some doctests.
This commit is contained in:
parent
c787225d04
commit
3a8df7d53a
@ -194,6 +194,22 @@ if sys.platform == 'darwin':
|
||||
import fcntl # only needed on darwin, missing on jython
|
||||
|
||||
def normcase(path):
|
||||
'''
|
||||
Normalize a filename for OS X-compatible comparison:
|
||||
- escape-encode invalid characters
|
||||
- decompose to NFD
|
||||
- lowercase
|
||||
|
||||
>>> normcase('UPPER')
|
||||
'upper'
|
||||
>>> normcase('Caf\xc3\xa9')
|
||||
'cafe\\xcc\\x81'
|
||||
>>> normcase('\xc3\x89')
|
||||
'e\\xcc\\x81'
|
||||
>>> normcase('\xb8\xca\xc3\xca\xbe\xc8.JPG') # issue3918
|
||||
'%b8%ca%c3\\xca\\xbe%c8.jpg'
|
||||
'''
|
||||
|
||||
try:
|
||||
path.decode('ascii') # throw exception for non-ASCII character
|
||||
return path.lower()
|
||||
@ -202,16 +218,42 @@ if sys.platform == 'darwin':
|
||||
try:
|
||||
u = path.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
# percent-encode any characters that don't round-trip
|
||||
p2 = path.decode('utf-8', 'ignore').encode('utf-8')
|
||||
s = ""
|
||||
pos = 0
|
||||
# OS X percent-encodes any bytes that aren't valid utf-8
|
||||
s = ''
|
||||
g = ''
|
||||
l = 0
|
||||
for c in path:
|
||||
if p2[pos:pos + 1] == c:
|
||||
o = ord(c)
|
||||
if l and o < 128 or o >= 192:
|
||||
# we want a continuation byte, but didn't get one
|
||||
s += ''.join(["%%%02X" % ord(x) for x in g])
|
||||
g = ''
|
||||
l = 0
|
||||
if l == 0 and o < 128:
|
||||
# ascii
|
||||
s += c
|
||||
pos += 1
|
||||
elif l == 0 and 194 <= o < 245:
|
||||
# valid leading bytes
|
||||
if o < 224:
|
||||
l = 1
|
||||
elif o < 240:
|
||||
l = 2
|
||||
else:
|
||||
l = 3
|
||||
g = c
|
||||
elif l > 0 and 128 <= o < 192:
|
||||
# valid continuations
|
||||
g += c
|
||||
l -= 1
|
||||
if not l:
|
||||
s += g
|
||||
g = ''
|
||||
else:
|
||||
s += "%%%02X" % ord(c)
|
||||
# invalid
|
||||
s += "%%%02X" % o
|
||||
|
||||
# any remaining partial characters
|
||||
s += ''.join(["%%%02X" % ord(x) for x in g])
|
||||
u = s.decode('utf-8')
|
||||
|
||||
# Decompose then lowercase (HFS+ technote specifies lower)
|
||||
|
Loading…
Reference in New Issue
Block a user