hfs+: rewrite percent-escaper (issue3918)

The original code was a bit too clever and got confused by some cp949
Korean text. This rewrite bites the bullet and manually decodes UTF-8
sequences byte by byte. It also adds some doctests.
This commit is contained in:
Matt Mackall 2013-05-04 14:51:21 -05:00
parent c787225d04
commit 3a8df7d53a

View File

@ -194,6 +194,22 @@ if sys.platform == 'darwin':
import fcntl # only needed on darwin, missing on jython
def normcase(path):
'''
Normalize a filename for OS X-compatible comparison:
- escape-encode invalid characters
- decompose to NFD
- lowercase
>>> normcase('UPPER')
'upper'
>>> normcase('Caf\xc3\xa9')
'cafe\\xcc\\x81'
>>> normcase('\xc3\x89')
'e\\xcc\\x81'
>>> normcase('\xb8\xca\xc3\xca\xbe\xc8.JPG') # issue3918
'%b8%ca%c3\\xca\\xbe%c8.jpg'
'''
try:
path.decode('ascii') # throw exception for non-ASCII character
return path.lower()
@ -202,16 +218,42 @@ if sys.platform == 'darwin':
try:
u = path.decode('utf-8')
except UnicodeDecodeError:
# percent-encode any characters that don't round-trip
p2 = path.decode('utf-8', 'ignore').encode('utf-8')
s = ""
pos = 0
# OS X percent-encodes any bytes that aren't valid utf-8
s = ''
g = ''
l = 0
for c in path:
if p2[pos:pos + 1] == c:
o = ord(c)
if l and o < 128 or o >= 192:
# we want a continuation byte, but didn't get one
s += ''.join(["%%%02X" % ord(x) for x in g])
g = ''
l = 0
if l == 0 and o < 128:
# ascii
s += c
pos += 1
elif l == 0 and 194 <= o < 245:
# valid leading bytes
if o < 224:
l = 1
elif o < 240:
l = 2
else:
l = 3
g = c
elif l > 0 and 128 <= o < 192:
# valid continuations
g += c
l -= 1
if not l:
s += g
g = ''
else:
s += "%%%02X" % ord(c)
# invalid
s += "%%%02X" % o
# any remaining partial characters
s += ''.join(["%%%02X" % ord(x) for x in g])
u = s.decode('utf-8')
# Decompose then lowercase (HFS+ technote specifies lower)