hfs+: rewrite percent-escaper (issue3918)

The original code was a bit too clever and got confused by some cp949 Korean text. This rewrite bytes the bullet and manually decodes UTF-8 sequences. Adds some doctests.
2024-10-06 23:07:18 +03:00 · 2013-05-04 14:51:21 -05:00 · 2013-05-04 14:51:21 -05:00 · 3a8df7d53a
commit 3a8df7d53a
parent c787225d04
1 changed files with 49 additions and 7 deletions
--- a/mercurial/posix.py
+++ b/mercurial/posix.py
@ -194,6 +194,22 @@ if sys.platform == 'darwin':
    import fcntl # only needed on darwin, missing on jython

    def normcase(path):
+        '''
+        Normalize a filename for OS X-compatible comparison:
+        - escape-encode invalid characters
+        - decompose to NFD
+        - lowercase
+
+        >>> normcase('UPPER')
+        'upper'
+        >>> normcase('Caf\xc3\xa9')
+        'cafe\\xcc\\x81'
+        >>> normcase('\xc3\x89')
+        'e\\xcc\\x81'
+        >>> normcase('\xb8\xca\xc3\xca\xbe\xc8.JPG') # issue3918
+        '%b8%ca%c3\\xca\\xbe%c8.jpg'
+        '''
+
        try:
            path.decode('ascii') # throw exception for non-ASCII character
            return path.lower()
@ -202,16 +218,42 @@ if sys.platform == 'darwin':
        try:
            u = path.decode('utf-8')
        except UnicodeDecodeError:
-            # percent-encode any characters that don't round-trip
-            p2 = path.decode('utf-8', 'ignore').encode('utf-8')
-            s = ""
-            pos = 0
+            # OS X percent-encodes any bytes that aren't valid utf-8
+            s = ''
+            g = ''
+            l = 0
            for c in path:
-                if p2[pos:pos + 1] == c:
+                o = ord(c)
+                if l and o < 128 or o >= 192:
+                    # we want a continuation byte, but didn't get one
+                    s += ''.join(["%%%02X" % ord(x) for x in g])
+                    g = ''
+                    l = 0
+                if l == 0 and o < 128:
+                    # ascii
                    s += c
-                    pos += 1
+                elif l == 0 and 194 <= o < 245:
+                    # valid leading bytes
+                    if o < 224:
+                        l = 1
+                    elif o < 240:
+                        l = 2
+                    else:
+                        l = 3
+                    g = c
+                elif l > 0 and 128 <= o < 192:
+                    # valid continuations
+                    g += c
+                    l -= 1
+                    if not l:
+                        s += g
+                        g = ''
                else:
-                    s += "%%%02X" % ord(c)
+                    # invalid
+                    s += "%%%02X" % o
+
+            # any remaining partial characters
+            s += ''.join(["%%%02X" % ord(x) for x in g])
            u = s.decode('utf-8')

        # Decompose then lowercase (HFS+ technote specifies lower)