OLD | NEW |
(Empty) | |
| 1 """ |
| 2 This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error |
| 3 handler of Python 3. |
| 4 |
| 5 Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc |
| 6 """ |
| 7 |
| 8 # This code is released under the Python license and the BSD 2-clause license |
| 9 |
| 10 import codecs |
| 11 import sys |
| 12 |
| 13 from future import utils |
| 14 |
| 15 |
# Name of the codec error handler. It is passed as the ``errors`` argument by
# encodefilename()/decodefilename() and is the name under which
# register_surrogateescape() installs surrogateescape_handler.
FS_ERRORS = 'surrogateescape'

# # -- Python 2/3 compatibility -------------------------------------
# FS_ERRORS = 'my_surrogateescape'
| 20 |
def u(text):
    """
    Return ``text`` as a unicode string on both Python 2 and Python 3.

    On Python 3 the argument is returned unchanged. On Python 2 it is decoded
    with the 'unicode_escape' codec, so escape sequences written in a native
    str literal are turned into the characters they denote.
    """
    return text if utils.PY3 else text.decode('unicode_escape')
| 26 |
def b(data):
    """
    Return ``data`` as a byte string on both Python 2 and Python 3.

    On Python 2 the native str is already a byte string and is returned
    unchanged. On Python 3 the text is encoded with 'latin1'.
    """
    if not utils.PY3:
        return data
    return data.encode('latin1')
| 32 |
# Version-specific helpers: ``_unichr`` turns a code point into a one-character
# unicode string, ``bytes_chr`` turns an integer into a one-byte byte string.
if not utils.PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        return bytes((code,))
| 39 |
def surrogateescape_handler(exc):
    """
    Pure Python implementation of the PEP 383 "surrogateescape" error handler
    of Python 3.

    Undecodable bytes are replaced by a Unicode character U+DCxx on decoding,
    and these are translated back into the original bytes on encoding.

    ``exc`` is the UnicodeDecodeError or UnicodeEncodeError being handled.
    Returns the tuple ``(replacement, exc.end)``. ``exc`` itself is re-raised
    when it is neither of those two types or when the offending data cannot
    be escaped (the helper raised NotASurrogateError).
    """
    offending = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The offending slice is a byte string in this case.
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # NOTE: for u'\udcc3'.encode('ascii', <this handler>) both Python 2.x
        # and 3.x raise an exception anyway after this function is called,
        # even though the handler seems to do what it should. It appears that
        # the strict encoder is then called to encode the unicode string that
        # this function returns ...
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(offending)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
| 65 |
| 66 |
class NotASurrogateError(Exception):
    """Raised when data handed to a replace_surrogate_* helper is out of range."""


def replace_surrogate_encode(mystring):
    """
    Translate escaped characters U+DC00-U+DCFF back to the values they stand
    for (code point minus 0xDC00).

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    Raises NotASurrogateError if any character of ``mystring`` lies outside
    U+DC00-U+DCFF, so that the calling error handler can re-raise the
    original encoding error.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)
        # Only U+DC00-U+DCFF can be mapped back by subtracting 0xDC00. Any
        # other code point (including the high surrogates U+D800-U+DBFF,
        # which would give a negative value) is reported to the caller.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
| 98 |
| 99 |
def replace_surrogate_decode(mybytes):
    """
    Escape the bytes in ``mybytes`` and return them as a (unicode) string.

    Values up to 0x7F map to the character with the same code point; values
    in 0x80-0xFF map to U+DC00 + value. Anything larger raises
    NotASurrogateError.
    """
    pieces = []
    for item in mybytes:
        # Iterating newbytes yields ints, whereas iterating a native str on
        # Py2 yields one-character strings.
        value = item if isinstance(item, int) else ord(item)
        if value <= 0x7F:
            pieces.append(_unichr(value))
        elif value <= 0xFF:
            pieces.append(_unichr(0xDC00 + value))
        else:
            raise NotASurrogateError
    return str().join(pieces)
| 123 |
| 124 |
def encodefilename(fn):
    """
    Encode the unicode filename ``fn`` to bytes using FS_ENCODING, turning
    escaped characters U+DC80-U+DCFF back into the bytes 0x80-0xFF.

    The 'ascii' and 'utf-8' encodings are handled character by character
    here; every other encoding is delegated to ``fn.encode`` with the
    FS_ERRORS error handler. Raises UnicodeEncodeError for characters that
    cannot be encoded.
    """
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape error
        # handler has to return bytes in 0x80-0xFF range.
        chunks = []
        for position, char in enumerate(fn):
            codepoint = ord(char)
            if codepoint < 128:
                chunks.append(bytes_chr(codepoint))
            elif 0xDC80 <= codepoint <= 0xDCFF:
                chunks.append(bytes_chr(codepoint - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING, fn, position, position + 1,
                    'ordinal not in range(128)')
        return bytes().join(chunks)

    if FS_ENCODING == 'utf-8':
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler
        chunks = []
        for position, char in enumerate(fn):
            codepoint = ord(char)
            if not 0xD800 <= codepoint <= 0xDFFF:
                chunks.append(char.encode('utf-8'))
            elif 0xDC80 <= codepoint <= 0xDCFF:
                chunks.append(bytes_chr(codepoint - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING, fn, position, position + 1,
                    'surrogates not allowed')
        return bytes().join(chunks)

    return fn.encode(FS_ENCODING, FS_ERRORS)
| 163 |
def decodefilename(fn):
    """Decode the byte string ``fn`` with FS_ENCODING and the FS_ERRORS handler."""
    decoded = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded
| 166 |
# Encoding used by encodefilename()/decodefilename(), together with a sample
# byte string (``fn``) and its expected escaped form (``encoded``) used by the
# commented-out tests at the bottom of the file.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
| 175 |
| 176 |
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only).

    Does nothing on Python 3, or when a handler is already registered under
    the FS_ERRORS name.
    """
    if not utils.PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler with that name yet: install the pure-Python one.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
| 187 |
| 188 |
# Running the module as a script currently does nothing: the round-trip
# self-test below (decode ``fn``, compare with ``encoded``, encode back) is
# commented out.
if __name__ == '__main__':
    pass
    # # Tests:
    # register_surrogateescape()

    # b = decodefilename(fn)
    # assert b == encoded, "%r != %r" % (b, encoded)
    # c = encodefilename(b)
    # assert c == fn, '%r != %r' % (c, fn)
    # # print("ok")
| 199 |
| 200 |
OLD | NEW |