Index: third_party/google-endpoints/future/utils/surrogateescape.py |
diff --git a/third_party/google-endpoints/future/utils/surrogateescape.py b/third_party/google-endpoints/future/utils/surrogateescape.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..398c3531b63040335f86b316435aefcd3c1464ec |
--- /dev/null |
+++ b/third_party/google-endpoints/future/utils/surrogateescape.py |
@@ -0,0 +1,200 @@ |
+""" |
+This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error |
+handler of Python 3. |
+ |
+Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc |
+""" |
+ |
+# This code is released under the Python license and the BSD 2-clause license |
+ |
+import codecs |
+import sys |
+ |
+from future import utils |
+ |
+ |
# Name of the codec error handler used by this module: it is the name
# looked up and registered by register_surrogateescape() and the handler
# name passed to encode()/decode() by the filename helpers.
FS_ERRORS = 'surrogateescape'

# -- Python 2/3 compatibility -------------------------------------
# Disabled alternative handler name:
# FS_ERRORS = 'my_surrogateescape'
+ |
def u(text):
    """
    Return ``text`` as a text string.

    On Python 3 the argument is returned unchanged; otherwise it is decoded
    with the 'unicode_escape' codec.
    """
    return text if utils.PY3 else text.decode('unicode_escape')
+ |
def b(data):
    """
    Return ``data`` as a byte string.

    On Python 3 the argument is encoded with 'latin1'; otherwise it is
    returned unchanged.
    """
    return data.encode('latin1') if utils.PY3 else data
+ |
# Select the "code point -> text character" and "integer -> single byte"
# constructors that match the running interpreter.
if not utils.PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr
    bytes_chr = lambda code: bytes((code,))
+ |
def surrogateescape_handler(exc):
    """
    Pure Python implementation of PEP 383: the "surrogateescape" error
    handler of Python 3.

    Undecodable bytes are replaced by a Unicode character U+DCxx on
    decoding, and these are translated back on encoding.

    ``exc`` is the UnicodeDecodeError or UnicodeEncodeError being handled.
    Returns a ``(replacement, resume_position)`` tuple, where the resume
    position is ``exc.end``. Any other exception type, or a failing slice
    that cannot be converted, causes ``exc`` itself to be re-raised.
    """
    # The slice of the input that the codec could not process.
    fragment = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The fragment is a byte-string in this case.
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # In the case of u'\udcc3'.encode('ascii',
        # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
        # exception anyway after this function is called. It seems that the
        # strict encoder is called to encode the unicode string that this
        # function returns ...
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(fragment)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
+ |
+ |
class NotASurrogateError(Exception):
    """Raised when input cannot be treated as surrogate-escaped data."""


def replace_surrogate_encode(mystring):
    """
    Map every escape character of ``mystring`` back to its low-byte value.

    Each character in the range U+DC00..U+DCFF is replaced by the character
    whose ordinal is ``code - 0xDC00``.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    Raises NotASurrogateError if any character lies outside U+DC00..U+DCFF,
    so that the caller can fail with the original codec exception.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)
        # Anything outside U+DC00..U+DCFF cannot be mapped back to a value in
        # 0x00..0xFF. This includes the surrogates U+D800..U+DBFF, for which
        # ``code - 0xDC00`` would be negative.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
+ |
+ |
def replace_surrogate_decode(mybytes):
    """
    Convert a byte sequence to a (unicode) string, escaping high bytes.

    Values 0x00..0x7F become the character with the same ordinal; values
    0x80..0xFF become the character U+DC80..U+DCFF (``0xDC00 + value``).

    Raises NotASurrogateError for any value above 0xFF.
    """
    chars = []
    for item in mybytes:
        # Iterating may yield ints (e.g. newbytes) or one-character native
        # strings (str on Py2), so normalise to an integer first.
        value = item if isinstance(item, int) else ord(item)
        if value <= 0x7F:
            chars.append(_unichr(value))
        elif value <= 0xFF:
            chars.append(_unichr(0xDC00 + value))
        else:
            raise NotASurrogateError
    return str().join(chars)
+ |
+ |
def encodefilename(fn):
    """
    Encode the text filename ``fn`` to bytes using FS_ENCODING.

    Characters U+DC80..U+DCFF are written out as the single byte
    ``code - 0xDC00``. The 'ascii' and 'utf-8' encodings are handled by hand;
    every other encoding is delegated to ``fn.encode`` with the FS_ERRORS
    error handler.

    Raises UnicodeEncodeError for a character that cannot be encoded: a
    non-ASCII, non-escape character under 'ascii', or a surrogate outside
    U+DC80..U+DCFF under 'utf-8'.
    """
    if FS_ENCODING not in ('ascii', 'utf-8'):
        return fn.encode(FS_ENCODING, FS_ERRORS)

    pieces = []
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape error
        # handler has to return bytes in 0x80-0xFF range.
        for position, char in enumerate(fn):
            ordinal = ord(char)
            if ordinal < 128:
                pieces.append(bytes_chr(ordinal))
            elif 0xDC80 <= ordinal <= 0xDCFF:
                pieces.append(bytes_chr(ordinal - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING,
                    fn, position, position + 1,
                    'ordinal not in range(128)')
    else:
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler
        for position, char in enumerate(fn):
            ordinal = ord(char)
            if not 0xD800 <= ordinal <= 0xDFFF:
                pieces.append(char.encode('utf-8'))
            elif 0xDC80 <= ordinal <= 0xDCFF:
                pieces.append(bytes_chr(ordinal - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING,
                    fn, position, position + 1,
                    'surrogates not allowed')
    return bytes().join(pieces)
+ |
def decodefilename(fn):
    """
    Decode the byte filename ``fn`` using FS_ENCODING and the FS_ERRORS
    error handler, returning the result of ``fn.decode``.
    """
    decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded_name
+ |
# Active encoding plus a sample byte filename (``fn``) and the text it is
# expected to decode to (``encoded``).
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Alternative samples, disabled:
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the filesystem encoding name to the codec registry's canonical
# spelling. For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
+ |
+ |
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only).

    Does nothing on Python 3, and does nothing if a handler is already
    registered under the FS_ERRORS name.
    """
    if not utils.PY3:
        try:
            # Probe for an existing handler; LookupError means none is known.
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
+ |
+ |
if __name__ == '__main__':
    # Running the module as a script is a no-op; the round-trip self-test
    # below is disabled.
    #
    # # Tests:
    # register_surrogateescape()
    #
    # b = decodefilename(fn)
    # assert b == encoded, "%r != %r" % (b, encoded)
    # c = encodefilename(b)
    # assert c == fn, '%r != %r' % (c, fn)
    # # print("ok")
    pass
+ |
+ |