Index: third_party/cython/src/Cython/Compiler/StringEncoding.py |
diff --git a/third_party/cython/src/Cython/Compiler/StringEncoding.py b/third_party/cython/src/Cython/Compiler/StringEncoding.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..4d84afa209fc06133a70769e51b48bc3d669a9c6 |
--- /dev/null |
+++ b/third_party/cython/src/Cython/Compiler/StringEncoding.py |
@@ -0,0 +1,311 @@ |
+# |
+# Cython -- encoding related tools |
+# |
+ |
+import re |
+import sys |
+ |
# Select the text/bytes types once, based on the running major version,
# so the rest of the module can refer to them by a single name.
IS_PYTHON3 = sys.version_info[0] >= 3
if IS_PYTHON3:
    _unicode, _str, _bytes = str, str, bytes
else:
    _unicode, _str, _bytes = unicode, str, str

# Empty instances of the byte string and unicode string types.
empty_bytes = _bytes()
empty_unicode = _unicode()

# Bound ``join`` of the empty byte string: concatenates byte strings.
join_bytes = empty_bytes.join
+ |
class UnicodeLiteralBuilder(object):
    """Assemble a unicode string.

    Text fragments and single code points are collected in ``self.chars``
    and joined into one ``EncodedString`` by ``getstring()``.
    """
    # Code point -> one-character unicode string.  ``unichr`` only exists
    # on Python 2; on Python 3 the builtin ``chr`` does the same job.
    try:
        _chr = staticmethod(unichr)
    except NameError:
        _chr = staticmethod(chr)

    def __init__(self):
        self.chars = []

    def append(self, characters):
        """Append a text fragment; byte strings are decoded as ASCII first."""
        if isinstance(characters, _bytes):
            # this came from a Py2 string literal in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Append the character with code point ``char_number``."""
            if char_number > 65535:
                # wide Unicode character on narrow platform => replace
                # by surrogate pair
                high, low = divmod(char_number - 0x10000, 1024)
                self.chars.append(self._chr(high + 0xD800))
                self.chars.append(self._chr(low + 0xDC00))
            else:
                self.chars.append(self._chr(char_number))
    else:
        def append_charval(self, char_number):
            """Append the character with code point ``char_number``."""
            self.chars.append(self._chr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Append the character of a unicode escape.

        Only ``char_number`` is used here; ``escape_string`` is ignored.
        """
        self.append_charval(char_number)

    def getstring(self):
        """Return everything collected so far as one ``EncodedString``."""
        return EncodedString(u''.join(self.chars))

    def getstrings(self):
        """Return a ``(bytes, unicode)`` pair; the bytes slot is None."""
        return (None, self.getstring())
+ |
+ |
class BytesLiteralBuilder(object):
    """Assemble a byte string or char value.

    Byte fragments are collected in ``self.chars``; unicode fragments are
    encoded with ``target_encoding`` before being stored.
    """
    # Code point -> one-character unicode string.  ``unichr`` only exists
    # on Python 2; on Python 3 the builtin ``chr`` does the same job.
    try:
        _chr = staticmethod(unichr)
    except NameError:
        _chr = staticmethod(chr)

    def __init__(self, target_encoding):
        self.chars = []
        self.target_encoding = target_encoding

    def append(self, characters):
        """Append a fragment, encoding unicode input to the target encoding."""
        if isinstance(characters, _unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, _bytes), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        """Append the single byte whose value is ``char_number``."""
        # ISO-8859-1 maps code points 0-255 onto the identical byte value
        self.chars.append(self._chr(char_number).encode('ISO-8859-1'))

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape as its literal escape text.

        Only ``escape_string`` is used here; ``char_number`` is ignored.
        """
        self.append(escape_string)

    def getstring(self):
        """Return the collected bytes as a ``BytesLiteral``."""
        # this *must* return a byte string!
        s = BytesLiteral(join_bytes(self.chars))
        s.encoding = self.target_encoding
        return s

    def getchar(self):
        """Return the collected bytes; same result as ``getstring()``."""
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        """Return a ``(bytes, unicode)`` pair; the unicode slot is None."""
        return (self.getstring(), None)
+ |
class StrLiteralBuilder(object):
    """Assemble both a bytes and a unicode representation of a string.

    Every call is forwarded to an internal ``BytesLiteralBuilder`` and an
    internal ``UnicodeLiteralBuilder``, in that order.
    """
    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Append the same fragment to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append(characters)

    def append_charval(self, char_number):
        """Append the same character value to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape.

        The bytes side receives the escape text itself, the unicode side
        receives the character it denotes.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the ``(bytes, unicode)`` pair of assembled strings."""
        byte_string = self._bytes.getstring()
        unicode_string = self._unicode.getstring()
        return (byte_string, unicode_string)
+ |
+ |
class EncodedString(_unicode):
    """Unicode string subclass that keeps track of the original encoding."""

    # None for unicode strings, the source encoding otherwise.
    encoding = None

    def __deepcopy__(self, memo):
        # A deep copy hands back this very instance instead of a duplicate.
        return self

    def byteencode(self):
        """Encode with the recorded source encoding (which must be set)."""
        source_encoding = self.encoding
        assert source_encoding is not None
        return self.encode(source_encoding)

    def utf8encode(self):
        """Encode as UTF-8; only valid when no source encoding is recorded."""
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True when no source encoding is recorded."""
        return self.encoding is None

    def contains_surrogates(self):
        """Delegate to ``string_contains_surrogates`` for this string."""
        return string_contains_surrogates(self)
+ |
+ |
def string_contains_surrogates(ustring):
    """
    Check if the unicode string contains surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform.

    Returns True as soon as a code point above 0xFFFF (only possible
    on wide platforms) or inside 0xD800-0xDFFF is found, else False.
    """
    return any(
        code_point > 0xFFFF or 0xD800 <= code_point <= 0xDFFF
        for code_point in map(ord, ustring))
+ |
+ |
class BytesLiteral(_bytes):
    """Bytes subclass that is compatible with EncodedString."""

    encoding = None
    is_unicode = False

    def __deepcopy__(self, memo):
        # A deep copy hands back this very instance instead of a duplicate.
        return self

    def byteencode(self):
        """Return the content as a plain byte string object."""
        if not IS_PYTHON3:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')
        return _bytes(self)

    def utf8encode(self):
        """Always fails: a byte string cannot be UTF-8 encoded."""
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')
+ |
+ |
# Lookup from a two-character backslash escape (as written in source) to
# the character it stands for; unknown sequences give None (dict.get).
char_from_escape_sequence = dict([
    (r'\a', u'\a'),
    (r'\b', u'\b'),
    (r'\f', u'\f'),
    (r'\n', u'\n'),
    (r'\r', u'\r'),
    (r'\t', u'\t'),
    (r'\v', u'\v'),
]).get

# Sequences that get replaced by an escape when writing byte strings:
# backslash, '??', double quote, and every character below 32.
_c_special = ('\\', '??', '"') + tuple(chr(code) for code in range(32))
+ |
+ |
def _to_escape_sequence(s):
    """Return the backslash escape spelling for the special sequence ``s``.

    Newline, carriage return and tab use their short ``repr`` form, the
    double quote and the backslash get a leading backslash, and anything
    else is spelled as one three-digit octal escape per character.
    """
    if s in '\n\r\t':
        return repr(s)[1:-1]
    if s == '"':
        return r'\"'
    if s == '\\':
        return r'\\'
    # within a character sequence, oct passes much better than hex
    return ''.join('\\%03o' % ord(char) for char in s)
+ |
+ |
def _build_specials_replacer():
    """Build a function that escapes every ``_c_special`` sequence.

    One regular expression alternative is created per special sequence
    (each character wrapped in its own character class), together with a
    table mapping the matched bytes to their escape sequence.  The
    returned ``replace(s)`` runs that substitution on a byte string.
    """
    alternatives = []
    escape_for = {}
    for special in _c_special:
        char_classes = ['[%s]' % char.replace('\\', '\\\\') for char in special]
        alternatives.append(''.join(char_classes))
        escaped = _to_escape_sequence(special)
        escape_for[special.encode('ASCII')] = escaped.encode('ASCII')

    pattern = '(%s)' % '|'.join(alternatives)
    substitute = re.compile(pattern.encode('ASCII')).sub

    def lookup_escape(match):
        return escape_for[match.group(1)]

    def replace(s):
        return substitute(lookup_escape, s)

    return replace

# Module-level replacer, built once when the module is loaded.
_replace_specials = _build_specials_replacer()
+ |
+ |
def escape_char(c):
    """Return an escaped spelling of the single character ``c``.

    On Python 3 the argument is first decoded from ISO-8859-1.  Newline,
    carriage return, tab and backslash use their ``repr`` form, the
    single quote gets a leading backslash, values below 32 or above 127
    become a two-digit hex escape, and everything else is returned as is.
    """
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        return repr(c)[1:-1]
    if c == "'":
        return "\\'"
    code = ord(c)
    if 32 <= code <= 127:
        return c
    # hex works well for characters
    return "\\x%02X" % code
+ |
def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.
    Note that this returns a Unicode string instead which, when
    encoded as ISO-8859-1, will result in the correct byte sequence
    being written.
    """
    s = _replace_specials(s)
    try:
        # trial decoding: plain ASCII => done
        return s.decode("ASCII")
    except UnicodeDecodeError:
        pass

    # At least one byte is >= 128: spell those bytes as octal escapes.
    if IS_PYTHON3:
        # iterating bytes yields ints on Python 3
        escaped = bytearray()
        for byte in s:
            if byte >= 128:
                escaped.extend(('\\%3o' % byte).encode('ASCII'))
            else:
                escaped.append(byte)
        return escaped.decode('ISO-8859-1')

    # iterating a byte string yields one-character strings on Python 2
    pieces = []
    for char in s:
        code = ord(char)
        pieces.append('\\%3o' % code if code >= 128 else char)
    return join_bytes(pieces).decode('ISO-8859-1')
+ |
def split_string_literal(s, limit=2000):
    """Split ``s`` into chunks of at most ``limit`` characters.

    Strings shorter than ``limit`` are returned unchanged.  Otherwise the
    chunks are joined with ``""`` so that the result reads as adjacent
    string literals.  A cut is moved back when a backslash sits in the
    last four characters of a chunk, so the cut lands before it.
    """
    # MSVC can't handle long string literals.
    if len(s) < limit:
        return s

    total = len(s)
    pieces = []
    pos = 0
    while pos < total:
        cut = pos + limit
        tail = s[cut-4:cut]
        if total > cut-4 and '\\' in tail:
            cut -= 4 - tail.find('\\')  # just before the backslash
            while s[cut-1] == '\\':
                cut -= 1
                if cut == pos:
                    # must have been a long line of backslashes
                    cut = pos + limit - (limit % 2) - 4
                    break
        pieces.append(s[pos:cut])
        pos = cut
    return '""'.join(pieces)
+ |
def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a pair ``(utf16, utf32)`` of text strings.  Each is a
    comma-separated list of decimal code values ending with a 0
    terminator: ``utf16`` spells code points outside the BMP as surrogate
    pairs, ``utf32`` spells them as single values.  When both lists are
    identical, the ``utf16`` string is empty.
    """
    # unicode on Python 2, str on Python 3 (``unicode`` is undefined there)
    text_type = type(u'')

    # list(): map() is lazy on Python 3 and cannot be concatenated to a list
    s = list(map(ord, s)) + [0]

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        # input holds full code points: derive the UTF-16 code units
        utf16, utf32 = [], s
        for code_point in s:
            if code_point >= 0x10000:  # outside of BMP
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.append(high + 0xD800)
                utf16.append(low + 0xDC00)
            else:
                utf16.append(code_point)
    else:
        # input holds UTF-16 code units: merge surrogate pairs for UTF-32
        utf16, utf32 = s, []
        for code_unit in s:
            if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        # no character needed a surrogate pair: one representation suffices
        utf16 = []
    return ",".join(map(text_type, utf16)), ",".join(map(text_type, utf32))