Index: third_party/google-endpoints/future/backports/email/_encoded_words.py |
diff --git a/third_party/google-endpoints/future/backports/email/_encoded_words.py b/third_party/google-endpoints/future/backports/email/_encoded_words.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..7c4a5291466b8022bfebf0dc7dc1667059ec0a43 |
--- /dev/null |
+++ b/third_party/google-endpoints/future/backports/email/_encoded_words.py |
@@ -0,0 +1,232 @@ |
+""" Routines for manipulating RFC2047 encoded words. |
+ |
+This is currently a package-private API, but will be considered for promotion |
+to a public API if there is demand. |
+ |
+""" |
+from __future__ import unicode_literals |
+from __future__ import division |
+from __future__ import absolute_import |
+from future.builtins import bytes |
+from future.builtins import chr |
+from future.builtins import int |
+from future.builtins import str |
+ |
+# An encoded word looks like this: |
+# |
+# =?charset[*lang]?cte?encoded_string?= |
+# |
+# for more information about charset see the charset module. Here it is one |
+# of the preferred MIME charset names (hopefully; you never know when parsing). |
+# cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case). In |
+# theory other letters could be used for other encodings, but in practice this |
+# (almost?) never happens. There could be a public API for adding entries |
+# to the CTE tables, but YAGNI for now. 'q' is Quoted Printable, 'b' is |
+# Base64. The meaning of encoded_string should be obvious. 'lang' is optional |
+# as indicated by the brackets (they are not part of the syntax) but is almost |
+# never encountered in practice. |
+# |
+# The general interface for a CTE decoder is that it takes the encoded_string |
+# as its argument, and returns a tuple (cte_decoded_string, defects). The |
+# cte_decoded_string is the original binary that was encoded using the |
+# specified cte. 'defects' is a list of MessageDefect instances indicating any |
+# problems encountered during conversion. 'charset' and 'lang' are the |
+# corresponding strings extracted from the EW, case preserved. |
+# |
+# The general interface for a CTE encoder is that it takes a binary sequence |
+# as input and returns the cte_encoded_string, which is an ascii-only string. |
+# |
+# Each encoder must also supply a length function that takes the binary |
+# sequence as its argument and returns the length of the resulting encoded |
+# string. |
+# |
+# The main API functions for the module are decode, which calls the decoder |
+# referenced by the cte specifier, and encode, which adds the appropriate |
+# RFC 2047 "chrome" to the encoded string, and can optionally automatically |
+# select the shortest possible encoding. See their docstrings below for |
+# details. |
+ |
+import re |
+import base64 |
+import binascii |
+import functools |
+from string import ascii_letters, digits |
+from future.backports.email import errors |
+ |
+__all__ = ['decode_q', |
+ 'encode_q', |
+ 'decode_b', |
+ 'encode_b', |
+ 'len_q', |
+ 'len_b', |
+ 'decode', |
+ 'encode', |
+ ] |
+ |
+# |
+# Quoted Printable |
+# |
+ |
# Decoding is regex driven: every "=XX" escape (two hex digits, either case)
# is swapped for the single byte it names.
_q_escape_pattern = re.compile(br'=([a-fA-F0-9]{2})')


def _q_unescape(match):
    """Return the one-byte string named by the hex digits of an =XX match."""
    return bytes([int(match.group(1), 16)])


_q_byte_subber = functools.partial(_q_escape_pattern.sub, _q_unescape)


def decode_q(encoded):
    """Decode the 'q' (quoted-printable) payload of an encoded word.

    Underscores are turned into spaces first, then every =XX escape is
    replaced by the byte it denotes.  Anything that is not a well formed
    escape is passed through untouched.

    Returns a (decoded_bytes, defects) tuple; the defects list is always
    empty for this decoder.
    """
    with_spaces = encoded.replace(b'_', b' ')
    return _q_byte_subber(bytes(with_spaces)), []
+ |
+ |
# Lazily populated table from a byte value (int) to its 'q' encoded text.
class _QByteMap(dict):
    """Dict that computes and caches the 'q' encoding of a byte on first use.

    Byte values listed in ``safe`` encode as themselves; every other value
    encodes as '=' followed by two upper-case hex digits.
    """

    safe = bytes(b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii'))

    def __missing__(self, key):
        # Work out the encoded form once, remember it, and hand it back.
        encoded = chr(key) if key in self.safe else "={:02X}".format(key)
        self[key] = encoded
        return encoded


_q_byte_map = _QByteMap()

# In headers spaces are mapped to '_'.
_q_byte_map[ord(' ')] = '_'


def encode_q(bstring):
    """Return the 'q' encoding of the binary sequence bstring as text."""
    pieces = [_q_byte_map[octet] for octet in bytes(bstring)]
    return str(''.join(pieces))


def len_q(bstring):
    """Return the length encode_q(bstring) would have, without building it."""
    total = 0
    for octet in bytes(bstring):
        total += len(_q_byte_map[octet])
    return total
+ |
+ |
+# |
+# Base64 |
+# |
+ |
def decode_b(encoded):
    """Decode the 'b' (base64) payload of an encoded word.

    encoded is the raw payload as a byte string.  The return value is a
    (decoded_bytes, defects) tuple:

    * A payload made only of base64 alphabet characters decodes strictly.
      If its length is not a multiple of four the missing '=' padding is
      supplied and an InvalidBase64PaddingDefect is reported.
    * Otherwise the payload is decoded leniently (b64decode skips characters
      outside the alphabet) and an InvalidBase64CharactersDefect is
      reported, plus an InvalidBase64PaddingDefect if padding had to be
      added to make the lenient decode succeed.
    * If no amount of padding makes the payload decodable (it holds one more
      data character than a multiple of four), the payload is returned
      undecoded together with a single length defect instead of raising.
    """
    defects = []
    pad_err = len(encoded) % 4
    if pad_err:
        defects.append(errors.InvalidBase64PaddingDefect())
        padded_encoded = encoded + b'==='[:4-pad_err]
    else:
        padded_encoded = encoded

    # The validate kwarg to b64decode is not supported in Py2.x, so the
    # alphabet check is done by hand before attempting the strict decode.
    if re.match(b'^[A-Za-z0-9+/]*={0,2}$', padded_encoded):
        try:
            return base64.b64decode(padded_encoded), defects
        except binascii.Error:
            # Fall through to the lenient decoding attempts below.
            pass

    # The strict attempt was rejected.  Non-alphabet characters are ignored
    # as far as padding goes, but we don't know how many there are, so just
    # try various padding lengths until something works.
    defects = [errors.InvalidBase64CharactersDefect()]
    for i in 0, 1, 2, 3:
        try:
            return base64.b64decode(encoded + b'=' * i), defects
        except (binascii.Error, TypeError):    # Py2 raises a TypeError
            if i == 0:
                defects.append(errors.InvalidBase64PaddingDefect())

    # Nothing worked: the number of data characters is one more than a
    # multiple of four, which cannot be valid base64.  Hand the payload back
    # unchanged and report the problem as a defect rather than crashing.
    # Fall back to the padding defect if the errors module in use does not
    # provide InvalidBase64LengthDefect.
    length_defect = getattr(errors, 'InvalidBase64LengthDefect',
                            errors.InvalidBase64PaddingDefect)
    return encoded, [length_defect()]
+ |
def encode_b(bstring):
    """Return the base64 encoding of the binary sequence bstring as text."""
    raw = base64.b64encode(bstring)
    return raw.decode('ascii')
+ |
def len_b(bstring):
    """Return the length of the base64 encoding of bstring.

    Base64 emits one 4-character group for every 3 input bytes, and a final
    partial group of input bytes still costs a full 4 characters.
    """
    # Ceiling division by 3 counts the groups, partial one included.
    return 4 * ((len(bstring) + 2) // 3)
+ |
+ |
# Maps the (lower-cased) cte letter of an encoded word to its decoder.
_cte_decoders = {
    'q': decode_q,
    'b': decode_b,
    }


def decode(ew):
    """Decode encoded word and return (string, charset, lang, defects) tuple.

    An RFC 2047/2231 encoded word has the form:

        =?charset*lang?cte?encoded_string?=

    where '*lang' may be omitted but the other parts may not be.

    This function expects exactly such a string (that is, it does not check
    the syntax and may raise errors if the string is not well formed), and
    returns the encoded_string decoded first from its Content Transfer
    Encoding and then from the resulting bytes into unicode using the
    specified charset.

    If the cte-decoded bytes cannot be decoded using the specified charset,
    an UndecodableBytesDefect is added to the defects list and the bytes are
    decoded again with the 'surrogateescape' error handler so that the
    undecodable octets are preserved.  If the charset is unknown, or its
    name cannot even be used for a codec lookup, the bytes are decoded as
    ascii with 'surrogateescape' and a CharsetError defect is added (except
    for the 'unknown-8bit' charset, which is not reported).

    The specified charset and language are returned.  The default for
    language, which is rarely if ever encountered, is the empty string.

    """
    _, charset, cte, cte_string, _ = str(ew).split('?')
    charset, _, lang = charset.partition('*')
    cte = cte.lower()
    # Recover the original bytes and do CTE decoding.
    bstring = cte_string.encode('ascii', 'surrogateescape')
    bstring, defects = _cte_decoders[cte](bstring)
    # Turn the CTE decoded bytes into unicode.
    try:
        string = bstring.decode(charset)
    except (LookupError, UnicodeEncodeError):
        # LookupError: no such codec.  UnicodeEncodeError: the charset name
        # itself could not be encoded for the codec lookup (for example it
        # contains surrogate-escaped bytes).  This clause must come before
        # the UnicodeError one, which would otherwise catch the encode error
        # and retry with the same unusable charset name.
        string = bstring.decode('ascii', 'surrogateescape')
        if charset.lower() != 'unknown-8bit':
            defects.append(errors.CharsetError("Unknown charset {} "
                "in encoded word; decoded as unknown bytes".format(charset)))
    except UnicodeError:
        defects.append(errors.UndecodableBytesDefect("Encoded word "
            "contains bytes not decodable using {} charset".format(charset)))
        string = bstring.decode(charset, 'surrogateescape')
    return string, charset, lang, defects
+ |
+ |
# Maps a cte letter to the function producing that encoding.
_cte_encoders = {
    'q': encode_q,
    'b': encode_b,
    }

# Maps a cte letter to the function predicting that encoding's length.
_cte_encode_length = {
    'q': len_q,
    'b': len_b,
    }


def encode(string, charset='utf-8', encoding=None, lang=''):
    """Wrap string in an encoded word of the form

        =?charset*lang?cte?encoded_string?=

    string is first turned into bytes using charset (default utf-8); for
    the special charset 'unknown-8bit' it is encoded as ascii with the
    'surrogateescape' error handler instead.

    encoding selects the cte, 'q' or 'b'.  When it is None (the default)
    both encoded lengths are computed and 'b' is chosen only if the 'q' form
    would be at least five characters longer; otherwise 'q' is used.

    lang, when non-empty, is added after the charset as '*lang'.

    """
    text = str(string)
    if charset != 'unknown-8bit':
        payload = text.encode(charset)
    else:
        payload = text.encode('ascii', 'surrogateescape')
    if encoding is None:
        q_size = _cte_encode_length['q'](payload)
        b_size = _cte_encode_length['b'](payload)
        # Bias toward q.  5 is arbitrary.
        encoding = 'b' if q_size - b_size >= 5 else 'q'
    body = _cte_encoders[encoding](payload)
    lang_part = '*' + lang if lang else lang
    return "=?{0}{1}?{2}?{3}?=".format(charset, lang_part, encoding, body)