| Index: third_party/google-endpoints/future/backports/email/_encoded_words.py
|
| diff --git a/third_party/google-endpoints/future/backports/email/_encoded_words.py b/third_party/google-endpoints/future/backports/email/_encoded_words.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..7c4a5291466b8022bfebf0dc7dc1667059ec0a43
|
| --- /dev/null
|
| +++ b/third_party/google-endpoints/future/backports/email/_encoded_words.py
|
| @@ -0,0 +1,232 @@
|
| +""" Routines for manipulating RFC2047 encoded words.
|
| +
|
| +This is currently a package-private API, but will be considered for promotion
|
| +to a public API if there is demand.
|
| +
|
| +"""
|
| +from __future__ import unicode_literals
|
| +from __future__ import division
|
| +from __future__ import absolute_import
|
| +from future.builtins import bytes
|
| +from future.builtins import chr
|
| +from future.builtins import int
|
| +from future.builtins import str
|
| +
|
| +# An ecoded word looks like this:
|
| +#
|
| +# =?charset[*lang]?cte?encoded_string?=
|
| +#
|
| +# for more information about charset see the charset module. Here it is one
|
| +# of the preferred MIME charset names (hopefully; you never know when parsing).
|
| +# cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case). In
|
| +# theory other letters could be used for other encodings, but in practice this
|
| +# (almost?) never happens. There could be a public API for adding entries
|
| +# to the CTE tables, but YAGNI for now. 'q' is Quoted Printable, 'b' is
|
| +# Base64. The meaning of encoded_string should be obvious. 'lang' is optional
|
| +# as indicated by the brackets (they are not part of the syntax) but is almost
|
| +# never encountered in practice.
|
| +#
|
| +# The general interface for a CTE decoder is that it takes the encoded_string
|
| +# as its argument, and returns a tuple (cte_decoded_string, defects). The
|
| +# cte_decoded_string is the original binary that was encoded using the
|
| +# specified cte. 'defects' is a list of MessageDefect instances indicating any
|
| +# problems encountered during conversion. 'charset' and 'lang' are the
|
| +# corresponding strings extracted from the EW, case preserved.
|
| +#
|
| +# The general interface for a CTE encoder is that it takes a binary sequence
|
| +# as input and returns the cte_encoded_string, which is an ascii-only string.
|
| +#
|
| +# Each decoder must also supply a length function that takes the binary
|
| +# sequence as its argument and returns the length of the resulting encoded
|
| +# string.
|
| +#
|
| +# The main API functions for the module are decode, which calls the decoder
|
| +# referenced by the cte specifier, and encode, which adds the appropriate
|
| +# RFC 2047 "chrome" to the encoded string, and can optionally automatically
|
| +# select the shortest possible encoding. See their docstrings below for
|
| +# details.
|
| +
|
| +import re
|
| +import base64
|
| +import binascii
|
| +import functools
|
| +from string import ascii_letters, digits
|
| +from future.backports.email import errors
|
| +
|
| +__all__ = ['decode_q',
|
| + 'encode_q',
|
| + 'decode_b',
|
| + 'encode_b',
|
| + 'len_q',
|
| + 'len_b',
|
| + 'decode',
|
| + 'encode',
|
| + ]
|
| +
|
| +#
|
| +# Quoted Printable
|
| +#
|
| +
|
| +# regex based decoder.
|
| +_q_byte_subber = functools.partial(re.compile(br'=([a-fA-F0-9]{2})').sub,
|
| + lambda m: bytes([int(m.group(1), 16)]))
|
| +
|
| +def decode_q(encoded):
|
| + encoded = bytes(encoded.replace(b'_', b' '))
|
| + return _q_byte_subber(encoded), []
|
| +
|
| +
|
| +# dict mapping bytes to their encoded form
|
| +class _QByteMap(dict):
|
| +
|
| + safe = bytes(b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii'))
|
| +
|
| + def __missing__(self, key):
|
| + if key in self.safe:
|
| + self[key] = chr(key)
|
| + else:
|
| + self[key] = "={:02X}".format(key)
|
| + return self[key]
|
| +
|
| +_q_byte_map = _QByteMap()
|
| +
|
| +# In headers spaces are mapped to '_'.
|
| +_q_byte_map[ord(' ')] = '_'
|
| +
|
| +def encode_q(bstring):
|
| + return str(''.join(_q_byte_map[x] for x in bytes(bstring)))
|
| +
|
| +def len_q(bstring):
|
| + return sum(len(_q_byte_map[x]) for x in bytes(bstring))
|
| +
|
| +
|
| +#
|
| +# Base64
|
| +#
|
| +
|
| +def decode_b(encoded):
|
| + defects = []
|
| + pad_err = len(encoded) % 4
|
| + if pad_err:
|
| + defects.append(errors.InvalidBase64PaddingDefect())
|
| + padded_encoded = encoded + b'==='[:4-pad_err]
|
| + else:
|
| + padded_encoded = encoded
|
| + try:
|
| + # The validate kwarg to b64decode is not supported in Py2.x
|
| + if not re.match(b'^[A-Za-z0-9+/]*={0,2}$', padded_encoded):
|
| + raise binascii.Error('Non-base64 digit found')
|
| + return base64.b64decode(padded_encoded), defects
|
| + except binascii.Error:
|
| + # Since we had correct padding, this must an invalid char error.
|
| + defects = [errors.InvalidBase64CharactersDefect()]
|
| + # The non-alphabet characters are ignored as far as padding
|
| + # goes, but we don't know how many there are. So we'll just
|
| + # try various padding lengths until something works.
|
| + for i in 0, 1, 2, 3:
|
| + try:
|
| + return base64.b64decode(encoded+b'='*i), defects
|
| + except (binascii.Error, TypeError): # Py2 raises a TypeError
|
| + if i==0:
|
| + defects.append(errors.InvalidBase64PaddingDefect())
|
| + else:
|
| + # This should never happen.
|
| + raise AssertionError("unexpected binascii.Error")
|
| +
|
| +def encode_b(bstring):
|
| + return base64.b64encode(bstring).decode('ascii')
|
| +
|
| +def len_b(bstring):
|
| + groups_of_3, leftover = divmod(len(bstring), 3)
|
| + # 4 bytes out for each 3 bytes (or nonzero fraction thereof) in.
|
| + return groups_of_3 * 4 + (4 if leftover else 0)
|
| +
|
| +
|
| +_cte_decoders = {
|
| + 'q': decode_q,
|
| + 'b': decode_b,
|
| + }
|
| +
|
| +def decode(ew):
|
| + """Decode encoded word and return (string, charset, lang, defects) tuple.
|
| +
|
| + An RFC 2047/2243 encoded word has the form:
|
| +
|
| + =?charset*lang?cte?encoded_string?=
|
| +
|
| + where '*lang' may be omitted but the other parts may not be.
|
| +
|
| + This function expects exactly such a string (that is, it does not check the
|
| + syntax and may raise errors if the string is not well formed), and returns
|
| + the encoded_string decoded first from its Content Transfer Encoding and
|
| + then from the resulting bytes into unicode using the specified charset. If
|
| + the cte-decoded string does not successfully decode using the specified
|
| + character set, a defect is added to the defects list and the unknown octets
|
| + are replaced by the unicode 'unknown' character \uFDFF.
|
| +
|
| + The specified charset and language are returned. The default for language,
|
| + which is rarely if ever encountered, is the empty string.
|
| +
|
| + """
|
| + _, charset, cte, cte_string, _ = str(ew).split('?')
|
| + charset, _, lang = charset.partition('*')
|
| + cte = cte.lower()
|
| + # Recover the original bytes and do CTE decoding.
|
| + bstring = cte_string.encode('ascii', 'surrogateescape')
|
| + bstring, defects = _cte_decoders[cte](bstring)
|
| + # Turn the CTE decoded bytes into unicode.
|
| + try:
|
| + string = bstring.decode(charset)
|
| + except UnicodeError:
|
| + defects.append(errors.UndecodableBytesDefect("Encoded word "
|
| + "contains bytes not decodable using {} charset".format(charset)))
|
| + string = bstring.decode(charset, 'surrogateescape')
|
| + except LookupError:
|
| + string = bstring.decode('ascii', 'surrogateescape')
|
| + if charset.lower() != 'unknown-8bit':
|
| + defects.append(errors.CharsetError("Unknown charset {} "
|
| + "in encoded word; decoded as unknown bytes".format(charset)))
|
| + return string, charset, lang, defects
|
| +
|
| +
|
| +_cte_encoders = {
|
| + 'q': encode_q,
|
| + 'b': encode_b,
|
| + }
|
| +
|
| +_cte_encode_length = {
|
| + 'q': len_q,
|
| + 'b': len_b,
|
| + }
|
| +
|
| +def encode(string, charset='utf-8', encoding=None, lang=''):
|
| + """Encode string using the CTE encoding that produces the shorter result.
|
| +
|
| + Produces an RFC 2047/2243 encoded word of the form:
|
| +
|
| + =?charset*lang?cte?encoded_string?=
|
| +
|
| + where '*lang' is omitted unless the 'lang' parameter is given a value.
|
| + Optional argument charset (defaults to utf-8) specifies the charset to use
|
| + to encode the string to binary before CTE encoding it. Optional argument
|
| + 'encoding' is the cte specifier for the encoding that should be used ('q'
|
| + or 'b'); if it is None (the default) the encoding which produces the
|
| + shortest encoded sequence is used, except that 'q' is preferred if it is up
|
| + to five characters longer. Optional argument 'lang' (default '') gives the
|
| + RFC 2243 language string to specify in the encoded word.
|
| +
|
| + """
|
| + string = str(string)
|
| + if charset == 'unknown-8bit':
|
| + bstring = string.encode('ascii', 'surrogateescape')
|
| + else:
|
| + bstring = string.encode(charset)
|
| + if encoding is None:
|
| + qlen = _cte_encode_length['q'](bstring)
|
| + blen = _cte_encode_length['b'](bstring)
|
| + # Bias toward q. 5 is arbitrary.
|
| + encoding = 'q' if qlen - blen < 5 else 'b'
|
| + encoded = _cte_encoders[encoding](bstring)
|
| + if lang:
|
| + lang = '*' + lang
|
| + return "=?{0}{1}?{2}?{3}?=".format(charset, lang, encoding, encoded)
|
|
|