third_party/google-endpoints/future/backports/email/_encoded_words.py - Issue 2666783008: Add google-endpoints to third_party/.

Unified Diff: third_party/google-endpoints/future/backports/email/_encoded_words.py

Issue 2666783008: Add google-endpoints to third_party/. (Closed)

Patch Set: Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« no previous file with comments | « third_party/google-endpoints/future/backports/email/__init__.py ('k') | third_party/google-endpoints/future/backports/email/_header_value_parser.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/google-endpoints/future/backports/email/_encoded_words.py

diff --git a/third_party/google-endpoints/future/backports/email/_encoded_words.py b/third_party/google-endpoints/future/backports/email/_encoded_words.py

new file mode 100644

index 0000000000000000000000000000000000000000..7c4a5291466b8022bfebf0dc7dc1667059ec0a43

--- /dev/null

+++ b/third_party/google-endpoints/future/backports/email/_encoded_words.py

@@ -0,0 +1,232 @@

+""" Routines for manipulating RFC2047 encoded words.

+This is currently a package-private API, but will be considered for promotion

+to a public API if there is demand.

+"""

+from __future__ import unicode_literals

+from __future__ import division

+from __future__ import absolute_import

+from future.builtins import bytes

+from future.builtins import chr

+from future.builtins import int

+from future.builtins import str

+# An encoded word looks like this:

+# =?charset[*lang]?cte?encoded_string?=

+# for more information about charset see the charset module. Here it is one

+# of the preferred MIME charset names (hopefully; you never know when parsing).

+# cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case). In

+# theory other letters could be used for other encodings, but in practice this

+# (almost?) never happens. There could be a public API for adding entries

+# to the CTE tables, but YAGNI for now. 'q' is Quoted Printable, 'b' is

+# Base64. The meaning of encoded_string should be obvious. 'lang' is optional

+# as indicated by the brackets (they are not part of the syntax) but is almost

+# never encountered in practice.

+# The general interface for a CTE decoder is that it takes the encoded_string

+# as its argument, and returns a tuple (cte_decoded_string, defects). The

+# cte_decoded_string is the original binary that was encoded using the

+# specified cte. 'defects' is a list of MessageDefect instances indicating any

+# problems encountered during conversion. 'charset' and 'lang' are the

+# corresponding strings extracted from the EW, case preserved.

+# The general interface for a CTE encoder is that it takes a binary sequence

+# as input and returns the cte_encoded_string, which is an ascii-only string.

+# Each encoder must also supply a length function that takes the binary

+# sequence as its argument and returns the length of the resulting encoded

+# string.

+# The main API functions for the module are decode, which calls the decoder

+# referenced by the cte specifier, and encode, which adds the appropriate

+# RFC 2047 "chrome" to the encoded string, and can optionally automatically

+# select the shortest possible encoding. See their docstrings below for

+# details.

+import re

+import base64

+import binascii

+import functools

+from string import ascii_letters, digits

+from future.backports.email import errors

+# Names exported by a star-import of this module.
+__all__ = [
+    'decode_q', 'encode_q',
+    'decode_b', 'encode_b',
+    'len_q', 'len_b',
+    'decode', 'encode',
+]

+# Quoted Printable
+
+
+def _q_unescape(match):
+    """Return the single byte named by a matched '=XX' hex escape."""
+    return bytes([int(match.group(1), 16)])
+
+
+# Regex based decoder: expands every '=XX' hex escape in a bytes string.
+_q_byte_subber = functools.partial(
+    re.compile(br'=([a-fA-F0-9]{2})').sub, _q_unescape)
+
+
+def decode_q(encoded):
+    """Decode the 'q' (quoted-printable) payload of an encoded word.
+
+    Underscores are turned into spaces first, then each '=XX' hex escape is
+    replaced by the byte it names.  Returns a (decoded_bytes, defects)
+    tuple; the defects list is always empty.
+    """
+    spaced = bytes(encoded.replace(b'_', b' '))
+    return _q_byte_subber(spaced), []

+class _QByteMap(dict):
+    """Lazily filled dict mapping a byte value to its 'q' encoded text.
+
+    Looking up a missing byte value computes its encoded form, stores it in
+    the dict and returns it.
+    """
+
+    # Byte values that are emitted as themselves rather than as '=XX'.
+    safe = bytes(b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii'))
+
+    def __missing__(self, key):
+        text = chr(key) if key in self.safe else "={:02X}".format(key)
+        self[key] = text
+        return text

+# Shared encoding table.  The space entry is preset because in headers
+# spaces are mapped to '_'.
+_q_byte_map = _QByteMap({ord(' '): '_'})

+def encode_q(bstring):
+    """Return the 'q' (quoted-printable) encoding of bstring as a string.
+
+    bstring is converted with bytes() and each byte value is looked up in
+    _q_byte_map.
+    """
+    pieces = [_q_byte_map[octet] for octet in bytes(bstring)]
+    return str(''.join(pieces))

+def len_q(bstring):
+    """Return the length of the 'q' encoding of bstring without building it."""
+    total = 0
+    for octet in bytes(bstring):
+        total += len(_q_byte_map[octet])
+    return total

+# Base64
+
+
+def decode_b(encoded):
+    """Decode the 'b' (base64) payload of an encoded word.
+
+    encoded is the payload as bytes.  Returns a (decoded_bytes, defects)
+    tuple.
+
+    If the payload (after being padded to a multiple of four) is strictly
+    valid base64, defects holds an InvalidBase64PaddingDefect when padding
+    had to be added and is empty otherwise.  If it is not strictly valid,
+    defects starts with an InvalidBase64CharactersDefect, the payload is
+    decoded leniently, and an InvalidBase64PaddingDefect is added when the
+    payload does not decode without extra padding.
+
+    If no amount of padding makes the payload decodable (for example when
+    the number of base64 characters is one more than a multiple of four),
+    the payload is returned undecoded together with the collected defects
+    rather than raising.
+    """
+    defects = []
+    pad_err = len(encoded) % 4
+    if pad_err:
+        defects.append(errors.InvalidBase64PaddingDefect())
+        padded_encoded = encoded + b'==='[:4-pad_err]
+    else:
+        padded_encoded = encoded
+    # The validate kwarg to b64decode is not supported in Py2.x, so the
+    # strict alphabet check is done by hand before decoding.
+    if re.match(b'^[A-Za-z0-9+/]*={0,2}$', padded_encoded):
+        try:
+            return base64.b64decode(padded_encoded), defects
+        except (binascii.Error, TypeError):  # Py2 raises a TypeError
+            pass
+    # The padded payload is not strictly valid base64, so start the defect
+    # list over with an invalid-character defect.
+    defects = [errors.InvalidBase64CharactersDefect()]
+    # The non-alphabet characters are ignored as far as padding
+    # goes, but we don't know how many there are.  So we'll just
+    # try various padding lengths until something works.
+    for i in 0, 1, 2, 3:
+        try:
+            return base64.b64decode(encoded + b'=' * i), defects
+        except (binascii.Error, TypeError):  # Py2 raises a TypeError
+            if i == 0:
+                defects.append(errors.InvalidBase64PaddingDefect())
+    # Nothing worked.  Hand the payload back undecoded so that a malformed
+    # header is reported through defects instead of an exception.
+    return encoded, defects

+def encode_b(bstring):
+    """Return the base64 encoding of bstring as an ASCII-only string."""
+    b64_bytes = base64.b64encode(bstring)
+    return b64_bytes.decode('ascii')

+def len_b(bstring):
+    """Return the length of the base64 encoding of bstring without building it."""
+    # Every group of three input bytes, including a final partial group,
+    # produces four output characters.
+    group_count = (len(bstring) + 2) // 3
+    return group_count * 4

+# CTE decoder for each supported cte letter (looked up in lower case).
+_cte_decoders = {'q': decode_q, 'b': decode_b}

+def decode(ew):
+    """Decode encoded word and return (string, charset, lang, defects) tuple.
+
+    An RFC 2047/2231 encoded word has the form:
+
+        =?charset*lang?cte?encoded_string?=
+
+    where '*lang' may be omitted but the other parts may not be.
+
+    This function expects exactly such a string (that is, it does not check
+    the syntax and may raise errors if the string is not well formed), and
+    returns the encoded_string decoded first from its Content Transfer
+    Encoding and then from the resulting bytes into unicode using the
+    specified charset.
+
+    If the cte-decoded bytes do not decode using the specified charset, an
+    UndecodableBytesDefect is added to the defects list and the undecodable
+    bytes are kept in the result via the 'surrogateescape' error handler.
+    If the charset is not known, the bytes are decoded as ascii with
+    'surrogateescape' and, unless the charset is 'unknown-8bit', a
+    CharsetError defect is added.
+
+    The specified charset and language are returned.  The default for
+    language, which is rarely if ever encountered, is the empty string.
+
+    Raises ValueError if ew does not consist of exactly five '?'-separated
+    parts, and KeyError if the cte is neither 'q' nor 'b' (ignoring case).
+    """
+    _, charset, cte, cte_string, _ = str(ew).split('?')
+    charset, _, lang = charset.partition('*')
+    cte = cte.lower()
+    # Recover the original bytes and do CTE decoding.
+    bstring = cte_string.encode('ascii', 'surrogateescape')
+    bstring, defects = _cte_decoders[cte](bstring)
+    # Turn the CTE decoded bytes into unicode.
+    try:
+        string = bstring.decode(charset)
+    except (LookupError, UnicodeEncodeError):
+        # UnicodeEncodeError here comes from the charset *name* (for
+        # example one containing a surrogate), not from the payload, so
+        # retrying with the same charset would just raise again.  Treat it
+        # like any other unknown charset.
+        string = bstring.decode('ascii', 'surrogateescape')
+        if charset.lower() != 'unknown-8bit':
+            defects.append(errors.CharsetError("Unknown charset {} "
+                "in encoded word; decoded as unknown bytes".format(charset)))
+    except UnicodeError:
+        defects.append(errors.UndecodableBytesDefect("Encoded word "
+            "contains bytes not decodable using {} charset".format(charset)))
+        string = bstring.decode(charset, 'surrogateescape')
+    return string, charset, lang, defects

+# CTE encoder and encoded-length function for each supported cte letter.
+_cte_encoders = {'q': encode_q, 'b': encode_b}
+_cte_encode_length = {'q': len_q, 'b': len_b}

+def encode(string, charset='utf-8', encoding=None, lang=''):
+    """Return string wrapped as an RFC 2047/2231 encoded word.
+
+    The result has the form:
+
+        =?charset*lang?cte?encoded_string?=
+
+    where '*lang' is left out unless 'lang' is given a value.
+
+    'charset' (default utf-8) is used to turn the string into bytes before
+    the CTE encoding is applied; for 'unknown-8bit' the string is encoded as
+    ascii with the 'surrogateescape' error handler instead.
+
+    'encoding' is the cte specifier to use ('q' or 'b').  If it is None (the
+    default) the shorter encoding is picked, biased toward 'q': 'b' is used
+    only when the 'q' form would be at least five characters longer.
+
+    'lang' (default '') is the RFC 2231 language string to put in the
+    encoded word.
+    """
+    string = str(string)
+    if charset != 'unknown-8bit':
+        bstring = string.encode(charset)
+    else:
+        bstring = string.encode('ascii', 'surrogateescape')
+    if encoding is None:
+        q_size = _cte_encode_length['q'](bstring)
+        b_size = _cte_encode_length['b'](bstring)
+        # Bias toward q.  5 is arbitrary.
+        encoding = 'b' if q_size - b_size >= 5 else 'q'
+    payload = _cte_encoders[encoding](bstring)
+    lang_part = '*' + lang if lang else lang
+    return "=?{0}{1}?{2}?{3}?=".format(charset, lang_part, encoding, payload)