OLD | NEW |
(Empty) | |
| 1 """ Routines for manipulating RFC2047 encoded words. |
| 2 |
| 3 This is currently a package-private API, but will be considered for promotion |
| 4 to a public API if there is demand. |
| 5 |
| 6 """ |
| 7 from __future__ import unicode_literals |
| 8 from __future__ import division |
| 9 from __future__ import absolute_import |
| 10 from future.builtins import bytes |
| 11 from future.builtins import chr |
| 12 from future.builtins import int |
| 13 from future.builtins import str |
| 14 |
| 15 # An encoded word looks like this: |
| 16 # |
| 17 # =?charset[*lang]?cte?encoded_string?= |
| 18 # |
| 19 # for more information about charset see the charset module. Here it is one |
| 20 # of the preferred MIME charset names (hopefully; you never know when parsing). |
| 21 # cte (Content Transfer Encoding) is either 'q' or 'b' (ignoring case). In |
| 22 # theory other letters could be used for other encodings, but in practice this |
| 23 # (almost?) never happens. There could be a public API for adding entries |
| 24 # to the CTE tables, but YAGNI for now. 'q' is Quoted Printable, 'b' is |
| 25 # Base64. The meaning of encoded_string should be obvious. 'lang' is optional |
| 26 # as indicated by the brackets (they are not part of the syntax) but is almost |
| 27 # never encountered in practice. |
| 28 # |
| 29 # The general interface for a CTE decoder is that it takes the encoded_string |
| 30 # as its argument, and returns a tuple (cte_decoded_string, defects). The |
| 31 # cte_decoded_string is the original binary that was encoded using the |
| 32 # specified cte. 'defects' is a list of MessageDefect instances indicating any |
| 33 # problems encountered during conversion. 'charset' and 'lang' are the |
| 34 # corresponding strings extracted from the EW, case preserved. |
| 35 # |
| 36 # The general interface for a CTE encoder is that it takes a binary sequence |
| 37 # as input and returns the cte_encoded_string, which is an ascii-only string. |
| 38 # |
| 39 # Each encoder must also supply a length function that takes the binary |
| 40 # sequence as its argument and returns the length of the resulting encoded |
| 41 # string. |
| 42 # |
| 43 # The main API functions for the module are decode, which calls the decoder |
| 44 # referenced by the cte specifier, and encode, which adds the appropriate |
| 45 # RFC 2047 "chrome" to the encoded string, and can optionally automatically |
| 46 # select the shortest possible encoding. See their docstrings below for |
| 47 # details. |
| 48 |
| 49 import re |
| 50 import base64 |
| 51 import binascii |
| 52 import functools |
| 53 from string import ascii_letters, digits |
| 54 from future.backports.email import errors |
| 55 |
# Names exported by a star-import of this module.
__all__ = [
    'decode_q',
    'encode_q',
    'decode_b',
    'encode_b',
    'len_q',
    'len_b',
    'decode',
    'encode',
]
| 65 |
| 66 # |
| 67 # Quoted Printable |
| 68 # |
| 69 |
# Regex-based decoder: every '=XX' hex escape is replaced by the single byte
# whose value is XX.  Anything that is not '=' followed by two hex digits is
# left untouched.
def _q_hex_to_byte(match):
    return bytes([int(match.group(1), 16)])


_q_byte_subber = functools.partial(
    re.compile(br'=([a-fA-F0-9]{2})').sub, _q_hex_to_byte)


def decode_q(encoded):
    """Decode the Q-encoded byte string *encoded*.

    Underscores are turned into spaces first, then each '=XX' escape is
    replaced by the byte it names.  Returns a (decoded_bytes, defects) tuple;
    this decoder never reports defects, so the list is always empty.
    """
    unescaped = bytes(encoded.replace(b'_', b' '))
    return _q_byte_subber(unescaped), []
| 77 |
| 78 |
# Dict mapping byte values (ints) to their Q-encoded text.  Entries are
# computed on first lookup and cached in the dict itself.
class _QByteMap(dict):

    # Byte values that are emitted literally instead of as an '=XX' escape.
    safe = bytes(b'-!*+/'
                 + ascii_letters.encode('ascii')
                 + digits.encode('ascii'))

    def __missing__(self, key):
        # Safe bytes encode as themselves; everything else becomes an
        # upper-case two-digit hex escape.
        encoded = chr(key) if key in self.safe else "={:02X}".format(key)
        self[key] = encoded
        return encoded


_q_byte_map = _QByteMap()

# In headers spaces are mapped to '_'.
_q_byte_map[ord(' ')] = '_'


def encode_q(bstring):
    """Return the Q encoding of the byte sequence *bstring* as a string."""
    pieces = [_q_byte_map[octet] for octet in bytes(bstring)]
    return str(''.join(pieces))


def len_q(bstring):
    """Return the number of characters encode_q produces for *bstring*."""
    total = 0
    for octet in bytes(bstring):
        total += len(_q_byte_map[octet])
    return total
| 101 |
| 102 |
| 103 # |
| 104 # Base64 |
| 105 # |
| 106 |
def decode_b(encoded):
    """Decode the base64 byte string *encoded*.

    Returns a (decoded_bytes, defects) tuple.  'defects' lists the problems
    found while decoding:

      * InvalidBase64PaddingDefect when padding had to be added;
      * InvalidBase64CharactersDefect when the strict-alphabet check on the
        padded input failed and the lenient decoder had to be used.

    If no amount of padding makes the input decodable (the number of base64
    characters is one more than a multiple of four), the input is returned
    undecoded together with a single defect instead of raising.
    """
    defects = []
    pad_err = len(encoded) % 4
    if pad_err:
        defects.append(errors.InvalidBase64PaddingDefect())
        padded_encoded = encoded + b'==='[:4-pad_err]
    else:
        padded_encoded = encoded
    try:
        # The validate kwarg to b64decode is not supported in Py2.x
        if not re.match(b'^[A-Za-z0-9+/]*={0,2}$', padded_encoded):
            raise binascii.Error('Non-base64 digit found')
        return base64.b64decode(padded_encoded), defects
    except binascii.Error:
        # Since we had correct padding, this must be an invalid char error.
        defects = [errors.InvalidBase64CharactersDefect()]
        # The non-alphabet characters are ignored as far as padding
        # goes, but we don't know how many there are.  So we'll just
        # try various padding lengths until something works.
        for i in 0, 1, 2, 3:
            try:
                return base64.b64decode(encoded + b'='*i), defects
            except (binascii.Error, TypeError):    # Py2 raises a TypeError
                if i == 0:
                    defects.append(errors.InvalidBase64PaddingDefect())
        # No padding length worked: the data length itself is invalid (see
        # bpo-27397).  Hand back the raw input rather than crashing with an
        # AssertionError.  Use the dedicated length defect when the errors
        # module provides one, otherwise fall back to the padding defect.
        length_defect = getattr(errors, 'InvalidBase64LengthDefect',
                                errors.InvalidBase64PaddingDefect)
        return encoded, [length_defect()]
| 135 |
def encode_b(bstring):
    """Return the base64 encoding of *bstring* as an ASCII-only string."""
    b64_bytes = base64.b64encode(bstring)
    return b64_bytes.decode('ascii')
| 138 |
def len_b(bstring):
    """Return the length of the base64 encoding of *bstring*."""
    # Each group of 3 input bytes, including a final partial group, yields
    # 4 output characters, so round the group count up.
    return (len(bstring) + 2) // 3 * 4
| 143 |
| 144 |
# Maps a lower-cased cte specifier to the function that undoes that encoding.
_cte_decoders = {
    'q': decode_q,
    'b': decode_b,
    }

def decode(ew):
    """Decode encoded word and return (string, charset, lang, defects) tuple.

    An RFC 2047/2231 encoded word has the form:

        =?charset*lang?cte?encoded_string?=

    where '*lang' may be omitted but the other parts may not be.

    This function expects exactly such a string (that is, it does not check the
    syntax and may raise errors if the string is not well formed), and returns
    the encoded_string decoded first from its Content Transfer Encoding and
    then from the resulting bytes into unicode using the specified charset.  If
    the cte-decoded string does not successfully decode using the specified
    character set, a defect is added to the defects list and the string is
    decoded again with the 'surrogateescape' error handler, so the undecodable
    octets are carried through as surrogate escapes.  If the charset cannot be
    looked up at all, the bytes are decoded as ASCII with 'surrogateescape'
    and, unless the charset is 'unknown-8bit', a CharsetError defect is added.

    The specified charset and language are returned.  The default for language,
    which is rarely if ever encountered, is the empty string.

    """
    _, charset, cte, cte_string, _ = str(ew).split('?')
    charset, _, lang = charset.partition('*')
    cte = cte.lower()
    # Recover the original bytes and do CTE decoding.
    bstring = cte_string.encode('ascii', 'surrogateescape')
    bstring, defects = _cte_decoders[cte](bstring)
    # Turn the CTE decoded bytes into unicode.
    try:
        string = bstring.decode(charset)
    except (LookupError, UnicodeEncodeError):
        # UnicodeEncodeError means the charset *name* itself could not be
        # encoded (e.g. it contains surrogates).  Retrying with the same
        # name would only raise again, so treat it like an unknown charset.
        # This clause must come before the UnicodeError one below because
        # UnicodeEncodeError is a subclass of UnicodeError.
        string = bstring.decode('ascii', 'surrogateescape')
        if charset.lower() != 'unknown-8bit':
            defects.append(errors.CharsetError("Unknown charset {} "
                "in encoded word; decoded as unknown bytes".format(charset)))
    except UnicodeError:
        defects.append(errors.UndecodableBytesDefect("Encoded word "
            "contains bytes not decodable using {} charset".format(charset)))
        string = bstring.decode(charset, 'surrogateescape')
    return string, charset, lang, defects
| 190 |
| 191 |
# Maps a cte specifier to the function that applies that encoding.
_cte_encoders = {
    'q': encode_q,
    'b': encode_b,
    }

# Maps a cte specifier to the function that predicts the encoded length.
_cte_encode_length = {
    'q': len_q,
    'b': len_b,
    }

def encode(string, charset='utf-8', encoding=None, lang=''):
    """Encode string using the CTE encoding that produces the shorter result.

    Produces an RFC 2047/2231 encoded word of the form:

        =?charset*lang?cte?encoded_string?=

    where '*lang' is omitted unless the 'lang' parameter is given a value.
    Optional argument charset (defaults to utf-8) specifies the charset to use
    to encode the string to binary before CTE encoding it.  The charset
    'unknown-8bit' (compared case-insensitively) is special: the string is
    encoded as ASCII with the 'surrogateescape' error handler.  Optional
    argument 'encoding' is the cte specifier for the encoding that should be
    used ('q' or 'b'); if it is None (the default) the encoding which produces
    the shortest encoded sequence is used, except that 'q' is preferred unless
    it is five or more characters longer.  Optional argument 'lang' (default
    '') gives the RFC 2231 language string to specify in the encoded word.

    """
    string = str(string)
    # Compare case-insensitively, as decode() does for this charset name.
    if charset.lower() == 'unknown-8bit':
        bstring = string.encode('ascii', 'surrogateescape')
    else:
        bstring = string.encode(charset)
    if encoding is None:
        qlen = _cte_encode_length['q'](bstring)
        blen = _cte_encode_length['b'](bstring)
        # Bias toward q.  5 is arbitrary.
        encoding = 'q' if qlen - blen < 5 else 'b'
    encoded = _cte_encoders[encoding](bstring)
    if lang:
        lang = '*' + lang
    return "=?{0}{1}?{2}?{3}?=".format(charset, lang, encoding, encoded)
OLD | NEW |