third_party/google-endpoints/future/backports/email/header.py - Issue 2666783008: Add google-endpoints to third_party/.

Side by Side Diff: third_party/google-endpoints/future/backports/email/header.py

Issue 2666783008: Add google-endpoints to third_party/. (Closed)

Patch Set: Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/google-endpoints/future/backports/email/generator.py ('k') | third_party/google-endpoints/future/backports/email/headerregistry.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 # Copyright (C) 2002-2007 Python Software Foundation

	2 # Author: Ben Gertzfield, Barry Warsaw

	3 # Contact: email-sig@python.org

	4

	5 """Header encoding and decoding functionality."""

	6 from __future__ import unicode_literals

	7 from __future__ import division

	8 from __future__ import absolute_import

	9 from future.builtins import bytes, range, str, super, zip

	10

	11 __all__ = [

	12 'Header',

	13 'decode_header',

	14 'make_header',

	15 ]

	16

	17 import re

	18 import binascii

	19

	20 from future.backports import email

	21 from future.backports.email import base64mime

	22 from future.backports.email.errors import HeaderParseError

	23 import future.backports.email.charset as _charset

	24

	25 # Helpers

	26 from future.backports.email.quoprimime import _max_append, header_decode

	27

	28 Charset = _charset.Charset

	29

	30 NL = '\n'

	31 SPACE = ' '

	32 BSPACE = b' '

	33 SPACE8 = ' ' * 8

	34 EMPTYSTRING = ''

	35 MAXLINELEN = 78

	36 FWS = ' \t'

	37

	38 USASCII = Charset('us-ascii')

	39 UTF8 = Charset('utf-8')

	40

	41 # Match encoded-word strings in the form =?charset?q?Hello_World?=

	42 ecre = re.compile(r'''

	43 =\? # literal =?

	44 (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset

	45 \? # literal ?

	46 (?P<encoding>[qb]) # either a "q" or a "b", case insensitive

	47 \? # literal ?

	48 (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string

	49 \?= # literal ?=

	50 ''', re.VERBOSE \| re.IGNORECASE \| re.MULTILINE)

	51

	52 # Field name regexp, including trailing colon, but not separating whitespace,

	53 # according to RFC 2822. Character range is from tilde to exclamation mark.

	54 # For use with .match()

	55 fcre = re.compile(r'[\041-\176]+:$')

	56

	57 # Find a header embedded in a putative header value. Used to check for

	58 # header injection attack.

	59 _embeded_header = re.compile(r'\n[^ \t]+:')

	60

	61

	62 def decode_header(header):

	63 """Decode a message header value without converting charset.

	64

	65 Returns a list of (string, charset) pairs containing each of the decoded

	66 parts of the header. Charset is None for non-encoded parts of the header,

	67 otherwise a lower-case string containing the name of the character set

	68 specified in the encoded string.

	69

	70 header may be a string that may or may not contain RFC2047 encoded words,

	71 or it may be a Header object.

	72

	73 An email.errors.HeaderParseError may be raised when certain decoding error

	74 occurs (e.g. a base64 decoding exception).

	75 """

	76 # If it is a Header object, we can just return the encoded chunks.

	77 if hasattr(header, '_chunks'):

	78 return [(_charset._encode(string, str(charset)), str(charset))

	79 for string, charset in header._chunks]

	80 # If no encoding, just return the header with no charset.

	81 if not ecre.search(header):

	82 return [(header, None)]

	83 # First step is to parse all the encoded parts into triplets of the form

	84 # (encoded_string, encoding, charset). For unencoded strings, the last

	85 # two parts will be None.

	86 words = []

	87 for line in header.splitlines():

	88 parts = ecre.split(line)

	89 first = True

	90 while parts:

	91 unencoded = parts.pop(0)

	92 if first:

	93 unencoded = unencoded.lstrip()

	94 first = False

	95 if unencoded:

	96 words.append((unencoded, None, None))

	97 if parts:

	98 charset = parts.pop(0).lower()

	99 encoding = parts.pop(0).lower()

	100 encoded = parts.pop(0)

	101 words.append((encoded, encoding, charset))

	102 # Now loop over words and remove words that consist of whitespace

	103 # between two encoded strings.

	104 import sys

	105 droplist = []

	106 for n, w in enumerate(words):

	107 if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace():

	108 droplist.append(n-1)

	109 for d in reversed(droplist):

	110 del words[d]

	111

	112 # The next step is to decode each encoded word by applying the reverse

	113 # base64 or quopri transformation. decoded_words is now a list of the

	114 # form (decoded_word, charset).

	115 decoded_words = []

	116 for encoded_string, encoding, charset in words:

	117 if encoding is None:

	118 # This is an unencoded word.

	119 decoded_words.append((encoded_string, charset))

	120 elif encoding == 'q':

	121 word = header_decode(encoded_string)

	122 decoded_words.append((word, charset))

	123 elif encoding == 'b':

	124 paderr = len(encoded_string) % 4 # Postel's law: add missing paddi ng

	125 if paderr:

	126 encoded_string += '==='[:4 - paderr]

	127 try:

	128 word = base64mime.decode(encoded_string)

	129 except binascii.Error:

	130 raise HeaderParseError('Base64 decoding error')

	131 else:

	132 decoded_words.append((word, charset))

	133 else:

	134 raise AssertionError('Unexpected encoding: ' + encoding)

	135 # Now convert all words to bytes and collapse consecutive runs of

	136 # similarly encoded words.

	137 collapsed = []

	138 last_word = last_charset = None

	139 for word, charset in decoded_words:

	140 if isinstance(word, str):

	141 word = bytes(word, 'raw-unicode-escape')

	142 if last_word is None:

	143 last_word = word

	144 last_charset = charset

	145 elif charset != last_charset:

	146 collapsed.append((last_word, last_charset))

	147 last_word = word

	148 last_charset = charset

	149 elif last_charset is None:

	150 last_word += BSPACE + word

	151 else:

	152 last_word += word

	153 collapsed.append((last_word, last_charset))

	154 return collapsed

	155

	156

	157 def make_header(decoded_seq, maxlinelen=None, header_name=None,

	158 continuation_ws=' '):

	159 """Create a Header from a sequence of pairs as returned by decode_header()

	160

	161 decode_header() takes a header value string and returns a sequence of

	162 pairs of the format (decoded_string, charset) where charset is the string

	163 name of the character set.

	164

	165 This function takes one of those sequence of pairs and returns a Header

	166 instance. Optional maxlinelen, header_name, and continuation_ws are as in

	167 the Header constructor.

	168 """

	169 h = Header(maxlinelen=maxlinelen, header_name=header_name,

	170 continuation_ws=continuation_ws)

	171 for s, charset in decoded_seq:

	172 # None means us-ascii but we can simply pass it on to h.append()

	173 if charset is not None and not isinstance(charset, Charset):

	174 charset = Charset(charset)

	175 h.append(s, charset)

	176 return h

	177

	178

	179 class Header(object):

	180 def __init__(self, s=None, charset=None,

	181 maxlinelen=None, header_name=None,

	182 continuation_ws=' ', errors='strict'):

	183 """Create a MIME-compliant header that can contain many character sets.

	184

	185 Optional s is the initial header value. If None, the initial header

	186 value is not set. You can later append to the header with .append()

	187 method calls. s may be a byte string or a Unicode string, but see the

	188 .append() documentation for semantics.

	189

	190 Optional charset serves two purposes: it has the same meaning as the

	191 charset argument to the .append() method. It also sets the default

	192 character set for all subsequent .append() calls that omit the charset

	193 argument. If charset is not provided in the constructor, the us-ascii

	194 charset is used both as s's initial charset and as the default for

	195 subsequent .append() calls.

	196

	197 The maximum line length can be specified explicitly via maxlinelen. For

	198 splitting the first line to a shorter value (to account for the field

	199 header which isn't included in s, e.g. `Subject') pass in the name of

	200 the field in header_name. The default maxlinelen is 78 as recommended

	201 by RFC 2822.

	202

	203 continuation_ws must be RFC 2822 compliant folding whitespace (usually

	204 either a space or a hard tab) which will be prepended to continuation

	205 lines.

	206

	207 errors is passed through to the .append() call.

	208 """

	209 if charset is None:

	210 charset = USASCII

	211 elif not isinstance(charset, Charset):

	212 charset = Charset(charset)

	213 self._charset = charset

	214 self._continuation_ws = continuation_ws

	215 self._chunks = []

	216 if s is not None:

	217 self.append(s, charset, errors)

	218 if maxlinelen is None:

	219 maxlinelen = MAXLINELEN

	220 self._maxlinelen = maxlinelen

	221 if header_name is None:

	222 self._headerlen = 0

	223 else:

	224 # Take the separating colon and space into account.

	225 self._headerlen = len(header_name) + 2

	226

	227 def __str__(self):

	228 """Return the string value of the header."""

	229 self._normalize()

	230 uchunks = []

	231 lastcs = None

	232 lastspace = None

	233 for string, charset in self._chunks:

	234 # We must preserve spaces between encoded and non-encoded word

	235 # boundaries, which means for us we need to add a space when we go

	236 # from a charset to None/us-ascii, or from None/us-ascii to a

	237 # charset. Only do this for the second and subsequent chunks.

	238 # Don't add a space if the None/us-ascii string already has

	239 # a space (trailing or leading depending on transition)

	240 nextcs = charset

	241 if nextcs == _charset.UNKNOWN8BIT:

	242 original_bytes = string.encode('ascii', 'surrogateescape')

	243 string = original_bytes.decode('ascii', 'replace')

	244 if uchunks:

	245 hasspace = string and self._nonctext(string[0])

	246 if lastcs not in (None, 'us-ascii'):

	247 if nextcs in (None, 'us-ascii') and not hasspace:

	248 uchunks.append(SPACE)

	249 nextcs = None

	250 elif nextcs not in (None, 'us-ascii') and not lastspace:

	251 uchunks.append(SPACE)

	252 lastspace = string and self._nonctext(string[-1])

	253 lastcs = nextcs

	254 uchunks.append(string)

	255 return EMPTYSTRING.join(uchunks)

	256

	257 # Rich comparison operators for equality only. BAW: does it make sense to

	258 # have or explicitly disable <, <=, >, >= operators?

	259 def __eq__(self, other):

	260 # other may be a Header or a string. Both are fine so coerce

	261 # ourselves to a unicode (of the unencoded header value), swap the

	262 # args and do another comparison.

	263 return other == str(self)

	264

	265 def __ne__(self, other):

	266 return not self == other

	267

	268 def append(self, s, charset=None, errors='strict'):

	269 """Append a string to the MIME header.

	270

	271 Optional charset, if given, should be a Charset instance or the name

	272 of a character set (which will be converted to a Charset instance). A

	273 value of None (the default) means that the charset given in the

	274 constructor is used.

	275

	276 s may be a byte string or a Unicode string. If it is a byte string

	277 (i.e. isinstance(s, str) is false), then charset is the encoding of

	278 that byte string, and a UnicodeError will be raised if the string

	279 cannot be decoded with that charset. If s is a Unicode string, then

	280 charset is a hint specifying the character set of the characters in

	281 the string. In either case, when producing an RFC 2822 compliant

	282 header using RFC 2047 rules, the string will be encoded using the

	283 output codec of the charset. If the string cannot be encoded to the

	284 output codec, a UnicodeError will be raised.

	285

	286 Optional `errors' is passed as the errors argument to the decode

	287 call if s is a byte string.

	288 """

	289 if charset is None:

	290 charset = self._charset

	291 elif not isinstance(charset, Charset):

	292 charset = Charset(charset)

	293 if not isinstance(s, str):

	294 input_charset = charset.input_codec or 'us-ascii'

	295 if input_charset == _charset.UNKNOWN8BIT:

	296 s = s.decode('us-ascii', 'surrogateescape')

	297 else:

	298 s = s.decode(input_charset, errors)

	299 # Ensure that the bytes we're storing can be decoded to the output

	300 # character set, otherwise an early error is raised.

	301 output_charset = charset.output_codec or 'us-ascii'

	302 if output_charset != _charset.UNKNOWN8BIT:

	303 try:

	304 s.encode(output_charset, errors)

	305 except UnicodeEncodeError:

	306 if output_charset!='us-ascii':

	307 raise

	308 charset = UTF8

	309 self._chunks.append((s, charset))

	310

	311 def _nonctext(self, s):

	312 """True if string s is not a ctext character of RFC822.

	313 """

	314 return s.isspace() or s in ('(', ')', '\\')

	315

	316 def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):

	317 r"""Encode a message header into an RFC-compliant format.

	318

	319 There are many issues involved in converting a given string for use in

	320 an email header. Only certain character sets are readable in most

	321 email clients, and as header strings can only contain a subset of

	322 7-bit ASCII, care must be taken to properly convert and encode (with

	323 Base64 or quoted-printable) header strings. In addition, there is a

	324 75-character length limit on any given encoded header field, so

	325 line-wrapping must be performed, even with double-byte character sets.

	326

	327 Optional maxlinelen specifies the maximum length of each generated

	328 line, exclusive of the linesep string. Individual lines may be longer

	329 than maxlinelen if a folding point cannot be found. The first line

	330 will be shorter by the length of the header name plus ": " if a header

	331 name was specified at Header construction time. The default value for

	332 maxlinelen is determined at header construction time.

	333

	334 Optional splitchars is a string containing characters which should be

	335 given extra weight by the splitting algorithm during normal header

	336 wrapping. This is in very rough support of RFC 2822's `higher level

	337 syntactic breaks': split points preceded by a splitchar are preferred

	338 during line splitting, with the characters preferred in the order in

	339 which they appear in the string. Space and tab may be included in the

	340 string to indicate whether preference should be given to one over the

	341 other as a split point when other split chars do not appear in the line

	342 being split. Splitchars does not affect RFC 2047 encoded lines.

	343

	344 Optional linesep is a string to be used to separate the lines of

	345 the value. The default value is the most useful for typical

	346 Python applications, but it can be set to \r\n to produce RFC-compliant

	347 line separators when needed.

	348 """

	349 self._normalize()

	350 if maxlinelen is None:

	351 maxlinelen = self._maxlinelen

	352 # A maxlinelen of 0 means don't wrap. For all practical purposes,

	353 # choosing a huge number here accomplishes that and makes the

	354 # _ValueFormatter algorithm much simpler.

	355 if maxlinelen == 0:

	356 maxlinelen = 1000000

	357 formatter = _ValueFormatter(self._headerlen, maxlinelen,

	358 self._continuation_ws, splitchars)

	359 lastcs = None

	360 hasspace = lastspace = None

	361 for string, charset in self._chunks:

	362 if hasspace is not None:

	363 hasspace = string and self._nonctext(string[0])

	364 import sys

	365 if lastcs not in (None, 'us-ascii'):

	366 if not hasspace or charset not in (None, 'us-ascii'):

	367 formatter.add_transition()

	368 elif charset not in (None, 'us-ascii') and not lastspace:

	369 formatter.add_transition()

	370 lastspace = string and self._nonctext(string[-1])

	371 lastcs = charset

	372 hasspace = False

	373 lines = string.splitlines()

	374 if lines:

	375 formatter.feed('', lines[0], charset)

	376 else:

	377 formatter.feed('', '', charset)

	378 for line in lines[1:]:

	379 formatter.newline()

	380 if charset.header_encoding is not None:

	381 formatter.feed(self._continuation_ws, ' ' + line.lstrip(),

	382 charset)

	383 else:

	384 sline = line.lstrip()

	385 fws = line[:len(line)-len(sline)]

	386 formatter.feed(fws, sline, charset)

	387 if len(lines) > 1:

	388 formatter.newline()

	389 if self._chunks:

	390 formatter.add_transition()

	391 value = formatter._str(linesep)

	392 if _embeded_header.search(value):

	393 raise HeaderParseError("header value appears to contain "

	394 "an embedded header: {!r}".format(value))

	395 return value

	396

	397 def _normalize(self):

	398 # Step 1: Normalize the chunks so that all runs of identical charsets

	399 # get collapsed into a single unicode string.

	400 chunks = []

	401 last_charset = None

	402 last_chunk = []

	403 for string, charset in self._chunks:

	404 if charset == last_charset:

	405 last_chunk.append(string)

	406 else:

	407 if last_charset is not None:

	408 chunks.append((SPACE.join(last_chunk), last_charset))

	409 last_chunk = [string]

	410 last_charset = charset

	411 if last_chunk:

	412 chunks.append((SPACE.join(last_chunk), last_charset))

	413 self._chunks = chunks

	414

	415

	416 class _ValueFormatter(object):

	417 def __init__(self, headerlen, maxlen, continuation_ws, splitchars):

	418 self._maxlen = maxlen

	419 self._continuation_ws = continuation_ws

	420 self._continuation_ws_len = len(continuation_ws)

	421 self._splitchars = splitchars

	422 self._lines = []

	423 self._current_line = _Accumulator(headerlen)

	424

	425 def _str(self, linesep):

	426 self.newline()

	427 return linesep.join(self._lines)

	428

	429 def __str__(self):

	430 return self._str(NL)

	431

	432 def newline(self):

	433 end_of_line = self._current_line.pop()

	434 if end_of_line != (' ', ''):

	435 self._current_line.push(*end_of_line)

	436 if len(self._current_line) > 0:

	437 if self._current_line.is_onlyws():

	438 self._lines[-1] += str(self._current_line)

	439 else:

	440 self._lines.append(str(self._current_line))

	441 self._current_line.reset()

	442

	443 def add_transition(self):

	444 self._current_line.push(' ', '')

	445

	446 def feed(self, fws, string, charset):

	447 # If the charset has no header encoding (i.e. it is an ASCII encoding)

	448 # then we must split the header at the "highest level syntactic break"

	449 # possible. Note that we don't have a lot of smarts about field

	450 # syntax; we just try to break on semi-colons, then commas, then

	451 # whitespace. Eventually, this should be pluggable.

	452 if charset.header_encoding is None:

	453 self._ascii_split(fws, string, self._splitchars)

	454 return

	455 # Otherwise, we're doing either a Base64 or a quoted-printable

	456 # encoding which means we don't need to split the line on syntactic

	457 # breaks. We can basically just find enough characters to fit on the

	458 # current line, minus the RFC 2047 chrome. What makes this trickier

	459 # though is that we have to split at octet boundaries, not character

	460 # boundaries but it's only safe to split at character boundaries so at

	461 # best we can only get close.

	462 encoded_lines = charset.header_encode_lines(string, self._maxlengths())

	463 # The first element extends the current line, but if it's None then

	464 # nothing more fit on the current line so start a new line.

	465 try:

	466 first_line = encoded_lines.pop(0)

	467 except IndexError:

	468 # There are no encoded lines, so we're done.

	469 return

	470 if first_line is not None:

	471 self._append_chunk(fws, first_line)

	472 try:

	473 last_line = encoded_lines.pop()

	474 except IndexError:

	475 # There was only one line.

	476 return

	477 self.newline()

	478 self._current_line.push(self._continuation_ws, last_line)

	479 # Everything else are full lines in themselves.

	480 for line in encoded_lines:

	481 self._lines.append(self._continuation_ws + line)

	482

	483 def _maxlengths(self):

	484 # The first line's length.

	485 yield self._maxlen - len(self._current_line)

	486 while True:

	487 yield self._maxlen - self._continuation_ws_len

	488

	489 def _ascii_split(self, fws, string, splitchars):

	490 # The RFC 2822 header folding algorithm is simple in principle but

	491 # complex in practice. Lines may be folded any place where "folding

	492 # white space" appears by inserting a linesep character in front of the

	493 # FWS. The complication is that not all spaces or tabs qualify as FWS,

	494 # and we are also supposed to prefer to break at "higher level

	495 # syntactic breaks". We can't do either of these without intimate

	496 # knowledge of the structure of structured headers, which we don't have

	497 # here. So the best we can do here is prefer to break at the specified

	498 # splitchars, and hope that we don't choose any spaces or tabs that

	499 # aren't legal FWS. (This is at least better than the old algorithm,

	500 # where we would sometimes introduce FWS after a splitchar, or the

	501 # algorithm before that, where we would turn all white space runs into

	502 # single spaces or tabs.)

	503 parts = re.split("(["+FWS+"]+)", fws+string)

	504 if parts[0]:

	505 parts[:0] = ['']

	506 else:

	507 parts.pop(0)

	508 for fws, part in zip([iter(parts)]2):

	509 self._append_chunk(fws, part)

	510

	511 def _append_chunk(self, fws, string):

	512 self._current_line.push(fws, string)

	513 if len(self._current_line) > self._maxlen:

	514 # Find the best split point, working backward from the end.

	515 # There might be none, on a long first line.

	516 for ch in self._splitchars:

	517 for i in range(self._current_line.part_count()-1, 0, -1):

	518 if ch.isspace():

	519 fws = self._current_line[i][0]

	520 if fws and fws[0]==ch:

	521 break

	522 prevpart = self._current_line[i-1][1]

	523 if prevpart and prevpart[-1]==ch:

	524 break

	525 else:

	526 continue

	527 break

	528 else:

	529 fws, part = self._current_line.pop()

	530 if self._current_line._initial_size > 0:

	531 # There will be a header, so leave it on a line by itself.

	532 self.newline()

	533 if not fws:

	534 # We don't use continuation_ws here because the whitespa ce

	535 # after a header should always be a space.

	536 fws = ' '

	537 self._current_line.push(fws, part)

	538 return

	539 remainder = self._current_line.pop_from(i)

	540 self._lines.append(str(self._current_line))

	541 self._current_line.reset(remainder)

	542

	543

	544 class _Accumulator(list):

	545

	546 def __init__(self, initial_size=0):

	547 self._initial_size = initial_size

	548 super().__init__()

	549

	550 def push(self, fws, string):

	551 self.append((fws, string))

	552

	553 def pop_from(self, i=0):

	554 popped = self[i:]

	555 self[i:] = []

	556 return popped

	557

	558 def pop(self):

	559 if self.part_count()==0:

	560 return ('', '')

	561 return super().pop()

	562

	563 def __len__(self):

	564 return sum((len(fws)+len(part) for fws, part in self),

	565 self._initial_size)

	566

	567 def __str__(self):

	568 return EMPTYSTRING.join((EMPTYSTRING.join((fws, part))

	569 for fws, part in self))

	570

	571 def reset(self, startval=None):

	572 if startval is None:

	573 startval = []

	574 self[:] = startval

	575 self._initial_size = 0

	576

	577 def is_onlyws(self):

	578 return self._initial_size==0 and (not self or str(self).isspace())

	579

	580 def part_count(self):

	581 return super().__len__()

OLD	NEW