OLD | NEW |
(Empty) | |
| 1 """Header value parser implementing various email-related RFC parsing rules. |
| 2 |
| 3 The parsing methods defined in this module implement various email related |
| 4 parsing rules. Principal among them is RFC 5322, which is the follow-on
| 5 to RFC 2822 and primarily a clarification of the former. It also implements |
| 6 RFC 2047 encoded word decoding. |
| 7 |
| 8 RFC 5322 goes to considerable trouble to maintain backward compatibility with |
| 9 RFC 822 in the parse phase, while cleaning up the structure on the generation |
| 10 phase. This parser supports correct RFC 5322 generation by tagging white space |
| 11 as folding white space only when folding is allowed in the non-obsolete rule |
| 12 sets. Actually, the parser is even more generous when accepting input than RFC |
| 13 5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages. |
| 14 Where possible deviations from the standard are annotated on the 'defects' |
| 15 attribute of tokens that deviate. |
| 16 |
| 17 The general structure of the parser follows RFC 5322, and uses its terminology |
| 18 where there is a direct correspondence. Where the implementation requires a |
| 19 somewhat different structure than that used by the formal grammar, new terms |
| 20 that mimic the closest existing terms are used. Thus, it really helps to have |
| 21 a copy of RFC 5322 handy when studying this code. |
| 22 |
| 23 Input to the parser is a string that has already been unfolded according to |
| 24 RFC 5322 rules. According to the RFC this unfolding is the very first step, and |
| 25 this parser leaves the unfolding step to a higher level message parser, which |
| 26 will have already detected the line breaks that need unfolding while |
| 27 determining the beginning and end of each header. |
| 28 |
| 29 The output of the parser is a TokenList object, which is a list subclass. A |
| 30 TokenList is a recursive data structure. The terminal nodes of the structure |
| 31 are Terminal objects, which are subclasses of str. These do not correspond |
| 32 directly to terminal objects in the formal grammar, but are instead more |
| 33 practical higher level combinations of true terminals. |
| 34 |
| 35 All TokenList and Terminal objects have a 'value' attribute, which produces the |
| 36 semantically meaningful value of that part of the parse subtree. The value of |
| 37 all whitespace tokens (no matter how many sub-tokens they may contain) is a |
| 38 single space, as per the RFC rules. This includes 'CFWS', which is herein |
| 39 included in the general class of whitespace tokens. There is one exception to |
| 40 the rule that whitespace tokens are collapsed into single spaces in values: in |
| 41 the value of a 'bare-quoted-string' (a quoted-string with no leading or |
| 42 trailing whitespace), any whitespace that appeared between the quotation marks |
| 43 is preserved in the returned value. Note that in all Terminal strings quoted |
| 44 pairs are turned into their unquoted values. |
| 45 |
| 46 All TokenList and Terminal objects also have a string value, which attempts to |
| 47 be a "canonical" representation of the RFC-compliant form of the substring that |
| 48 produced the parsed subtree, including minimal use of quoted pair quoting. |
| 49 Whitespace runs are not collapsed. |
| 50 |
| 51 Comment tokens also have a 'content' attribute providing the string found |
| 52 between the parens (including any nested comments) with whitespace preserved. |
| 53 |
| 54 All TokenList and Terminal objects have a 'defects' attribute which is a |
| 55 possibly empty list of all the defects found while creating the token. Defects
| 56 may appear on any token in the tree, and a composite list of all defects in the |
| 57 subtree is available through the 'all_defects' attribute of any node. (For |
| 58 Terminal nodes x.defects == x.all_defects.)
| 59 |
| 60 Each object in a parse tree is called a 'token', and each has a 'token_type' |
| 61 attribute that gives the name from the RFC 5322 grammar that it represents. |
| 62 Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that |
| 63 may be produced: 'ptext'. A 'ptext' is a string of printable ascii characters. |
| 64 It is returned in place of lists of (ctext/quoted-pair) and |
| 65 (qtext/quoted-pair). |
| 66 |
| 67 XXX: provide complete list of token types. |
| 68 """ |
| 69 from __future__ import print_function |
| 70 from __future__ import unicode_literals |
| 71 from __future__ import division |
| 72 from __future__ import absolute_import |
| 73 from future.builtins import int, range, str, super, list |
| 74 |
| 75 import re |
| 76 from collections import namedtuple, OrderedDict |
| 77 |
| 78 from future.backports.urllib.parse import (unquote, unquote_to_bytes) |
| 79 from future.backports.email import _encoded_words as _ew |
| 80 from future.backports.email import errors |
| 81 from future.backports.email import utils |
| 82 |
| 83 # |
| 84 # Useful constants and functions |
| 85 # |
| 86 |
# Character-class sets used by the terminal parsers below, mostly expressed
# as the set of characters that *ends* a run of the corresponding token text.
WSP = set(' \t')
CFWS_LEADER = WSP | set('(')
SPECIALS = set(r'()<>@,:;.\"[]')
ATOM_ENDS = SPECIALS | WSP
DOT_ATOM_ENDS = ATOM_ENDS - set('.')
# '.', '"', and '(' do not end phrases in order to support obs-phrase
PHRASE_ENDS = SPECIALS - set('."(')
TSPECIALS = (SPECIALS | set('/?=')) - set('.')
TOKEN_ENDS = TSPECIALS | WSP
ASPECIALS = TSPECIALS | set("*'%")
ATTRIBUTE_ENDS = ASPECIALS | WSP
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
| 99 |
def quote_string(value):
    """Return *value* as an RFC 5322 quoted-string.

    Backslashes and double quotes in the string form of *value* are
    protected with quoted-pairs and the result is wrapped in '"'.
    """
    escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
    return '"' + escaped + '"'
| 102 |
| 103 # |
| 104 # Accumulator for header folding |
| 105 # |
| 106 |
class _Folded(object):
    """Accumulator used while folding a header value into lines.

    Output is collected in two buffers: 'done' holds completed lines
    (including line separators) and 'current' holds the pieces of the
    line being built.  'stickyspace' holds leading whitespace that was
    peeled off a token and must be emitted immediately before the next
    token appended.
    """

    def __init__(self, maxlen, policy):
        # maxlen: maximum line length; may be float('+inf') for "no limit".
        # policy: email policy object; only its 'linesep' is read here.
        self.maxlen = maxlen
        self.policy = policy
        self.lastlen = 0
        self.stickyspace = None
        self.firstline = True
        self.done = []
        self.current = list()  # uses l.clear()

    def newline(self):
        # Commit the current line to 'done' and start a fresh one.
        self.done.extend(self.current)
        self.done.append(self.policy.linesep)
        self.current.clear()
        self.lastlen = 0

    def finalize(self):
        # Flush any partially built final line.
        if self.current:
            self.newline()

    def __str__(self):
        return ''.join(self.done)

    def append(self, stoken):
        # Unconditional append; the caller is responsible for length checks.
        self.current.append(stoken)

    def append_if_fits(self, token, stoken=None):
        """Append token if it fits on the current (or a fresh) line.

        Returns True if the token was placed, False if it is too long to
        fit on any line and the caller must fold it further.  stoken, if
        given, is the string rendering to use instead of str(token).
        """
        if stoken is None:
            stoken = str(token)
        l = len(stoken)
        if self.stickyspace is not None:
            # Pending whitespace must be emitted before this token.
            stickyspace_len = len(self.stickyspace)
            if self.lastlen + stickyspace_len + l <= self.maxlen:
                self.current.append(self.stickyspace)
                self.lastlen += stickyspace_len
                self.current.append(stoken)
                self.lastlen += l
                self.stickyspace = None
                self.firstline = False
                return True
            if token.has_fws:
                # The token has an internal fold point: absorb its leading
                # whitespace into stickyspace and let the token fold itself.
                ws = token.pop_leading_fws()
                if ws is not None:
                    self.stickyspace += str(ws)
                    stickyspace_len += len(ws)
                token._fold(self)
                return True
            if stickyspace_len and l + 1 <= self.maxlen:
                margin = self.maxlen - l
                if 0 < margin < stickyspace_len:
                    # Emit only part of the whitespace on this line.
                    # NOTE(review): after the slice, stickyspace_len is set
                    # to 'trim' rather than the remaining length ('margin');
                    # looks suspicious but is preserved as-is -- confirm
                    # against the upstream CPython implementation.
                    trim = stickyspace_len - margin
                    self.current.append(self.stickyspace[:trim])
                    self.stickyspace = self.stickyspace[trim:]
                    stickyspace_len = trim
                self.newline()
                self.current.append(self.stickyspace)
                self.current.append(stoken)
                self.lastlen = l + stickyspace_len
                self.stickyspace = None
                self.firstline = False
                return True
            if not self.firstline:
                self.newline()
                self.current.append(self.stickyspace)
                self.current.append(stoken)
                self.stickyspace = None
                self.firstline = False
                return True
        if self.lastlen + l <= self.maxlen:
            # Fits on the current line.
            self.current.append(stoken)
            self.lastlen += l
            return True
        if l < self.maxlen:
            # Fits on a line by itself; start a new one.
            self.newline()
            self.current.append(stoken)
            self.lastlen = l
            return True
        return False
| 186 |
| 187 # |
| 188 # TokenList and its subclasses |
| 189 # |
| 190 |
class TokenList(list):
    """Base class for all parse-tree nodes; a list of sub-tokens.

    str() of a node is the canonical serialization of its subtree,
    'value' is the semantic value, and 'defects' records the problems
    found while parsing this particular token (not its children).
    """

    token_type = None

    def __init__(self, *args, **kw):
        super(TokenList, self).__init__(*args, **kw)
        # Defects found while creating this token itself.
        self.defects = []

    def __str__(self):
        return ''.join(str(x) for x in self)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__,
                             super(TokenList, self).__repr__())

    @property
    def value(self):
        """Semantic value: the concatenated values of the children."""
        return ''.join(x.value for x in self if x.value)

    @property
    def all_defects(self):
        """This node's defects plus those of every descendant."""
        return sum((x.all_defects for x in self), self.defects)

    #
    # Folding API
    #
    # parts():
    #
    #   return a list of objects that constitute the "higher level syntactic
    #   objects" specified by the RFC as the best places to fold a header line.
    #   The returned objects must include leading folding white space, even if
    #   this means mutating the underlying parse tree of the object.  Each
    #   object is only responsible for returning *its* parts, and should not
    #   drill down to any lower level except as required to meet the leading
    #   folding white space constraint.
    #
    # _fold(folded):
    #
    #   folded: the result accumulator.  This is an instance of _Folded.
    #       (XXX: I haven't finished factoring this out yet, the folding code
    #       pretty much uses this as a state object.)  When the folded.current
    #       contains as much text as will fit, the _fold method should call
    #       folded.newline.
    #   folded.lastlen: the current length of the text stored in folded.current.
    #   folded.maxlen: The maximum number of characters that may appear on a
    #       folded line.  Differs from the policy setting in that "no limit" is
    #       represented by +inf, which means it can be used in the trivially
    #       logical fashion in comparisons.
    #
    # Currently no subclasses implement parts, and I think this will remain
    # true.  A subclass only needs to implement _fold when the generic version
    # isn't sufficient.  _fold will need to be implemented primarily when it is
    # possible for encoded words to appear in the specialized token-list, since
    # there is no generic algorithm that can know where exactly the encoded
    # words are allowed.  A _fold implementation is responsible for filling
    # lines in the same general way that the top level _fold does.  It may, and
    # should, call the _fold method of sub-objects in a similar fashion to that
    # of the top level _fold.
    #
    # XXX: I'm hoping it will be possible to factor the existing code further
    # to reduce redundancy and make the logic clearer.

    @property
    def parts(self):
        """Yield the best-fold-point chunks of this token list.

        Chunks are split before leading folding white space; trailing
        whitespace of a token is moved to the start of the next chunk.
        """
        klass = self.__class__
        this = list()
        for token in self:
            if token.startswith_fws():
                if this:
                    yield this[0] if len(this)==1 else klass(this)
                    this.clear()
            end_ws = token.pop_trailing_ws()
            this.append(token)
            if end_ws:
                yield klass(this)
                this = [end_ws]
        if this:
            yield this[0] if len(this)==1 else klass(this)

    def startswith_fws(self):
        return self[0].startswith_fws()

    def pop_leading_fws(self):
        # Remove and return the leading folding whitespace, if any,
        # recursing into the first child when it is nested.
        if self[0].token_type == 'fws':
            return self.pop(0)
        return self[0].pop_leading_fws()

    def pop_trailing_ws(self):
        # Remove and return trailing CFWS, recursing into the last child.
        if self[-1].token_type == 'cfws':
            return self.pop(-1)
        return self[-1].pop_trailing_ws()

    @property
    def has_fws(self):
        """True if any descendant contains folding whitespace."""
        for part in self:
            if part.has_fws:
                return True
        return False

    def has_leading_comment(self):
        return self[0].has_leading_comment()

    @property
    def comments(self):
        """All comment contents found anywhere in the subtree."""
        comments = []
        for token in self:
            comments.extend(token.comments)
        return comments

    def fold(self, **_3to2kwargs):
        """Fold this value according to *policy* and return the string."""
        # max_line_length 0/None means no limit, ie: infinitely long.
        policy = _3to2kwargs['policy']; del _3to2kwargs['policy']
        maxlen = policy.max_line_length or float("+inf")
        folded = _Folded(maxlen, policy)
        self._fold(folded)
        folded.finalize()
        return str(folded)

    def as_encoded_word(self, charset):
        """Return this token's text as an RFC 2047 encoded word."""
        # This works only for things returned by 'parts', which include
        # the leading fws, if any, that should be used.
        res = []
        ws = self.pop_leading_fws()
        if ws:
            res.append(ws)
        trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
        res.append(_ew.encode(str(self), charset))
        res.append(trailer)
        return ''.join(res)

    def cte_encode(self, charset, policy):
        """Content-transfer-encode the subtree (generic version)."""
        res = []
        for part in self:
            res.append(part.cte_encode(charset, policy))
        return ''.join(res)

    def _fold(self, folded):
        # Generic folding: place each part, CTE-encoding non-ASCII parts.
        for part in self.parts:
            tstr = str(part)
            try:
                str(part).encode('us-ascii')
            except UnicodeEncodeError:
                if any(isinstance(x, errors.UndecodableBytesDefect)
                       for x in part.all_defects):
                    charset = 'unknown-8bit'
                else:
                    # XXX: this should be a policy setting
                    charset = 'utf-8'
                tstr = part.cte_encode(charset, folded.policy)
            if folded.append_if_fits(part, tstr):
                continue
            # Peel off the leading whitespace if any and make it sticky, to
            # avoid infinite recursion.
            ws = part.pop_leading_fws()
            if ws is not None:
                # BUG FIX: this used str(part.pop(0)), which discarded the
                # whitespace already popped into 'ws' and consumed the next
                # real token as if it were whitespace, silently dropping
                # content.  Use the popped whitespace itself, as the sibling
                # implementation in UnstructuredTokenList._fold does.
                folded.stickyspace = str(ws)
                if folded.append_if_fits(part):
                    continue
            if part.has_fws:
                part._fold(folded)
                continue
            # There are no fold points in this one; it is too long for a single
            # line and can't be split...we just have to put it on its own line.
            folded.append(tstr)
            folded.newline()

    def pprint(self, indent=''):
        """Print an indented rendering of the parse tree."""
        # BUG FIX: 'indent' was ignored (indent='' was always passed on).
        print('\n'.join(self._pp(indent=indent)))

    def ppstr(self, indent=''):
        """Return the indented rendering of the parse tree as a string."""
        # BUG FIX: 'indent' was ignored (indent='' was always passed on).
        return '\n'.join(self._pp(indent=indent))

    def _pp(self, indent=''):
        # Recursive generator producing one display line per node.
        yield '{}{}/{}('.format(
            indent,
            self.__class__.__name__,
            self.token_type)
        for token in self:
            if not hasattr(token, '_pp'):
                yield (indent + '    !! invalid element in token '
                                        'list: {!r}'.format(token))
            else:
                for line in token._pp(indent+'    '):
                    yield line
        if self.defects:
            extra = ' Defects: {}'.format(self.defects)
        else:
            extra = ''
        yield '{}){}'.format(indent, extra)
| 384 |
| 385 |
class WhiteSpaceTokenList(TokenList):
    """A token list whose semantic value is a single space.

    Per the RFC rules, a whitespace run (including CFWS) collapses to one
    space in the value; any comments it contains remain reachable through
    the 'comments' property.
    """

    @property
    def value(self):
        return ' '

    @property
    def comments(self):
        found = []
        for token in self:
            if token.token_type == 'comment':
                found.append(token.content)
        return found
| 395 |
| 396 |
class UnstructuredTokenList(TokenList):
    """Token list for an unstructured header value (e.g. Subject).

    Handles generation of RFC 2047 encoded words when folding values
    that contain non-ASCII characters.
    """

    token_type = 'unstructured'

    def _fold(self, folded):
        # If the parse produced encoded-word tokens, use EW-aware folding.
        if any(x.token_type=='encoded-word' for x in self):
            return self._fold_encoded(folded)
        # Here we can have either a pure ASCII string that may or may not
        # have surrogateescape encoded bytes, or a unicode string.
        last_ew = None  # index into folded.current of the last encoded word
        for part in self.parts:
            tstr = str(part)
            is_ew = False
            try:
                str(part).encode('us-ascii')
            except UnicodeEncodeError:
                if any(isinstance(x, errors.UndecodableBytesDefect)
                       for x in part.all_defects):
                    charset = 'unknown-8bit'
                else:
                    charset = 'utf-8'
                if last_ew is not None:
                    # We've already done an EW, combine this one with it
                    # if there's room.
                    chunk = get_unstructured(
                        ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
                    oldlastlen = sum(len(x) for x in folded.current[:last_ew])
                    schunk = str(chunk)
                    lchunk = len(schunk)
                    if oldlastlen + lchunk <= folded.maxlen:
                        del folded.current[last_ew:]
                        folded.append(schunk)
                        folded.lastlen = oldlastlen + lchunk
                        continue
                tstr = part.as_encoded_word(charset)
                is_ew = True
            if folded.append_if_fits(part, tstr):
                if is_ew:
                    last_ew = len(folded.current) - 1
                continue
            # NOTE(review): 'last_ew' is tested for truthiness, so an encoded
            # word stored at index 0 is treated as absent -- preserved as-is.
            if is_ew or last_ew:
                # It's too big to fit on the line, but since we've
                # got encoded words we can use encoded word folding.
                part._fold_as_ew(folded)
                continue
            # Peel off the leading whitespace if any and make it sticky, to
            # avoid infinite recursion.
            ws = part.pop_leading_fws()
            if ws is not None:
                folded.stickyspace = str(ws)
                if folded.append_if_fits(part):
                    continue
            if part.has_fws:
                # BUG FIX: was part.fold(folded); fold() accepts only the
                # keyword argument 'policy' (see TokenList.fold), so the
                # positional call raised TypeError.  The recursive folding
                # entry point is _fold().
                part._fold(folded)
                continue
            # It can't be split...we just have to put it on its own line.
            folded.append(tstr)
            folded.newline()
            last_ew = None

    def cte_encode(self, charset, policy):
        """CTE-encode the value, combining adjacent non-ASCII parts."""
        res = []
        last_ew = None
        for part in self:
            spart = str(part)
            try:
                spart.encode('us-ascii')
                res.append(spart)
            except UnicodeEncodeError:
                if last_ew is None:
                    res.append(part.cte_encode(charset, policy))
                    last_ew = len(res)
                else:
                    tl = get_unstructured(''.join(res[last_ew:] + [spart]))
                    # BUG FIX: as_encoded_word() requires the charset
                    # argument; without it this raised TypeError as soon as
                    # a second non-ASCII part was encountered.
                    res.append(tl.as_encoded_word(charset))
        return ''.join(res)
| 473 |
| 474 |
class Phrase(TokenList):
    """An RFC 5322 'phrase' (as used in display names and keywords)."""

    token_type = 'phrase'

    def _fold(self, folded):
        # As with Unstructured, we can have pure ASCII with or without
        # surrogateescape encoded bytes, or we could have unicode.  But this
        # case is more complicated, since we have to deal with the various
        # sub-token types and how they can be composed in the face of
        # unicode-that-needs-CTE-encoding, and the fact that if a token a
        # comment that becomes a barrier across which we can't compose encoded
        # words.
        last_ew = None
        for part in self.parts:
            tstr = str(part)
            tlen = len(tstr)
            has_ew = False
            try:
                str(part).encode('us-ascii')
            except UnicodeEncodeError:
                if any(isinstance(x, errors.UndecodableBytesDefect)
                        for x in part.all_defects):
                    charset = 'unknown-8bit'
                else:
                    charset = 'utf-8'
                if last_ew is not None and not part.has_leading_comment():
                    # We've already done an EW, let's see if we can combine
                    # this one with it.  The last_ew logic ensures that all we
                    # have at this point is atoms, no comments or quoted
                    # strings.  So we can treat the text between the last
                    # encoded word and the content of this token as
                    # unstructured text, and things will work correctly.  But
                    # we have to strip off any trailing comment on this token
                    # first, and if it is a quoted string we have to pull out
                    # the content (we're encoding it, so it no longer needs to
                    # be quoted).
                    # NOTE(review): 'remainder' is computed here but never
                    # used afterwards -- confirm against upstream whether the
                    # popped trailing cfws should be re-appended.
                    if part[-1].token_type == 'cfws' and part.comments:
                        remainder = part.pop(-1)
                    else:
                        remainder = ''
                    for i, token in enumerate(part):
                        if token.token_type == 'bare-quoted-string':
                            part[i] = UnstructuredTokenList(token[:])
                    chunk = get_unstructured(
                        ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
                    schunk = str(chunk)
                    lchunk = len(schunk)
                    # NOTE(review): this compares the index 'last_ew' (not a
                    # prefix length as UnstructuredTokenList._fold does) with
                    # the line limit -- looks suspicious; preserved as-is.
                    if last_ew + lchunk <= folded.maxlen:
                        del folded.current[last_ew:]
                        folded.append(schunk)
                        folded.lastlen = sum(len(x) for x in folded.current)
                        continue
                tstr = part.as_encoded_word(charset)
                tlen = len(tstr)
                has_ew = True
            if folded.append_if_fits(part, tstr):
                if has_ew and not part.comments:
                    last_ew = len(folded.current) - 1
                elif part.comments or part.token_type == 'quoted-string':
                    # If a comment is involved we can't combine EWs.  And if a
                    # quoted string is involved, it's not worth the effort to
                    # try to combine them.
                    last_ew = None
                continue
            part._fold(folded)

    def cte_encode(self, charset, policy):
        """CTE-encode the phrase, combining encoded words where legal."""
        res = []
        last_ew = None
        is_ew = False
        for part in self:
            spart = str(part)
            try:
                spart.encode('us-ascii')
                res.append(spart)
            except UnicodeEncodeError:
                is_ew = True
                if last_ew is None:
                    # Comments block encoded-word combination.
                    if not part.comments:
                        last_ew = len(res)
                    res.append(part.cte_encode(charset, policy))
                elif not part.has_leading_comment():
                    # NOTE(review): 'remainder' is computed but unused here
                    # too -- see the matching note in _fold above.
                    if part[-1].token_type == 'cfws' and part.comments:
                        remainder = part.pop(-1)
                    else:
                        remainder = ''
                    for i, token in enumerate(part):
                        if token.token_type == 'bare-quoted-string':
                            part[i] = UnstructuredTokenList(token[:])
                    tl = get_unstructured(''.join(res[last_ew:] + [spart]))
                    res[last_ew:] = [tl.as_encoded_word(charset)]
            if part.comments or (not is_ew and part.token_type == 'quoted-string'):
                last_ew = None
        return ''.join(res)
| 569 |
class Word(TokenList):
    """A 'word' as defined by the RFC 5322 grammar."""

    token_type = 'word'
| 573 |
| 574 |
class CFWSList(WhiteSpaceTokenList):
    """A run of comments and/or folding white space."""

    token_type = 'cfws'

    def has_leading_comment(self):
        # True when this CFWS run contains at least one comment.
        return len(self.comments) > 0
| 581 |
| 582 |
class Atom(TokenList):
    """An 'atom' as defined by the RFC 5322 grammar."""

    token_type = 'atom'
| 586 |
| 587 |
class Token(TokenList):
    """A 'token' (cf. the MIME token grammar used for parameters)."""

    token_type = 'token'
| 591 |
| 592 |
class EncodedWord(TokenList):
    """An RFC 2047 encoded word.

    'cte', 'charset' and 'lang' record the coding attributes parsed from
    (or to be used when generating) the encoded word.
    """

    token_type = 'encoded-word'
    cte = None
    charset = None
    lang = None

    @property
    def encoded(self):
        """The encoded form, re-encoding from the value when not cached."""
        if self.cte is not None:
            return self.cte
        # BUG FIX: the encode result was computed and discarded, so this
        # property always returned None when no cte was cached.
        return _ew.encode(str(self), self.charset)
| 605 |
| 606 |
| 607 |
class QuotedString(TokenList):
    """A quoted-string, possibly with surrounding CFWS."""

    token_type = 'quoted-string'

    @property
    def content(self):
        # Value of the embedded bare-quoted-string (None if absent).
        return next((tok.value for tok in self
                     if tok.token_type == 'bare-quoted-string'), None)

    @property
    def quoted_value(self):
        # Like 'value', but the quoted part keeps its quotation marks.
        pieces = [str(tok) if tok.token_type == 'bare-quoted-string'
                  else tok.value
                  for tok in self]
        return ''.join(pieces)

    @property
    def stripped_value(self):
        # The text between the quotation marks (None if absent).
        return next((tok.value for tok in self
                     if tok.token_type == 'bare-quoted-string'), None)
| 633 |
| 634 |
class BareQuotedString(QuotedString):
    """The content of a quoted-string, without surrounding CFWS."""

    token_type = 'bare-quoted-string'

    @property
    def value(self):
        # Interior whitespace is preserved verbatim in the value.
        return ''.join(str(tok) for tok in self)

    def __str__(self):
        # Re-quote the content when rendering the canonical form.
        return quote_string(self.value)
| 645 |
| 646 |
class Comment(WhiteSpaceTokenList):
    """An RFC 5322 comment: '(' ... ')', possibly nested.

    str() renders the comment with its parens and with backslash, '('
    and ')' protected by quoted-pairs; 'content' is the text between
    the parens.
    """

    token_type = 'comment'

    def __str__(self):
        # Equivalent to the old sum-of-lists construction, without the
        # intermediate list concatenation.
        return '(' + ''.join(self.quote(x) for x in self) + ')'

    def quote(self, value):
        """Return the string form of a subtoken with specials escaped.

        Nested comments are already escaped and parenthesized by their
        own __str__, so they pass through unchanged.
        """
        if value.token_type == 'comment':
            return str(value)
        # BUG FIX: '\(' and '\)' are invalid escape sequences (a
        # DeprecationWarning, and a SyntaxWarning/SyntaxError in newer
        # Pythons); use raw strings for the identical two-character
        # replacement text.
        return str(value).replace('\\', '\\\\').replace(
                                  '(', r'\(').replace(
                                  ')', r'\)')

    @property
    def content(self):
        # Text between the parens, whitespace and nested comments included.
        return ''.join(str(x) for x in self)

    @property
    def comments(self):
        return [self.content]
| 672 |
class AddressList(TokenList):
    """The top-level list of addresses in an address header."""

    token_type = 'address-list'

    @property
    def addresses(self):
        return [tok for tok in self if tok.token_type == 'address']

    @property
    def mailboxes(self):
        # Valid mailboxes from every address, flattened.
        result = []
        for tok in self:
            if tok.token_type == 'address':
                result.extend(tok.mailboxes)
        return result

    @property
    def all_mailboxes(self):
        # Valid and invalid mailboxes from every address, flattened.
        result = []
        for tok in self:
            if tok.token_type == 'address':
                result.extend(tok.all_mailboxes)
        return result
| 690 |
| 691 |
class Address(TokenList):
    """A single address: either a mailbox or a group."""

    token_type = 'address'

    @property
    def display_name(self):
        # Only the group form carries a display name at this level.
        if self[0].token_type == 'group':
            return self[0].display_name
        return None

    @property
    def mailboxes(self):
        first = self[0]
        if first.token_type == 'mailbox':
            return [first]
        if first.token_type == 'invalid-mailbox':
            return []
        return first.mailboxes

    @property
    def all_mailboxes(self):
        first = self[0]
        if first.token_type == 'mailbox':
            return [first]
        if first.token_type == 'invalid-mailbox':
            return [first]
        return first.all_mailboxes
| 716 |
class MailboxList(TokenList):
    """A comma-separated list of (possibly invalid) mailboxes."""

    token_type = 'mailbox-list'

    @property
    def mailboxes(self):
        # Only the well-formed mailboxes.
        return [tok for tok in self if tok.token_type == 'mailbox']

    @property
    def all_mailboxes(self):
        # Well-formed and invalid mailboxes alike.
        wanted = ('mailbox', 'invalid-mailbox')
        return [tok for tok in self if tok.token_type in wanted]
| 729 |
| 730 |
class GroupList(TokenList):
    """The mailbox list inside a group (between ':' and ';')."""

    token_type = 'group-list'

    @property
    def mailboxes(self):
        if self and self[0].token_type == 'mailbox-list':
            return self[0].mailboxes
        return []

    @property
    def all_mailboxes(self):
        if self and self[0].token_type == 'mailbox-list':
            return self[0].all_mailboxes
        return []
| 746 |
| 747 |
class Group(TokenList):
    """An RFC 5322 group: display-name ':' [group-list] ';'."""

    token_type = "group"

    @property
    def mailboxes(self):
        # self[2] is the group-list when one was parsed.
        if self[2].token_type == 'group-list':
            return self[2].mailboxes
        return []

    @property
    def all_mailboxes(self):
        if self[2].token_type == 'group-list':
            return self[2].all_mailboxes
        return []

    @property
    def display_name(self):
        return self[0].display_name
| 767 |
| 768 |
class NameAddr(TokenList):
    """[display-name] angle-addr."""

    token_type = 'name-addr'

    @property
    def display_name(self):
        # A bare angle-addr (single child) has no display name.
        return self[0].display_name if len(self) > 1 else None

    @property
    def local_part(self):
        return self[-1].local_part

    @property
    def domain(self):
        return self[-1].domain

    @property
    def route(self):
        return self[-1].route

    @property
    def addr_spec(self):
        return self[-1].addr_spec
| 794 |
| 795 |
class AngleAddr(TokenList):
    """'&lt;' [addr-spec] '&gt;' with optional obsolete route."""

    token_type = 'angle-addr'

    def _find(self, token_type):
        # First direct child of the given token_type, or None.
        for tok in self:
            if tok.token_type == token_type:
                return tok
        return None

    @property
    def local_part(self):
        addr = self._find('addr-spec')
        return addr.local_part if addr is not None else None

    @property
    def domain(self):
        addr = self._find('addr-spec')
        return addr.domain if addr is not None else None

    @property
    def route(self):
        rt = self._find('obs-route')
        return rt.domains if rt is not None else None

    @property
    def addr_spec(self):
        addr = self._find('addr-spec')
        # An empty angle-addr renders as '<>'.
        return addr.addr_spec if addr is not None else '<>'
| 825 |
| 826 |
class ObsRoute(TokenList):
    """Obsolete routing information ('@domain,@domain:')."""

    token_type = 'obs-route'

    @property
    def domains(self):
        result = []
        for tok in self:
            if tok.token_type == 'domain':
                result.append(tok.domain)
        return result
| 834 |
| 835 |
class Mailbox(TokenList):
    """A mailbox: either a name-addr or a bare addr-spec."""

    token_type = 'mailbox'

    @property
    def display_name(self):
        # Only the name-addr form carries a display name.
        first = self[0]
        return first.display_name if first.token_type == 'name-addr' else None

    @property
    def local_part(self):
        return self[0].local_part

    @property
    def domain(self):
        return self[0].domain

    @property
    def route(self):
        # Routes exist only in the (obsolete) name-addr form.
        first = self[0]
        return first.route if first.token_type == 'name-addr' else None

    @property
    def addr_spec(self):
        return self[0].addr_spec
| 861 |
| 862 |
class InvalidMailbox(TokenList):
    """Placeholder for a mailbox that could not be parsed."""

    token_type = 'invalid-mailbox'

    @property
    def display_name(self):
        return None

    # All the mailbox accessors exist but uniformly return None.
    local_part = domain = route = addr_spec = display_name
| 872 |
| 873 |
class Domain(TokenList):
    """A domain; 'domain' yields the value with whitespace removed."""

    token_type = 'domain'

    @property
    def domain(self):
        # Remove all (folding) whitespace from the semantic value.
        semantic = super(Domain, self).value
        return ''.join(semantic.split())
| 881 |
| 882 |
class DotAtom(TokenList):
    """A 'dot-atom' as defined by the RFC 5322 grammar."""

    token_type = 'dot-atom'
| 886 |
| 887 |
class DotAtomText(TokenList):
    """The 'dot-atom-text' inside a dot-atom (no surrounding CFWS)."""

    token_type = 'dot-atom-text'
| 891 |
| 892 |
class AddrSpec(TokenList):
    """local-part ['@' domain]."""

    token_type = 'addr-spec'

    @property
    def local_part(self):
        return self[0].local_part

    @property
    def domain(self):
        # Fewer than three children means there is no '@ domain' part.
        return self[-1].domain if len(self) >= 3 else None

    @property
    def value(self):
        if len(self) < 3:
            return self[0].value
        # Trim the whitespace adjacent to the '@'.
        return (self[0].value.rstrip() + self[1].value
                + self[2].value.lstrip())

    @property
    def addr_spec(self):
        lp = self.local_part
        # Quote the local part if it contains any character that would
        # end a dot-atom (equivalent to the old set-cardinality test).
        if set(lp) & DOT_ATOM_ENDS:
            lp = quote_string(lp)
        if self.domain is not None:
            return lp + '@' + self.domain
        return lp
| 923 |
| 924 |
class ObsLocalPart(TokenList):
    """An 'obs-local-part' (obsolete local-part syntax)."""

    token_type = 'obs-local-part'
| 928 |
| 929 |
class DisplayName(Phrase):
    """A display-name phrase (the human-readable part of a name-addr)."""

    token_type = 'display-name'

    @property
    def display_name(self):
        """The semantic display name with outer CFWS stripped."""
        res = TokenList(self)
        if res[0].token_type == 'cfws':
            res.pop(0)
        else:
            # NOTE(review): assumes the first child is itself a token list
            # when it is not bare cfws -- confirm for all parser outputs.
            if res[0][0].token_type == 'cfws':
                res[0] = TokenList(res[0][1:])
        if res[-1].token_type == 'cfws':
            res.pop()
        else:
            if res[-1][-1].token_type == 'cfws':
                res[-1] = TokenList(res[-1][:-1])
        return res.value

    @property
    def value(self):
        """The phrase value, re-quoted when quoting is required.

        Quoting is used when the phrase parsed with defects or contains a
        quoted-string; otherwise the plain Phrase value is returned.
        """
        quote = False
        if self.defects:
            quote = True
        else:
            for x in self:
                if x.token_type == 'quoted-string':
                    quote = True
        if quote:
            # Preserve a single leading/trailing space when the original
            # had leading/trailing CFWS.
            pre = post = ''
            if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
                pre = ' '
            if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
                post = ' '
            return pre+quote_string(self.display_name)+post
        else:
            return super(DisplayName, self).value
| 967 |
| 968 |
class LocalPart(TokenList):
    """The local-part (left of '@') of an addr-spec."""

    token_type = 'local-part'

    @property
    def value(self):
        # A quoted-string local part keeps its quote marks in the value.
        if self[0].token_type == "quoted-string":
            return self[0].quoted_value
        else:
            return self[0].value

    @property
    def local_part(self):
        # Strip whitespace from front, back, and around dots.
        # DOT is presumably a module-level 'dot' terminal defined elsewhere
        # in this file (not visible in this chunk); it brackets the token
        # sequence as a sentinel -- TODO confirm.
        res = [DOT]
        last = DOT
        last_is_tl = False
        for tok in self[0] + [DOT]:
            if tok.token_type == 'cfws':
                continue
            if (last_is_tl and tok.token_type == 'dot' and
                    last[-1].token_type == 'cfws'):
                # Previous token list ended with cfws before a dot: drop it.
                res[-1] = TokenList(last[:-1])
            is_tl = isinstance(tok, TokenList)
            if (is_tl and last.token_type == 'dot' and
                    tok[0].token_type == 'cfws'):
                # Token list starts with cfws right after a dot: drop it.
                res.append(TokenList(tok[1:]))
            else:
                res.append(tok)
            last = res[-1]
            last_is_tl = is_tl
        # Drop the sentinel DOTs before computing the value.
        res = TokenList(res[1:-1])
        return res.value
| 1002 |
| 1003 |
class DomainLiteral(TokenList):
    """Token for an RFC 5322 domain-literal, e.g. ``[192.168.0.1]``."""

    token_type = 'domain-literal'

    @property
    def domain(self):
        """The rendered literal with all whitespace removed."""
        rendered = super(DomainLiteral, self).value
        return ''.join(rendered.split())

    @property
    def ip(self):
        """The address text between the brackets (first ptext child)."""
        return next(
            (tok.value for tok in self if tok.token_type == 'ptext'),
            None)
| 1017 |
| 1018 |
class MIMEVersion(TokenList):
    """The parsed value of a MIME-Version header."""

    token_type = 'mime-version'
    # Set by the parser to the integer version components; left as None
    # when the value could not be parsed.
    major = None
    minor = None
| 1024 |
| 1025 |
class Parameter(TokenList):
    """A single MIME parameter, possibly RFC 2231 sectioned/extended."""

    token_type = 'parameter'
    # True when the parameter name carried an RFC 2231 section (name*N=).
    sectioned = False
    # True when the parameter used RFC 2231 extended (charset'lang'%xx) form.
    extended = False
    # Charset declared in the first section's extended value, if any.
    charset = 'us-ascii'

    @property
    def section_number(self):
        # Because the first token, the attribute (name) eats CFWS, the second
        # token is always the section if there is one.
        return self[1].number if self.sectioned else 0

    @property
    def param_value(self):
        # This is part of the "handle quoted extended parameters" hack.
        # Dig through an optional quoted-string wrapper to find the value
        # token; the loop variable is deliberately rebound at each level.
        for token in self:
            if token.token_type == 'value':
                return token.stripped_value
            if token.token_type == 'quoted-string':
                for token in token:
                    if token.token_type == 'bare-quoted-string':
                        for token in token:
                            if token.token_type == 'value':
                                return token.stripped_value
        return ''
| 1052 |
| 1053 |
class InvalidParameter(Parameter):
    """A parameter that could not be parsed; carried along verbatim."""

    token_type = 'invalid-parameter'
| 1057 |
| 1058 |
class Attribute(TokenList):
    """Token for a MIME parameter attribute (the parameter name)."""

    token_type = 'attribute'

    @property
    def stripped_value(self):
        """Value of the first (extended-)attrtext child, ignoring CFWS."""
        candidates = (tok.value for tok in self
                      if tok.token_type.endswith('attrtext'))
        return next(candidates, None)
| 1068 |
class Section(TokenList):
    """The '*N' section marker of an RFC 2231 multi-part parameter."""

    token_type = 'section'
    # Set by the parser to the integer section number.
    number = None
| 1073 |
| 1074 |
class Value(TokenList):
    """Token for a MIME parameter value."""

    token_type = 'value'

    @property
    def stripped_value(self):
        """The value with surrounding CFWS (and quoting) removed."""
        # Skip a leading CFWS token, if present.
        first = self[1] if self[0].token_type == 'cfws' else self[0]
        strippable = ('quoted-string', 'attribute', 'extended-attribute')
        if first.token_type.endswith(strippable):
            return first.stripped_value
        return self.value
| 1088 |
| 1089 |
class MimeParameters(TokenList):
    """The parameter list of a parameterized header value.

    Handles reassembly of RFC 2231 sectioned and extended parameters.
    """

    token_type = 'mime-parameters'

    @property
    def params(self):
        """Yield (name, value) pairs with RFC 2231 sections reassembled.

        The RFC specifically states that the ordering of parameters is not
        guaranteed and may be reordered by the transport layer.  So we have
        to assume the RFC 2231 pieces can come in any order.  However, we
        output them in the order that we first see a given name, which gives
        us a stable __str__.
        """
        params = OrderedDict()
        for token in self:
            if not token.token_type.endswith('parameter'):
                continue
            if token[0].token_type != 'attribute':
                continue
            name = token[0].value.strip()
            if name not in params:
                params[name] = []
            params[name].append((token.section_number, token))
        for name, parts in params.items():
            # Sort on the section number only.  Sorting the (number, token)
            # tuples directly falls back to comparing the token objects when
            # section numbers collide (e.g. a duplicated parameter name),
            # which can raise TypeError.
            parts = sorted(parts, key=lambda part: part[0])
            # XXX: there might be more recovery we could do here if, for
            # example, this is really a case of a duplicate attribute name.
            value_parts = []
            charset = parts[0][1].charset
            for i, (section_number, param) in enumerate(parts):
                if section_number != i:
                    # Missing or repeated section; note it and keep going.
                    param.defects.append(errors.InvalidHeaderDefect(
                        "inconsistent multipart parameter numbering"))
                value = param.param_value
                if param.extended:
                    try:
                        value = unquote_to_bytes(value)
                    except UnicodeEncodeError:
                        # source had surrogate escaped bytes.  What we do now
                        # is a bit of an open question.  I'm not sure this is
                        # the best choice, but it is what the old algorithm did
                        value = unquote(value, encoding='latin-1')
                    else:
                        try:
                            value = value.decode(charset, 'surrogateescape')
                        except LookupError:
                            # XXX: there should really be a custom defect for
                            # unknown character set to make it easy to find,
                            # because otherwise unknown charset is a silent
                            # failure.
                            value = value.decode('us-ascii', 'surrogateescape')
                        if utils._has_surrogates(value):
                            param.defects.append(errors.UndecodableBytesDefect())
                value_parts.append(value)
            value = ''.join(value_parts)
            yield name, value

    def __str__(self):
        """Render as ' name=value; ...', quoting each non-empty value."""
        params = []
        for name, value in self.params:
            if value:
                params.append('{}={}'.format(name, quote_string(value)))
            else:
                params.append(name)
        params = '; '.join(params)
        return ' ' + params if params else ''
| 1154 |
| 1155 |
class ParameterizedHeaderValue(TokenList):
    """Base for header values that may carry MIME parameters."""

    @property
    def params(self):
        # Scan from the end: the mime-parameters token, when present, is
        # the last significant child.
        for token in reversed(self):
            if token.token_type == 'mime-parameters':
                return token.params
        return {}

    @property
    def parts(self):
        if self and self[-1].token_type == 'mime-parameters':
            # We don't want to start a new line if all of the params don't fit
            # after the value, so unwrap the parameter list.
            return TokenList(self[:-1] + self[-1])
        return TokenList(self).parts
| 1172 |
| 1173 |
class ContentType(ParameterizedHeaderValue):
    """The parsed value of a Content-Type header."""

    token_type = 'content-type'
    # Defaults; overwritten by the parser on a successful parse.
    maintype = 'text'
    subtype = 'plain'
| 1179 |
| 1180 |
class ContentDisposition(ParameterizedHeaderValue):
    """The parsed value of a Content-Disposition header."""

    token_type = 'content-disposition'
    # Set by the parser to the disposition keyword (e.g. 'inline').
    content_disposition = None
| 1185 |
| 1186 |
class ContentTransferEncoding(TokenList):
    """The parsed value of a Content-Transfer-Encoding header."""

    token_type = 'content-transfer-encoding'
    # Default; overwritten by the parser on a successful parse.
    cte = '7bit'
| 1191 |
| 1192 |
class HeaderLabel(TokenList):
    """The 'name:' portion of a header line."""

    token_type = 'header-label'
| 1196 |
| 1197 |
class Header(TokenList):
    """A complete header: a header-label token followed by the value."""

    token_type = 'header'

    def _fold(self, folded):
        # Emit the 'name:' part verbatim; it is never folded.
        folded.append(str(self.pop(0)))
        folded.lastlen = len(folded.current[0])
        # The first line of the header is different from all others: we don't
        # want to start a new object on a new line if it has any fold points in
        # it that would allow part of it to be on the first header line.
        # Further, if the first fold point would fit on the new line, we want
        # to do that, but if it doesn't we want to put it on the first line.
        # Folded supports this via the stickyspace attribute.  If this
        # attribute is not None, it does the special handling.
        folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
        rest = self.pop(0)
        if self:
            raise ValueError("Malformed Header token list")
        rest._fold(folded)
| 1217 |
| 1218 |
| 1219 # |
| 1220 # Terminal classes and instances |
| 1221 # |
| 1222 |
class Terminal(str):
    """Base class for the leaf nodes of the parse tree.

    A Terminal is a string subclass that carries a ``token_type`` label
    and its own list of ``defects``.
    """

    def __new__(cls, value, token_type):
        inst = super(Terminal, cls).__new__(cls, value)
        inst.token_type = token_type
        inst.defects = []
        return inst

    def __repr__(self):
        text = super(Terminal, self).__repr__()
        return "{}({})".format(self.__class__.__name__, text)

    @property
    def all_defects(self):
        # Terminals have no children, so only their own defects apply.
        return list(self.defects)

    def _pp(self, indent=''):
        """Return pretty-print lines for this node (debugging aid)."""
        defects = ' {}'.format(self.defects) if self.defects else ''
        return ["{}{}/{}({}){}".format(indent,
                                       self.__class__.__name__,
                                       self.token_type,
                                       super(Terminal, self).__repr__(),
                                       defects)]

    def cte_encode(self, charset, policy):
        """Return this token's text, RFC 2047 encoded if non-ASCII."""
        value = str(self)
        try:
            value.encode('us-ascii')
        except UnicodeEncodeError:
            return _ew.encode(value, charset)
        return value

    def pop_trailing_ws(self):
        # This terminates the recursion.
        return None

    def pop_leading_fws(self):
        # This terminates the recursion.
        return None

    @property
    def comments(self):
        return []

    def has_leading_comment(self):
        return False

    def __getnewargs__(self):
        # Needed so pickling/copying preserves token_type.
        return (str(self), self.token_type)
| 1272 |
| 1273 |
class WhiteSpaceTerminal(Terminal):
    """Terminal for a run of whitespace; renders as a single space."""

    has_fws = True

    @property
    def value(self):
        # All whitespace runs normalize to one space when rendered.
        return ' '

    def startswith_fws(self):
        return True
| 1284 |
| 1285 |
class ValueTerminal(Terminal):
    """Terminal for a run of non-whitespace characters."""

    has_fws = False

    @property
    def value(self):
        return self

    def startswith_fws(self):
        return False

    def as_encoded_word(self, charset):
        """Return this token encoded as an RFC 2047 encoded word."""
        return _ew.encode(str(self), charset)
| 1299 |
| 1300 |
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
    """Whitespace between two encoded words; renders as nothing."""

    has_fws = True

    @property
    def value(self):
        return ''

    @property
    def encoded(self):
        # The original whitespace characters, kept for round-tripping.
        return self[:]

    def __str__(self):
        return ''
| 1315 |
| 1316 |
# Singleton terminals shared by the parsers below.
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees.  Maybe should have tests for that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
| 1323 |
| 1324 # |
| 1325 # Parser |
| 1326 # |
| 1327 |
| 1328 """Parse strings according to RFC822/2047/2822/5322 rules. |
| 1329 |
| 1330 This is a stateless parser. Each get_XXX function accepts a string and |
| 1331 returns either a Terminal or a TokenList representing the RFC object named |
| 1332 by the method and a string containing the remaining unparsed characters |
| 1333 from the input. Thus a parser method consumes the next syntactic construct |
| 1334 of a given type and returns a token representing the construct plus the |
| 1335 unparsed remainder of the input string. |
| 1336 |
| 1337 For example, if the first element of a structured header is a 'phrase', |
| 1338 then: |
| 1339 |
| 1340 phrase, value = get_phrase(value) |
| 1341 |
| 1342 returns the complete phrase from the start of the string value, plus any |
| 1343 characters left in the string after the phrase is removed. |
| 1344 |
| 1345 """ |
| 1346 |
# Matchers for runs of characters that do not terminate the construct being
# parsed.  The character classes are built from the *_ENDS sets; '\' and ']'
# must be escaped so they are literal inside a regex character class.  Raw
# strings are used for the escapes: '\]' in a plain string literal is an
# invalid escape sequence (DeprecationWarning; SyntaxWarning in 3.12+).
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATOM_ENDS).replace('\\', '\\\\').replace(']', r'\]'))).match
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(TOKEN_ENDS).replace('\\', '\\\\').replace(']', r'\]'))).match
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATTRIBUTE_ENDS).replace('\\', '\\\\').replace(']', r'\]'))).match
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
        '\\', '\\\\').replace(']', r'\]'))).match
| 1358 |
def _validate_xtext(xtext):
    """Register defects on *xtext* for non-printables/undecodable bytes.

    The token's defects list is mutated in place; nothing is returned.
    """
    bad_chars = _non_printable_finder(xtext)
    if bad_chars:
        xtext.defects.append(errors.NonPrintableDefect(bad_chars))
    if utils._has_surrogates(xtext):
        xtext.defects.append(errors.UndecodableBytesDefect(
            "Non-ASCII characters found in header token"))
| 1368 |
def _get_ptext_to_endchars(value, endchars):
    """Scan printables/quoted-pairs until endchars and return unquoted ptext.

    This function turns a run of qcontent, ccontent-without-comments, or
    dtext-with-quoted-printables into a single string by unquoting any
    quoted printables.  It returns the string, the remaining value, and
    a flag that is True iff there were any quoted printables decoded.

    """
    # Only scan up to the first whitespace run; whitespace is handled by the
    # callers via get_fws.  (The _3to2list unpacking is a 3to2-translation
    # artifact: fragment is the first piece, remainder the rest.)
    # NOTE(review): if value starts with whitespace, fragment is '' and the
    # for/else below would hit an unbound 'pos' — callers appear to avoid
    # that case; confirm before reusing elsewhere.
    _3to2list = list(_wsp_splitter(value, 1))
    fragment, remainder, = _3to2list[:1] + [_3to2list[1:]]
    vchars = []
    escape = False   # True when the previous char was an unescaped '\'
    had_qp = False   # True once any quoted pair has been decoded
    for pos in range(len(fragment)):
        if fragment[pos] == '\\':
            if escape:
                # '\\' quoted pair: emit a single literal backslash.
                escape = False
                had_qp = True
            else:
                escape = True
                continue
        if escape:
            # Previous char was '\'; take this char literally.
            escape = False
        elif fragment[pos] in endchars:
            break
        vchars.append(fragment[pos])
    else:
        # Fragment exhausted without hitting an endchar; advance pos so
        # the slice below is empty.
        pos = pos + 1
    return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
| 1399 |
def _decode_ew_run(value):
    """ Decode a run of RFC2047 encoded words.

    _decode_ew_run(value) -> (text, value, defects)

    Scans the supplied value for a run of tokens that look like they are RFC
    2047 encoded words, decodes those words into text according to RFC 2047
    rules (whitespace between encoded words is discarded), and returns the text
    and the remaining value (including any leading whitespace on the remaining
    value), as well as a list of any defects encountered while decoding.  The
    input value may not have any leading whitespace.

    """
    res = []
    defects = []
    last_ws = ''
    while value:
        try:
            tok, ws, value = _wsp_splitter(value, 1)
        except ValueError:
            # No more whitespace: the rest of value is a single token.
            tok, ws, value = value, '', ''
        if not (tok.startswith('=?') and tok.endswith('?=')):
            # Not an encoded word: the run ends here; restore the
            # whitespace that preceded this token.
            return ''.join(res), last_ws + tok + ws + value, defects
        text, charset, lang, new_defects = _ew.decode(tok)
        res.append(text)
        defects.extend(new_defects)
        # Whitespace between encoded words is discarded per RFC 2047,
        # but remember it in case the run ends at the next token.
        last_ws = ws
    return ''.join(res), last_ws, defects
| 1428 |
def get_fws(value):
    """FWS = 1*WSP

    This isn't the RFC definition.  We're using fws to represent tokens where
    folding can be done, but when we are parsing the *un*folding has already
    been done so we don't need to watch out for CRLF.

    """
    rest = value.lstrip()
    ws_len = len(value) - len(rest)
    return WhiteSpaceTerminal(value[:ws_len], 'fws'), rest
| 1440 |
def get_encoded_word(value):
    """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="

    Returns an EncodedWord token plus the unparsed remainder.  Raises
    HeaderParseError if *value* does not start with a decodable word.
    """
    ew = EncodedWord()
    if not value.startswith('=?'):
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    # Split off everything up to the first '?=' terminator.
    _3to2list1 = list(value[2:].split('?=', 1))
    tok, remainder, = _3to2list1[:1] + [_3to2list1[1:]]
    if tok == value[2:]:
        # No '?=' at all: not an encoded word.
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    remstr = ''.join(remainder)
    if remstr[:2].isdigit():
        # NOTE(review): presumably the first '?=' split fell inside the
        # encoded text (what follows looks like more encoded data), so
        # rejoin and split again on the next '?=' — confirm intent.
        _3to2list3 = list(remstr.split('?=', 1))
        rest, remainder, = _3to2list3[:1] + [_3to2list3[1:]]
        tok = tok + '?=' + rest
    if len(tok.split()) > 1:
        ew.defects.append(errors.InvalidHeaderDefect(
            "whitespace inside encoded word"))
    ew.cte = value
    value = ''.join(remainder)
    try:
        text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
    except ValueError:
        raise errors.HeaderParseError(
            "encoded word format invalid: '{}'".format(ew.cte))
    ew.charset = charset
    ew.lang = lang
    ew.defects.extend(defects)
    # Tokenize the decoded text into alternating fws/vtext children.
    while text:
        if text[0] in WSP:
            token, text = get_fws(text)
            ew.append(token)
            continue
        _3to2list5 = list(_wsp_splitter(text, 1))
        chars, remainder, = _3to2list5[:1] + [_3to2list5[1:]]
        vtext = ValueTerminal(chars, 'vtext')
        _validate_xtext(vtext)
        ew.append(vtext)
        text = ''.join(remainder)
    return ew, value
| 1484 |
def get_unstructured(value):
    """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
       obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
       obs-utext = %d0 / obs-NO-WS-CTL / LF / CR

    obs-NO-WS-CTL is control characters except WSP/CR/LF.

    So, basically, we have printable runs, plus control characters or nulls in
    the obsolete syntax, separated by whitespace.  Since RFC 2047 uses the
    obsolete syntax in its specification, but requires whitespace on either
    side of the encoded words, I can see no reason to need to separate the
    non-printable-non-whitespace from the printable runs if they occur, so we
    parse this into xtext tokens separated by WSP tokens.

    Because an 'unstructured' value must by definition constitute the entire
    value, this 'get' routine does not return a remaining value, only the
    parsed TokenList.

    """
    # XXX: but what about bare CR and LF?  They might signal the start or
    # end of an encoded word.  YAGNI for now, since our current parsers
    # will never send us strings with bare CR or LF.

    unstructured = UnstructuredTokenList()
    while value:
        if value[0] in WSP:
            token, value = get_fws(value)
            unstructured.append(token)
            continue
        if value.startswith('=?'):
            try:
                token, value = get_encoded_word(value)
            except errors.HeaderParseError:
                # Not a valid encoded word; fall through and treat it
                # as ordinary vtext.
                pass
            else:
                have_ws = True
                if len(unstructured) > 0:
                    if unstructured[-1].token_type != 'fws':
                        unstructured.defects.append(errors.InvalidHeaderDefect(
                            "missing whitespace before encoded word"))
                        have_ws = False
                if have_ws and len(unstructured) > 1:
                    if unstructured[-2].token_type == 'encoded-word':
                        # Whitespace between adjacent encoded words is not
                        # rendered (RFC 2047), so demote it.
                        unstructured[-1] = EWWhiteSpaceTerminal(
                            unstructured[-1], 'fws')
                unstructured.append(token)
                continue
        _3to2list7 = list(_wsp_splitter(value, 1))
        tok, remainder, = _3to2list7[:1] + [_3to2list7[1:]]
        vtext = ValueTerminal(tok, 'vtext')
        _validate_xtext(vtext)
        unstructured.append(vtext)
        value = ''.join(remainder)
    return unstructured
| 1539 |
def get_qp_ctext(value):
    """ctext = <printable ascii except backslash, '(' and ')'>

    Not the RFC ctext: nested comments are handled by get_comment, and
    quoted pairs are decoded here.  Everything up to the next unescaped
    parenthesis is consumed; ASCII outside the RFC printable range gets a
    NonPrintableDefect on the token.  The result is a 'ptext'
    WhiteSpaceTerminal, so its rendered value is a single space.
    """
    text, remainder, _ = _get_ptext_to_endchars(value, '()')
    token = WhiteSpaceTerminal(text, 'ptext')
    _validate_xtext(token)
    return token, remainder
| 1556 |
def get_qcontent(value):
    """qcontent = qtext / quoted-pair

    Consume everything up to the next DQUOTE, decoding quoted pairs along
    the way.  ASCII outside the RFC printable range gets a
    NonPrintableDefect on the token.  The result is a 'ptext'
    ValueTerminal.
    """
    text, remainder, _ = _get_ptext_to_endchars(value, '"')
    token = ValueTerminal(text, 'ptext')
    _validate_xtext(token)
    return token, remainder
| 1571 |
def get_atext(value):
    """atext = <matches _non_atom_end_matcher>

    We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
    the token's defects list if we find non-atext characters.
    """
    match = _non_atom_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected atext but found '{}'".format(value))
    atext = ValueTerminal(match.group(), 'atext')
    _validate_xtext(atext)
    return atext, value[len(atext):]
| 1587 |
def get_bare_quoted_string(value):
    """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE

    A quoted-string without the leading or trailing white space.  Its
    value is the text between the quote marks, with whitespace
    preserved and quoted pairs decoded.
    """
    if value[0] != '"':
        raise errors.HeaderParseError(
            "expected '\"' but found '{}'".format(value))
    bare_quoted_string = BareQuotedString()
    value = value[1:]
    # Alternate fws and qcontent children until the closing quote.
    while value and value[0] != '"':
        if value[0] in WSP:
            token, value = get_fws(value)
        else:
            token, value = get_qcontent(value)
        bare_quoted_string.append(token)
    if not value:
        # Truncated input: record the defect and return what we have.
        bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
            "end of header inside quoted string"))
        return bare_quoted_string, value
    return bare_quoted_string, value[1:]
| 1611 |
def get_comment(value):
    """comment = "(" *([FWS] ccontent) [FWS] ")"
       ccontent = ctext / quoted-pair / comment

    We handle nested comments here, and quoted-pair in our qp-ctext routine.
    """
    if value and value[0] != '(':
        raise errors.HeaderParseError(
            "expected '(' but found '{}'".format(value))
    comment = Comment()
    value = value[1:]
    while value and value[0] != ")":
        if value[0] in WSP:
            token, value = get_fws(value)
        elif value[0] == '(':
            # Nested comment: recurse.
            token, value = get_comment(value)
        else:
            token, value = get_qp_ctext(value)
        comment.append(token)
    if not value:
        # Truncated input: record the defect and return what we have.
        comment.defects.append(errors.InvalidHeaderDefect(
            "end of header inside comment"))
        return comment, value
    return comment, value[1:]
| 1636 |
def get_cfws(value):
    """CFWS = (1*([FWS] comment) [FWS]) / FWS

    Collect any mix of folding white space and comments into a CFWSList.
    """
    cfws = CFWSList()
    while value and value[0] in CFWS_LEADER:
        # Whitespace and comments may alternate freely.
        getter = get_fws if value[0] in WSP else get_comment
        token, value = getter(value)
        cfws.append(token)
    return cfws, value
| 1649 |
def get_quoted_string(value):
    """quoted-string = [CFWS] <bare-quoted-string> [CFWS]

    'bare-quoted-string' is an intermediate class defined by this
    parser and not by the RFC grammar.  It is the quoted string
    without any attached CFWS.
    """
    quoted_string = QuotedString()

    def attach_cfws(val):
        # Append a CFWS child if one is present, returning the remainder.
        if val and val[0] in CFWS_LEADER:
            tok, val = get_cfws(val)
            quoted_string.append(tok)
        return val

    value = attach_cfws(value)
    token, value = get_bare_quoted_string(value)
    quoted_string.append(token)
    value = attach_cfws(value)
    return quoted_string, value
| 1667 |
def get_atom(value):
    """atom = [CFWS] 1*atext [CFWS]

    Returns an Atom token (leading/trailing CFWS attached) plus the
    unparsed remainder.
    """
    atom = Atom()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        atom.append(token)
    if value and value[0] in ATOM_ENDS:
        raise errors.HeaderParseError(
            "expected atom but found '{}'".format(value))
    token, value = get_atext(value)
    atom.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        atom.append(token)
    return atom, value
| 1685 |
def get_dot_atom_text(value):
    """ dot-text = 1*atext *("." 1*atext)

    Returns a DotAtomText token plus the unparsed remainder.  Raises
    HeaderParseError if the text starts or ends with a dot.
    """
    dot_atom_text = DotAtomText()
    if not value or value[0] in ATOM_ENDS:
        raise errors.HeaderParseError("expected atom at a start of "
            "dot-atom-text but found '{}'".format(value))
    while value and value[0] not in ATOM_ENDS:
        token, value = get_atext(value)
        dot_atom_text.append(token)
        if value and value[0] == '.':
            dot_atom_text.append(DOT)
            value = value[1:]
    # A trailing DOT means the loop ended right after a dot with no atext
    # following it.
    if dot_atom_text[-1] is DOT:
        raise errors.HeaderParseError("expected atom at end of dot-atom-text "
            "but found '{}'".format('.'+value))
    return dot_atom_text, value
| 1704 |
def get_dot_atom(value):
    """ dot-atom = [CFWS] dot-atom-text [CFWS]

    Returns a DotAtom token plus the unparsed remainder.
    """
    dot_atom = DotAtom()
    # NOTE(review): assumes value is non-empty (no 'value and' guard on
    # the first test); callers appear to check before calling — confirm.
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        dot_atom.append(token)
    token, value = get_dot_atom_text(value)
    dot_atom.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        dot_atom.append(token)
    return dot_atom, value
| 1719 |
def get_word(value):
    """word = atom / quoted-string

    Either atom or quoted-string may start with CFWS.  We have to peel off this
    CFWS first to determine which type of word to parse.  Afterward we splice
    the leading CFWS, if any, into the parsed sub-token.

    If neither an atom or a quoted-string is found before the next special, a
    HeaderParseError is raised.

    The token returned is either an Atom or a QuotedString, as appropriate.
    This means the 'word' level of the formal grammar is not represented in the
    parse tree; this is because having that extra layer when manipulating the
    parse tree is more confusing than it is helpful.

    """
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    else:
        leader = None
    if value[0]=='"':
        token, value = get_quoted_string(value)
    elif value[0] in SPECIALS:
        raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
                                      "but found '{}'".format(value))
    else:
        token, value = get_atom(value)
    # Splice the leading CFWS back onto the front of the parsed token.
    if leader is not None:
        token[:0] = [leader]
    return token, value
| 1750 |
def get_phrase(value):
    """ phrase = 1*word / obs-phrase
        obs-phrase = word *(word / "." / CFWS)

    This means a phrase can be a sequence of words, periods, and CFWS in any
    order as long as it starts with at least one word.  If anything other than
    words is detected, an ObsoleteHeaderDefect is added to the token's defect
    list.  We also accept a phrase that starts with CFWS followed by a dot;
    this is registered as an InvalidHeaderDefect, since it is not supported by
    even the obsolete grammar.

    """
    phrase = Phrase()
    try:
        token, value = get_word(value)
        phrase.append(token)
    except errors.HeaderParseError:
        # Keep parsing anyway; the defect records the invalid start.
        phrase.defects.append(errors.InvalidHeaderDefect(
            "phrase does not start with word"))
    while value and value[0] not in PHRASE_ENDS:
        if value[0]=='.':
            # Bare dots are only valid under the obsolete grammar.
            phrase.append(DOT)
            phrase.defects.append(errors.ObsoleteHeaderDefect(
                "period in 'phrase'"))
            value = value[1:]
        else:
            try:
                token, value = get_word(value)
            except errors.HeaderParseError:
                if value[0] in CFWS_LEADER:
                    # CFWS without a following word: obsolete syntax.
                    token, value = get_cfws(value)
                    phrase.defects.append(errors.ObsoleteHeaderDefect(
                        "comment found without atom"))
                else:
                    raise
            phrase.append(token)
    return phrase, value
| 1788 |
def get_local_part(value):
    """ local-part = dot-atom / quoted-string / obs-local-part

    Returns a LocalPart token plus the unparsed remainder, registering
    defects for obsolete or invalid forms and non-ASCII content.
    """
    local_part = LocalPart()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected local-part but found '{}'".format(value))
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        try:
            token, value = get_word(value)
        except errors.HeaderParseError:
            if value[0] != '\\' and value[0] in PHRASE_ENDS:
                raise
            # Unparseable start: fall through to the obs-local-part pass
            # below with an empty token.
            token = TokenList()
    if leader is not None:
        token[:0] = [leader]
    local_part.append(token)
    # If anything that can continue an obs-local-part follows, re-parse
    # the whole thing (rendered so far plus the rest) as obs-local-part.
    if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        obs_local_part, value = get_obs_local_part(str(local_part) + value)
        if obs_local_part.token_type == 'invalid-obs-local-part':
            local_part.defects.append(errors.InvalidHeaderDefect(
                "local-part is not dot-atom, quoted-string, or obs-local-part"))
        else:
            local_part.defects.append(errors.ObsoleteHeaderDefect(
                "local-part is not a dot-atom (contains CFWS)"))
        local_part[0] = obs_local_part
    try:
        local_part.value.encode('ascii')
    except UnicodeEncodeError:
        # NOTE(review): the stray ')' in this message is in the original
        # source; left as-is since callers/tests may match it.
        local_part.defects.append(errors.NonASCIILocalPartDefect(
                "local-part contains non-ASCII characters)"))
    return local_part, value
| 1827 |
def get_obs_local_part(value):
    """ obs-local-part = word *("." word)

    Accepts misplaced backslashes and CFWS as well, recording defects;
    if any defect is found the token type becomes 'invalid-obs-local-part'.
    """
    obs_local_part = ObsLocalPart()
    last_non_ws_was_dot = False
    while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        if value[0] == '.':
            if last_non_ws_was_dot:
                obs_local_part.defects.append(errors.InvalidHeaderDefect(
                    "invalid repeated '.'"))
            obs_local_part.append(DOT)
            last_non_ws_was_dot = True
            value = value[1:]
            continue
        elif value[0]=='\\':
            # A backslash outside a quoted-string/comment is invalid but
            # tolerated.
            obs_local_part.append(ValueTerminal(value[0],
                                                'misplaced-special'))
            value = value[1:]
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "'\\' character outside of quoted-string/ccontent"))
            last_non_ws_was_dot = False
            continue
        if obs_local_part and obs_local_part[-1].token_type != 'dot':
            obs_local_part.defects.append(errors.InvalidHeaderDefect(
                "missing '.' between words"))
        try:
            token, value = get_word(value)
            last_non_ws_was_dot = False
        except errors.HeaderParseError:
            if value[0] not in CFWS_LEADER:
                raise
            token, value = get_cfws(value)
        obs_local_part.append(token)
    # Leading/trailing dot checks; note 'and' binds tighter than 'or'
    # in these conditions.
    if (obs_local_part[0].token_type == 'dot' or
            obs_local_part[0].token_type=='cfws' and
            obs_local_part[1].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid leading '.' in local part"))
    if (obs_local_part[-1].token_type == 'dot' or
            obs_local_part[-1].token_type=='cfws' and
            obs_local_part[-2].token_type=='dot'):
        obs_local_part.defects.append(errors.InvalidHeaderDefect(
            "Invalid trailing '.' in local part"))
    if obs_local_part.defects:
        obs_local_part.token_type = 'invalid-obs-local-part'
    return obs_local_part, value
| 1874 |
def get_dtext(value):
    """ dtext = <printable ascii except backslash, '[' and ']'> / obs-dtext
        obs-dtext = obs-NO-WS-CTL / quoted-pair

    Consume text up to the next '[' or ']'.  Anything outside the RFC's
    printable-ASCII set is tolerated but registers a NonPrintableDefect via
    _validate_xtext.  Quoted pairs are unquoted by the underlying scanner,
    so the result is a single ptext ValueTerminal; if any quoted pairs were
    present an ObsoleteHeaderDefect is attached to it.
    """
    text, value, had_qp = _get_ptext_to_endchars(value, '[]')
    token = ValueTerminal(text, 'ptext')
    if had_qp:
        token.defects.append(errors.ObsoleteHeaderDefect(
            "quoted printable found in domain-literal"))
    _validate_xtext(token)
    return token, value
| 1894 |
| 1895 def _check_for_early_dl_end(value, domain_literal): |
| 1896 if value: |
| 1897 return False |
| 1898 domain_literal.append(errors.InvalidHeaderDefect( |
| 1899 "end of input inside domain-literal")) |
| 1900 domain_literal.append(ValueTerminal(']', 'domain-literal-end')) |
| 1901 return True |
| 1902 |
def get_domain_literal(value):
    """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]

    Parse a bracketed domain literal, returning (DomainLiteral, rest).
    A truncated literal is tolerated: _check_for_early_dl_end closes the
    token and records a defect instead of failing outright.
    """
    dl = DomainLiteral()
    # Optional leading comments/whitespace.
    if value[0] in CFWS_LEADER:
        tok, value = get_cfws(value)
        dl.append(tok)
    if not value:
        raise errors.HeaderParseError("expected domain-literal")
    if value[0] != '[':
        raise errors.HeaderParseError("expected '[' at start of domain-literal "
                                      "but found '{}'".format(value))
    value = value[1:]
    if _check_for_early_dl_end(value, dl):
        return dl, value
    dl.append(ValueTerminal('[', 'domain-literal-start'))
    # Optional folding whitespace before the dtext.
    if value[0] in WSP:
        tok, value = get_fws(value)
        dl.append(tok)
    tok, value = get_dtext(value)
    dl.append(tok)
    if _check_for_early_dl_end(value, dl):
        return dl, value
    # Optional folding whitespace before the closing bracket.
    if value[0] in WSP:
        tok, value = get_fws(value)
        dl.append(tok)
    if _check_for_early_dl_end(value, dl):
        return dl, value
    if value[0] != ']':
        raise errors.HeaderParseError("expected ']' at end of domain-literal "
                                      "but found '{}'".format(value))
    dl.append(ValueTerminal(']', 'domain-literal-end'))
    value = value[1:]
    # Optional trailing comments/whitespace.
    if value and value[0] in CFWS_LEADER:
        tok, value = get_cfws(value)
        dl.append(tok)
    return dl, value
| 1941 |
def get_domain(value):
    """ domain = dot-atom / domain-literal / obs-domain
        obs-domain = atom *("." atom))

    Parse the domain part of an addr-spec, returning (Domain, rest).  A
    domain that continues past a dot-atom with further '.'-separated atoms
    is accepted as the obsolete form and flagged with an
    ObsoleteHeaderDefect.
    """
    domain = Domain()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected domain but found '{}'".format(value))
    # Bracketed form: hand off to the domain-literal parser.
    if value[0] == '[':
        tok, value = get_domain_literal(value)
        if leader is not None:
            tok[:0] = [leader]
        domain.append(tok)
        return domain, value
    try:
        tok, value = get_dot_atom(value)
    except errors.HeaderParseError:
        tok, value = get_atom(value)
    if leader is not None:
        tok[:0] = [leader]
    domain.append(tok)
    # A '.' after the (dot-)atom means obs-domain: flatten and keep eating
    # '.' atom pairs.
    if value and value[0] == '.':
        domain.defects.append(errors.ObsoleteHeaderDefect(
            "domain is not a dot-atom (contains CFWS)"))
        if domain[0].token_type == 'dot-atom':
            domain[:] = domain[0]
        while value and value[0] == '.':
            domain.append(DOT)
            tok, value = get_atom(value[1:])
            domain.append(tok)
    return domain, value
| 1977 |
def get_addr_spec(value):
    """ addr-spec = local-part "@" domain

    Parse a complete address specification, returning (AddrSpec, rest).
    If no '@' follows the local part, an InvalidHeaderDefect is recorded
    and the partial token is returned rather than raising.
    """
    addr_spec = AddrSpec()
    token, value = get_local_part(value)
    addr_spec.append(token)
    if not value or value[0] != '@':
        # Fixed typo in the defect message: "add-spec" -> "addr-spec".
        addr_spec.defects.append(errors.InvalidHeaderDefect(
            "addr-spec local part with no domain"))
        return addr_spec, value
    addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
    token, value = get_domain(value[1:])
    addr_spec.append(token)
    return addr_spec, value
| 1993 |
def get_obs_route(value):
    """ obs-route = obs-domain-list ":"
        obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])

    Returns an obs-route token with the appropriate sub-tokens (that is,
    there is no obs-domain-list in the parse tree).

    Raises HeaderParseError if the input does not contain an obs-route,
    or ends before the terminating ':'.
    """
    obs_route = ObsRoute()
    # Leading CFWS and null (empty) domain-list members.
    while value and (value[0]==',' or value[0] in CFWS_LEADER):
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        elif value[0] == ',':
            obs_route.append(ListSeparator)
            value = value[1:]
    if not value or value[0] != '@':
        raise errors.HeaderParseError(
            "expected obs-route domain but found '{}'".format(value))
    obs_route.append(RouteComponentMarker)
    token, value = get_domain(value[1:])
    obs_route.append(token)
    # Subsequent (possibly null) "@" domain members.
    while value and value[0]==',':
        obs_route.append(ListSeparator)
        value = value[1:]
        if not value:
            break
        if value[0] in CFWS_LEADER:
            token, value = get_cfws(value)
            obs_route.append(token)
        # The 'value and' guard is needed: get_cfws above may have consumed
        # the rest of the input, and indexing would raise IndexError; the
        # end-of-header HeaderParseError below handles that case instead.
        if value and value[0] == '@':
            obs_route.append(RouteComponentMarker)
            token, value = get_domain(value[1:])
            obs_route.append(token)
    if not value:
        raise errors.HeaderParseError("end of header while parsing obs-route")
    if value[0] != ':':
        raise errors.HeaderParseError( "expected ':' marking end of "
            "obs-route but found '{}'".format(value))
    obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
    return obs_route, value[1:]
| 2034 |
def get_angle_addr(value):
    """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
        obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]

    Parse an angle-bracketed address, returning (AngleAddr, rest).  The
    SMTP null address '<>' is accepted with an InvalidHeaderDefect, and a
    missing trailing '>' is repaired with a defect rather than an error.
    """
    angle_addr = AngleAddr()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    if not value or value[0] != '<':
        raise errors.HeaderParseError(
            "expected angle-addr but found '{}'".format(value))
    angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
    value = value[1:]
    if not value:
        # Input ended right after '<'; previously this fell through to
        # value[0] and raised IndexError.  Report a parse error so callers
        # that catch HeaderParseError can recover.
        raise errors.HeaderParseError(
            "expected addr-spec or obs-route but found end of input")
    # Although it is not legal per RFC5322, SMTP uses '<>' in certain
    # circumstances.
    if value[0] == '>':
        angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "null addr-spec in angle-addr"))
        value = value[1:]
        return angle_addr, value
    try:
        token, value = get_addr_spec(value)
    except errors.HeaderParseError:
        try:
            token, value = get_obs_route(value)
            angle_addr.defects.append(errors.ObsoleteHeaderDefect(
                "obsolete route specification in angle-addr"))
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected addr-spec or obs-route but found '{}'".format(value))
        angle_addr.append(token)
        token, value = get_addr_spec(value)
    angle_addr.append(token)
    if value and value[0] == '>':
        value = value[1:]
    else:
        angle_addr.defects.append(errors.InvalidHeaderDefect(
            "missing trailing '>' on angle-addr"))
    angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        angle_addr.append(token)
    return angle_addr, value
| 2080 |
def get_display_name(value):
    """ display-name = phrase

    Because display-name is just a renaming of the phrase rule, the
    returned DisplayName token holds the phrase's *contents* directly
    (and a copy of its defects) rather than wrapping the phrase token.
    """
    display_name = DisplayName()
    phrase, value = get_phrase(value)
    display_name.extend(phrase)
    display_name.defects = phrase.defects[:]
    return display_name, value
| 2094 |
| 2095 |
def get_name_addr(value):
    """ name-addr = [display-name] angle-addr

    Parse an optionally display-named angle address, returning
    (NameAddr, rest).  Any CFWS before the display name (or, when there is
    no display name, before the angle-addr) is attached to the token that
    follows it.
    """
    name_addr = NameAddr()
    # Both the optional display name and the angle-addr can start with cfws.
    leader = None
    if not value:
        # Guard against IndexError on empty input.
        raise errors.HeaderParseError(
            "expected name-addr but found '{}'".format(value))
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(leader))
    if value[0] != '<':
        if value[0] in PHRASE_ENDS:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(value))
        token, value = get_display_name(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(token))
        if leader is not None:
            # Attach the leading cfws inside the first display-name element
            # when that element is itself a TokenList; a Terminal (a str
            # subclass) does not support slice assignment, so in that case
            # insert the leader before it instead.
            if isinstance(token[0], TokenList):
                token[0][:0] = [leader]
            else:
                token[:0] = [leader]
            leader = None
        name_addr.append(token)
    token, value = get_angle_addr(value)
    if leader is not None:
        token[:0] = [leader]
    name_addr.append(token)
    return name_addr, value
| 2125 |
def get_mailbox(value):
    """ mailbox = name-addr / addr-spec

    The two alternatives can only be distinguished by attempting each
    parse in turn: name-addr first, then addr-spec.  If the winning token
    carries any InvalidHeaderDefect, the mailbox is downgraded to an
    'invalid-mailbox'.
    """
    mailbox = Mailbox()
    try:
        token, value = get_name_addr(value)
    except errors.HeaderParseError:
        try:
            token, value = get_addr_spec(value)
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected mailbox but found '{}'".format(value))
    has_invalid = any(isinstance(defect, errors.InvalidHeaderDefect)
                      for defect in token.all_defects)
    if has_invalid:
        mailbox.token_type = 'invalid-mailbox'
    mailbox.append(token)
    return mailbox, value
| 2146 |
def get_invalid_mailbox(value, endchars):
    """ Read everything up to one of the chars in endchars.

    This is outside the formal grammar: it sweeps up unparseable input so
    the caller can continue after the next end character.  The returned
    InvalidMailbox acts like a Mailbox, but its data attributes are None.
    """
    invalid_mailbox = InvalidMailbox()
    while value:
        if value[0] in endchars:
            break
        if value[0] in PHRASE_ENDS:
            # A special character where it doesn't belong; keep it as-is.
            invalid_mailbox.append(ValueTerminal(value[0],
                                                'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            invalid_mailbox.append(token)
    return invalid_mailbox, value
| 2164 |
def get_mailbox_list(value):
    """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
        obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])

    For this routine we go outside the formal grammar in order to improve
    error handling.  We recognize the end of the mailbox list only at the end
    of the value or at a ';' (the group terminator).  This is so that we can
    turn invalid mailboxes into InvalidMailbox tokens and continue parsing any
    remaining valid mailboxes.  We also allow all mailbox entries to be null,
    and this condition is handled appropriately at a higher level.

    Returns (MailboxList, remaining_value); remaining_value is either empty
    or starts with ';'.
    """
    mailbox_list = MailboxList()
    while value and value[0] != ';':
        try:
            token, value = get_mailbox(value)
            mailbox_list.append(token)
        except errors.HeaderParseError:
            # Error recovery: figure out whether this entry is merely empty
            # (obsolete but harmless) or genuinely invalid.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] in ',;':
                    # CFWS-only entry: keep the whitespace, note obs syntax.
                    mailbox_list.append(leader)
                    mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                        "empty element in mailbox-list"))
                else:
                    # Garbage after the CFWS: sweep it into an InvalidMailbox.
                    token, value = get_invalid_mailbox(value, ',;')
                    if leader is not None:
                        token[:0] = [leader]
                    mailbox_list.append(token)
                    mailbox_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid mailbox in mailbox-list"))
            elif value[0] == ',':
                # Null entry with no CFWS at all.
                mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in mailbox-list"))
            else:
                # Unparseable entry: sweep it into an InvalidMailbox.
                token, value = get_invalid_mailbox(value, ',;')
                if leader is not None:
                    token[:0] = [leader]
                mailbox_list.append(token)
                mailbox_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid mailbox in mailbox-list"))
        if value and value[0] not in ',;':
            # Crap after mailbox; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = mailbox_list[-1]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',;')
            mailbox.extend(token)
            mailbox_list.defects.append(errors.InvalidHeaderDefect(
                "invalid mailbox in mailbox-list"))
        if value and value[0] == ',':
            mailbox_list.append(ListSeparator)
            value = value[1:]
    return mailbox_list, value
| 2220 |
| 2221 |
def get_group_list(value):
    """ group-list = mailbox-list / CFWS / obs-group-list
        obs-group-list = 1*([CFWS] ",") [CFWS]

    Parse the contents of a group (between ':' and ';'), returning
    (GroupList, rest).  A completely empty or CFWS-only body is accepted
    with a defect; a mailbox-list with only null entries is tagged as the
    obsolete form.
    """
    group_list = GroupList()
    if not value:
        group_list.defects.append(errors.InvalidHeaderDefect(
            "end of header before group-list"))
        return group_list, value
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            # This should never happen in email parsing, since CFWS-only is a
            # legal alternative to group-list in a group, which is the only
            # place group-list appears.
            group_list.defects.append(errors.InvalidHeaderDefect(
                "end of header in group-list"))
            group_list.append(leader)
            return group_list, value
    if value[0] == ';':
        # CFWS-only group body; the terminator stays for the caller.
        group_list.append(leader)
        return group_list, value
    token, value = get_mailbox_list(value)
    if token.all_mailboxes:
        if leader is not None:
            token[:0] = [leader]
        group_list.append(token)
        return group_list, value
    # Only null entries were found: obsolete group-list syntax.
    if leader is not None:
        group_list.append(leader)
    group_list.extend(token)
    group_list.defects.append(errors.ObsoleteHeaderDefect(
        "group-list with empty entries"))
    return group_list, value
| 2258 |
def get_group(value):
    """ group = display-name ":" [group-list] ";" [CFWS]

    Parse an address group, returning (Group, rest).  A group whose header
    ends before the closing ';' is accepted with an InvalidHeaderDefect.
    """
    group = Group()
    token, value = get_display_name(value)
    if not value or value[0] != ':':
        raise errors.HeaderParseError("expected ':' at end of group "
                                      "display name but found '{}'".format(value))
    group.append(token)
    group.append(ValueTerminal(':', 'group-display-name-terminator'))
    value = value[1:]
    if value and value[0] == ';':
        # Empty group: "name:;".
        group.append(ValueTerminal(';', 'group-terminator'))
        return group, value[1:]
    token, value = get_group_list(value)
    group.append(token)
    # The if/elif/else structure is required here: the original fell
    # through from the end-of-header case to value[0] and raised
    # IndexError on input such as 'name: a@b.example'.
    if not value:
        group.defects.append(errors.InvalidHeaderDefect(
            "end of header in group"))
    elif value[0] != ';':
        raise errors.HeaderParseError(
            "expected ';' at end of group but found {}".format(value))
    else:
        group.append(ValueTerminal(';', 'group-terminator'))
        value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        group.append(token)
    return group, value
| 2288 |
def get_address(value):
    """ address = mailbox / group

    Note that counter-intuitively, an address can be either a single
    address or a list of addresses (a group).  This is why the returned
    Address object has a 'mailboxes' attribute which treats a single
    address as a list of length one.  When you need to differentiate
    between the two cases, extract the single element, which is either a
    mailbox or a group token.
    """
    # The formal grammar isn't very helpful when parsing an address:
    # mailbox and group, especially when allowing for obsolete forms,
    # start off very similarly, and it is only when you reach one of @,
    # <, or : that you know what you've got.  So we simply try each
    # parser in turn, most likely first.
    address = Address()
    for parse in (get_group, get_mailbox):
        try:
            token, value = parse(value)
            break
        except errors.HeaderParseError:
            continue
    else:
        raise errors.HeaderParseError(
            "expected address but found '{}'".format(value))
    address.append(token)
    return address, value
| 2317 |
def get_address_list(value):
    """ address_list = (address *("," address)) / obs-addr-list
        obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])

    We depart from the formal grammar here by continuing to parse until the end
    of the input, assuming the input to be entirely composed of an
    address-list.  This is always true in email parsing, and allows us
    to skip invalid addresses to parse additional valid ones.

    Returns (AddressList, remaining_value); remaining_value is normally ''.
    """
    address_list = AddressList()
    while value:
        try:
            token, value = get_address(value)
            address_list.append(token)
        # The exception was previously bound 'as err' but never used;
        # the unused binding has been removed.
        except errors.HeaderParseError:
            # Error recovery: classify the entry as empty (obsolete) or
            # invalid, mirroring get_mailbox_list.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] == ',':
                    # CFWS-only entry.
                    address_list.append(leader)
                    address_list.defects.append(errors.ObsoleteHeaderDefect(
                        "address-list entry with no content"))
                else:
                    # Garbage after the CFWS: sweep into an invalid mailbox.
                    token, value = get_invalid_mailbox(value, ',')
                    if leader is not None:
                        token[:0] = [leader]
                    address_list.append(Address([token]))
                    address_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid address in address-list"))
            elif value[0] == ',':
                # Null entry with no CFWS at all.
                address_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in address-list"))
            else:
                # Unparseable entry: sweep into an invalid mailbox.
                token, value = get_invalid_mailbox(value, ',')
                if leader is not None:
                    token[:0] = [leader]
                address_list.append(Address([token]))
                address_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid address in address-list"))
        if value and value[0] != ',':
            # Crap after address; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = address_list[-1][0]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',')
            mailbox.extend(token)
            address_list.defects.append(errors.InvalidHeaderDefect(
                "invalid address in address-list"))
        if value:  # Must be a , at this point.
            address_list.append(ValueTerminal(',', 'list-separator'))
            value = value[1:]
    return address_list, value
| 2371 |
| 2372 # |
| 2373 # XXX: As I begin to add additional header parsers, I'm realizing we probably |
| 2374 # have two level of parser routines: the get_XXX methods that get a token in |
| 2375 # the grammar, and parse_XXX methods that parse an entire field value. So |
| 2376 # get_address_list above should really be a parse_ method, as probably should |
| 2377 # be get_unstructured. |
| 2378 # |
| 2379 |
def _get_mime_version_number(value, mime_version, kind, stop_on_dot):
    """Scan one MIME version number component off the front of value.

    kind is 'major' or 'minor'; it names the MIMEVersion attribute to set
    and appears in the defect message.  The digit run ends at CFWS and,
    when stop_on_dot is true, at the '.' separator (only the major number
    stops on '.').  A non-numeric run is kept as an 'xtext' terminal with
    an InvalidHeaderDefect.  Returns the remaining value.
    """
    digits = ''
    while value and value[0] not in CFWS_LEADER and not (
            stop_on_dot and value[0] == '.'):
        digits += value[0]
        value = value[1:]
    if digits.isdigit():
        setattr(mime_version, kind, int(digits))
        mime_version.append(ValueTerminal(digits, 'digits'))
    else:
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME {} version number but found {!r}".format(
                kind, digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    return value

def parse_mime_version(value):
    """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]

    Parse a MIME-Version header value, returning a MIMEVersion token whose
    'major' and 'minor' attributes are ints (or None where the number was
    missing or malformed, in which case a defect is recorded).
    """
    # The [CFWS] is implicit in the RFC 2045 BNF.  The duplicated digit
    # scans for major and minor are factored into _get_mime_version_number.
    mime_version = MIMEVersion()
    if not value:
        mime_version.defects.append(errors.HeaderMissingRequiredValue(
            "Missing MIME version number (eg: 1.0)"))
        return mime_version
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
        if not value:
            mime_version.defects.append(errors.HeaderMissingRequiredValue(
                "Expected MIME version number but found only CFWS"))
    value = _get_mime_version_number(value, mime_version, 'major', True)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value or value[0] != '.':
        # No '.' separator: record the truncation (when a major number was
        # found) and keep any trailing junk as xtext.
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        if value:
            mime_version.append(ValueTerminal(value, 'xtext'))
        return mime_version
    mime_version.append(ValueTerminal('.', 'version-separator'))
    value = value[1:]
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if not value:
        if mime_version.major is not None:
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        return mime_version
    value = _get_mime_version_number(value, mime_version, 'minor', False)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if value:
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Excess non-CFWS text after MIME version"))
        mime_version.append(ValueTerminal(value, 'xtext'))
    return mime_version
| 2447 |
def get_invalid_parameter(value):
    """ Read everything up to the next ';'.

    This is outside the formal grammar: it sweeps up an unparseable
    parameter so the caller can continue after the next ';'.  The
    returned InvalidParameter acts like a Parameter, but its data
    attributes are None.
    """
    invalid_parameter = InvalidParameter()
    while value:
        if value[0] == ';':
            break
        if value[0] in PHRASE_ENDS:
            # A special character where it doesn't belong; keep it as-is.
            invalid_parameter.append(ValueTerminal(value[0],
                                                   'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            invalid_parameter.append(token)
    return invalid_parameter, value
| 2465 |
def get_ttext(value):
    """ttext = <matches _ttext_matcher>

    Take the longest leading run of non-TOKEN_ENDS characters as a ttext
    ValueTerminal.  Characters outside the RFC's printable set are
    tolerated but register defects via _validate_xtext, following the
    spirit of RFC 5322.
    """
    match = _non_token_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected ttext but found '{}'".format(value))
    text = match.group()
    token = ValueTerminal(text, 'ttext')
    _validate_xtext(token)
    return token, value[len(text):]
| 2484 |
def get_token(value):
    """token = [CFWS] 1*ttext [CFWS]

    The RFC equivalent of ttext is any US-ASCII chars except space, ctls,
    or tspecials; we also exclude tabs even though the RFC doesn't.  The
    RFC implies the surrounding CFWS but is not explicit about it in the
    BNF, so we make it explicit here.
    """
    mtoken = Token()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        mtoken.append(cfws)
    if value and value[0] in TOKEN_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    ttext, value = get_ttext(value)
    mtoken.append(ttext)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        mtoken.append(cfws)
    return mtoken, value
| 2507 |
def get_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character)

    Take the longest leading run of non-ATTRIBUTE_ENDS characters as an
    attrtext ValueTerminal.  Characters outside the RFC's printable set
    are tolerated but register defects via _validate_xtext, following the
    spirit of RFC 5322.
    """
    match = _non_attribute_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected attrtext but found {!r}".format(value))
    text = match.group()
    token = ValueTerminal(text, 'attrtext')
    _validate_xtext(token)
    return token, value[len(text):]
| 2526 |
def get_attribute(value):
    """ [CFWS] 1*attrtext [CFWS]

    This version of the BNF makes the CFWS explicit, and as usual we use a
    value terminal for the actual run of characters.  The RFC equivalent
    of attrtext is the token characters, minus '*', "'", and '%'; tab is
    excluded too, as for token.
    """
    attribute = Attribute()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    if value and value[0] in ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    attrtext, value = get_attrtext(value)
    attribute.append(attrtext)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    return attribute, value
| 2549 |
def get_extended_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')

    A special parsing routine that additionally accepts '%', so an
    RFC 2231 percent-encoded value is captured as one string and can be
    decoded in a single step later.
    """
    match = _non_extended_attribute_end_matcher(value)
    if not match:
        raise errors.HeaderParseError(
            "expected extended attrtext but found {!r}".format(value))
    text = match.group()
    token = ValueTerminal(text, 'extended-attrtext')
    _validate_xtext(token)
    return token, value[len(text):]
| 2567 |
def get_extended_attribute(value):
    """ [CFWS] 1*extended_attrtext [CFWS]

    Like get_attribute, except '%' is allowed in the text so a
    percent-encoded value is picked up as a single string.
    """
    # XXX: should we have an ExtendedAttribute TokenList?
    attribute = Attribute()
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    attrtext, value = get_extended_attrtext(value)
    attribute.append(attrtext)
    if value and value[0] in CFWS_LEADER:
        cfws, value = get_cfws(value)
        attribute.append(cfws)
    return attribute, value
| 2589 |
def get_section(value):
    """ '*' digits

    Parse an RFC 2231 section marker ('*' followed by digits), returning
    (Section, rest) with the Section's 'number' attribute set.  The formal
    BNF is more complicated because leading 0s are not allowed; we check
    for that and add a defect.  We also assume no CFWS is allowed between
    the '*' and the digits, though the RFC is not crystal clear on that.
    The caller should already have dealt with leading CFWS.
    """
    section = Section()
    if not value or value[0] != '*':
        raise errors.HeaderParseError("Expected section but found {}".format(
            value))
    section.append(ValueTerminal('*', 'section-marker'))
    value = value[1:]
    if not value or not value[0].isdigit():
        raise errors.HeaderParseError("Expected section number but "
                                      "found {}".format(value))
    digits = ''
    while value and value[0].isdigit():
        digits += value[0]
        value = value[1:]
    if digits[0] == '0' and digits != '0':
        # Use InvalidHeaderDefect here: the original referenced
        # errors.InvalidHeaderError, which does not exist in email.errors
        # and would have raised AttributeError when this branch fired.
        # Also fixed the missing space in the split message literal.
        section.defects.append(errors.InvalidHeaderDefect(
            "section number has an invalid leading 0"))
    section.number = int(digits)
    section.append(ValueTerminal(digits, 'digits'))
    return section, value
| 2618 |
| 2619 |
def get_value(value):
    """ quoted-string / attribute

    Parse a parameter value: a quoted-string when it starts with '"',
    otherwise an extended attribute (which may contain '%' escapes).
    Leading CFWS is attached to the front of the parsed token.
    """
    v = Value()
    if not value:
        raise errors.HeaderParseError("Expected value but found end of string")
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            raise errors.HeaderParseError("Expected value but found "
                                          "only {}".format(leader))
    parse = get_quoted_string if value[0] == '"' else get_extended_attribute
    token, value = parse(value)
    if leader is not None:
        token[:0] = [leader]
    v.append(token)
    return v, value
| 2641 |
def get_parameter(value):
    """ attribute [section] ["*"] [CFWS] "=" value

    Parse a single MIME parameter (RFC 2045/2231) from *value*, returning a
    Parameter token and the unparsed remainder.  The CFWS is implied by the
    RFC but not made explicit in the BNF.  This simplified form of the BNF
    from the RFC is made to conform with the RFC BNF through some extra
    checks.  We do it this way because it makes both error recovery and
    working with the resulting parse tree easier.

    Raises errors.HeaderParseError when the input cannot be recovered into a
    parameter at all; lesser deviations are recorded on the token's defects.
    """
    # It is possible CFWS would also be implicitly allowed between the section
    # and the 'extended-attribute' marker (the '*'), but we've never seen that
    # in the wild and we will therefore ignore the possibility.
    param = Parameter()
    token, value = get_attribute(value)
    param.append(token)
    if not value or value[0] == ';':
        param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
            "name ({}) but no value".format(token)))
        return param, value
    if value[0] == '*':
        # RFC 2231 continuation section ("*0", "*1", ...) and/or the
        # extended-parameter marker ("*=").
        try:
            token, value = get_section(value)
            param.sectioned = True
            param.append(token)
        except errors.HeaderParseError:
            pass
        if not value:
            raise errors.HeaderParseError("Incomplete parameter")
        if value[0] == '*':
            param.append(ValueTerminal('*', 'extended-parameter-marker'))
            value = value[1:]
            param.extended = True
    if value[0] != '=':
        raise errors.HeaderParseError("Parameter not followed by '='")
    param.append(ValueTerminal('=', 'parameter-separator'))
    value = value[1:]
    leader = None
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        param.append(token)
    remainder = None
    appendto = param
    if param.extended and value and value[0] == '"':
        # Now for some serious hackery to handle the common invalid case of
        # double quotes around an extended value.  We also accept (with defect)
        # a value marked as encoded that isn't really.
        qstring, remainder = get_quoted_string(value)
        inner_value = qstring.stripped_value
        semi_valid = False
        if param.section_number == 0:
            if inner_value and inner_value[0] == "'":
                semi_valid = True
            else:
                token, rest = get_attrtext(inner_value)
                if rest and rest[0] == "'":
                    semi_valid = True
        else:
            try:
                token, rest = get_extended_attrtext(inner_value)
            except errors.HeaderParseError:
                # Bug fix: this was a bare 'except:', which would also have
                # swallowed KeyboardInterrupt/SystemExit.  Only a parse
                # failure means "not semi-valid" here.
                pass
            else:
                if not rest:
                    semi_valid = True
        if semi_valid:
            param.defects.append(errors.InvalidHeaderDefect(
                "Quoted string value for extended parameter is invalid"))
            param.append(qstring)
            # Re-parse the quoted string's contents as the extended value;
            # later tokens are appended inside the bare-quoted-string.
            for t in qstring:
                if t.token_type == 'bare-quoted-string':
                    t[:] = []
                    appendto = t
                    break
            value = inner_value
        else:
            remainder = None
            param.defects.append(errors.InvalidHeaderDefect(
                "Parameter marked as extended but appears to have a "
                "quoted string value that is non-encoded"))
    if value and value[0] == "'":
        # Value starts at the charset delimiter; charset itself is empty.
        token = None
    else:
        token, value = get_value(value)
    if not param.extended or param.section_number > 0:
        if not value or value[0] != "'":
            # Normal (non-initial-extended) parameter; we're done.
            appendto.append(token)
            if remainder is not None:
                assert not value, value
                value = remainder
            return param, value
        param.defects.append(errors.InvalidHeaderDefect(
            "Apparent initial-extended-value but attribute "
            "was not marked as extended or was not initial section"))
    if not value:
        # Assume the charset/lang is missing and the token is the value.
        param.defects.append(errors.InvalidHeaderDefect(
            "Missing required charset/lang delimiters"))
        appendto.append(token)
        if remainder is None:
            return param, value
    else:
        if token is not None:
            for t in token:
                if t.token_type == 'extended-attrtext':
                    break
            # Bug fix: this line previously read "t.token_type ==
            # 'attrtext'", a no-op comparison.  The terminal located by the
            # loop above is the charset and is meant to be retyped as plain
            # attrtext before being recorded.
            t.token_type = 'attrtext'
            appendto.append(t)
            param.charset = t.value
    if value[0] != "'":
        raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                      "delimiter, but found {!r}".format(value))
    appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
    value = value[1:]
    if value and value[0] != "'":
        token, value = get_attrtext(value)
        appendto.append(token)
        param.lang = token.value
        if not value or value[0] != "'":
            raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                          "delimiter, but found {}".format(value))
    appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
    value = value[1:]
    if remainder is not None:
        # Treat the rest of value as bare quoted string content.
        v = Value()
        while value:
            if value[0] in WSP:
                token, value = get_fws(value)
            else:
                token, value = get_qcontent(value)
            v.append(token)
        token = v
    else:
        token, value = get_value(value)
    appendto.append(token)
    if remainder is not None:
        assert not value, value
        value = remainder
    return param, value
| 2780 |
def parse_mime_parameters(value):
    """ parameter *( ";" parameter )

    That BNF is meant to indicate this routine should only be called after
    finding and handling the leading ';'.  There is no corresponding rule in
    the formal RFC grammar, but it is more convenient for us for the set of
    parameters to be treated as its own TokenList.

    This is a 'parse' routine because it consumes the remaining value, but it
    would never be called to parse a full header.  Instead it is called to
    parse everything after the non-parameter value of a specific MIME header.

    Parse failures are recovered into invalid-parameter tokens and recorded
    as defects rather than raised.
    """
    mime_parameters = MimeParameters()
    while value:
        try:
            token, value = get_parameter(value)
            mime_parameters.append(token)
        except errors.HeaderParseError:
            # (was "as err" — the binding was never used)
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
            if not value:
                mime_parameters.append(leader)
                return mime_parameters
            if value[0] == ';':
                if leader is not None:
                    mime_parameters.append(leader)
                mime_parameters.defects.append(errors.InvalidHeaderDefect(
                    "parameter entry with no content"))
            else:
                token, value = get_invalid_parameter(value)
                if leader:
                    token[:0] = [leader]
                mime_parameters.append(token)
                mime_parameters.defects.append(errors.InvalidHeaderDefect(
                    "invalid parameter {!r}".format(token)))
        if value and value[0] != ';':
            # Junk after the otherwise valid parameter.  Mark it as
            # invalid, but it will have a value.
            param = mime_parameters[-1]
            param.token_type = 'invalid-parameter'
            token, value = get_invalid_parameter(value)
            param.extend(token)
            mime_parameters.defects.append(errors.InvalidHeaderDefect(
                "parameter with invalid trailing text {!r}".format(token)))
        if value:
            # Must be a ';' at this point.
            mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
            value = value[1:]
    return mime_parameters
| 2832 |
def _find_mime_parameters(tokenlist, value):
    """Do our best to find the parameters in an invalid MIME header

    Everything up to the first ';' is appended to *tokenlist* as misplaced
    specials or phrases; whatever follows the ';' (if any) is handed to
    parse_mime_parameters.
    """
    while value:
        ch = value[0]
        if ch == ';':
            tokenlist.append(ValueTerminal(';', 'parameter-separator'))
            tokenlist.append(parse_mime_parameters(value[1:]))
            return
        if ch in PHRASE_ENDS:
            tokenlist.append(ValueTerminal(ch, 'misplaced-special'))
            value = value[1:]
        else:
            phrase, value = get_phrase(value)
            tokenlist.append(phrase)
| 2848 |
def parse_content_type_header(value):
    """ maintype "/" subtype *( ";" parameter )

    The maintype and subtype are tokens.  Theoretically they could
    be checked against the official IANA list + x-token, but we
    don't do that.

    Always returns a ContentType token; syntax problems are recorded on its
    defects attribute rather than raised.
    """
    ctype = ContentType()
    # (removed "recover = False" — the variable was never read)
    if not value:
        ctype.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content type specification"))
        return ctype
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content maintype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(token)
    # XXX: If we really want to follow the formal grammar we should make
    # maintype and subtype specialized TokenLists here.  Probably not worth it.
    if not value or value[0] != '/':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Invalid content type"))
        if value:
            _find_mime_parameters(ctype, value)
        return ctype
    ctype.maintype = token.value.strip().lower()
    ctype.append(ValueTerminal('/', 'content-type-separator'))
    value = value[1:]
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content subtype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(token)
    ctype.subtype = token.value.strip().lower()
    if not value:
        return ctype
    if value[0] != ';':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Only parameters are valid after content type, but "
            "found {!r}".format(value)))
        # The RFC requires that a syntactically invalid content-type be treated
        # as text/plain.  Perhaps we should postel this, but we should probably
        # only do that if we were checking the subtype value against IANA.
        del ctype.maintype, ctype.subtype
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(ValueTerminal(';', 'parameter-separator'))
    ctype.append(parse_mime_parameters(value[1:]))
    return ctype
| 2905 |
def parse_content_disposition_header(value):
    """ disposition-type *( ";" parameter )

    Always returns a ContentDisposition token; syntax problems are recorded
    on its defects attribute rather than raised.
    """
    disp_header = ContentDisposition()
    if not value:
        disp_header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content disposition"))
        return disp_header
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        # Bug fix: this previously appended to 'ctype', a name that is not
        # defined in this function (copy/paste from the content-type parser),
        # so malformed input raised NameError instead of recording a defect.
        disp_header.defects.append(errors.InvalidHeaderDefect(
            "Expected content disposition but found {!r}".format(value)))
        _find_mime_parameters(disp_header, value)
        return disp_header
    disp_header.append(token)
    disp_header.content_disposition = token.value.strip().lower()
    if not value:
        return disp_header
    if value[0] != ';':
        disp_header.defects.append(errors.InvalidHeaderDefect(
            "Only parameters are valid after content disposition, but "
            "found {!r}".format(value)))
        _find_mime_parameters(disp_header, value)
        return disp_header
    disp_header.append(ValueTerminal(';', 'parameter-separator'))
    disp_header.append(parse_mime_parameters(value[1:]))
    return disp_header
| 2935 |
def parse_content_transfer_encoding_header(value):
    """ mechanism

    Always returns a ContentTransferEncoding token; syntax problems are
    recorded on its defects attribute rather than raised.
    """
    # We should probably validate the values, since the list is fixed.
    cte_header = ContentTransferEncoding()
    if not value:
        cte_header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content transfer encoding"))
        return cte_header
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        # Bug fix: this previously appended to 'ctype', a name that is not
        # defined in this function, so malformed input raised NameError
        # instead of recording a defect.  Also fixes the "trnasfer" typo in
        # the defect message.
        cte_header.defects.append(errors.InvalidHeaderDefect(
            "Expected content transfer encoding but found {!r}".format(value)))
    else:
        cte_header.append(token)
        cte_header.cte = token.value.strip().lower()
        if not value:
            return cte_header
    while value:
        # Anything left over is invalid; keep it in the parse tree, but flag
        # each chunk with a defect.
        cte_header.defects.append(errors.InvalidHeaderDefect(
            "Extra text after content transfer encoding"))
        if value[0] in PHRASE_ENDS:
            cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            cte_header.append(token)
    return cte_header
OLD | NEW |