third_party/google-endpoints/future/backports/email/feedparser.py - Issue 2666783008: Add google-endpoints to third_party/.

Side by Side Diff: third_party/google-endpoints/future/backports/email/feedparser.py

Issue 2666783008: Add google-endpoints to third_party/. (Closed)

Patch Set: Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/google-endpoints/future/backports/email/errors.py ('k') | third_party/google-endpoints/future/backports/email/generator.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 # Copyright (C) 2004-2006 Python Software Foundation

	2 # Authors: Baxter, Wouters and Warsaw

	3 # Contact: email-sig@python.org

	4

	5 """FeedParser - An email feed parser.

	6

	7 The feed parser implements an interface for incrementally parsing an email

	8 message, line by line. This has advantages for certain applications, such as

	9 those reading email messages off a socket.

	10

	11 FeedParser.feed() is the primary interface for pushing new data into the

	12 parser. It returns when there's nothing more it can do with the available

	13 data. When you have no more data to push into the parser, call .close().

	14 This completes the parsing and returns the root message object.

	15

	16 The other advantage of this parser is that it will never raise a parsing

	17 exception. Instead, when it finds something unexpected, it adds a 'defect' to

	18 the current message. Defects are just instances that live on the message

	19 object's .defects attribute.

	20 """

	21 from __future__ import unicode_literals

	22 from __future__ import division

	23 from __future__ import absolute_import

	24 from future.builtins import object, range, super

	25 from future.utils import implements_iterator, PY3

	26

	27 __all__ = ['FeedParser', 'BytesFeedParser']

	28

	29 import re

	30

	31 from future.backports.email import errors

	32 from future.backports.email import message

	33 from future.backports.email._policybase import compat32

	34

	# Line-ending patterns.  CRLF is listed first in every alternation so that a
	# "\r\n" pair is consumed as a single terminator instead of as a lone "\r".
	# Raw strings keep regex escapes such as \Z out of Python's own string-escape
	# processing.
	NLCRE = re.compile(r'\r\n|\r|\n')
	NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
	NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
	NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
	# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
	# except controls, SP, and ":".
	# The pattern accepts three kinds of line: a unix-from envelope line
	# ("From "), a "name:" header start, or a continuation line that begins with
	# a tab or a space.
	headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
	EMPTYSTRING = ''
	NL = '\n'

	# Unique sentinel compared by identity ("is NeedMoreData"); it is distinct
	# from the empty string, which the code below uses to mean end-of-file.
	NeedMoreData = object()

	46

	47

	48 # @implements_iterator

	class BufferedSubFile(object):
	    """A file-ish object that can have new data loaded into it.

	    You can also push and pop line-matching predicates onto a stack.  When the
	    current predicate matches the current line, a false EOF response
	    (i.e. empty string) is returned instead.  This lets the parser adhere to a
	    simple abstraction -- it parses until EOF closes the current message.
	    """

	    def __init__(self):
	        # The last partial line pushed into this object.
	        self._partial = ''
	        # The list of full, pushed lines, in reverse order
	        self._lines = []
	        # The stack of false-EOF checking predicates.
	        self._eofstack = []
	        # A flag indicating whether the file has been closed or not.
	        self._closed = False

	    def push_eof_matcher(self, pred):
	        """Push *pred*, a callable taking a line, onto the false-EOF stack."""
	        self._eofstack.append(pred)

	    def pop_eof_matcher(self):
	        """Remove and return the most recently pushed false-EOF predicate."""
	        return self._eofstack.pop()

	    def close(self):
	        """Flush the pending partial line and mark the buffer as closed."""
	        # Don't forget any trailing partial line.  It is appended to the end
	        # of the reversed list, so it is the next line readline() pops.
	        self._lines.append(self._partial)
	        self._partial = ''
	        self._closed = True

	    def readline(self):
	        """Return the next buffered line.

	        Returns NeedMoreData when nothing is buffered and the object is still
	        open, and '' when nothing is buffered and it has been closed.  If any
	        predicate on the false-EOF stack matches the line, the line is put
	        back and '' is returned instead.
	        """
	        if not self._lines:
	            if self._closed:
	                return ''
	            return NeedMoreData
	        # Pop the line off the stack and see if it matches the current
	        # false-EOF predicate.
	        line = self._lines.pop()
	        # RFC 2046, section 5.1.2 requires us to recognize outer level
	        # boundaries at any level of inner nesting.  Do this, but be sure it's
	        # in the order of most to least nested.
	        for ateof in reversed(self._eofstack):
	            if ateof(line):
	                # We're at the false EOF.  But push the last line back first.
	                self._lines.append(line)
	                return ''
	        return line

	    def unreadline(self, line):
	        """Push *line* back so that the next readline() returns it."""
	        # Let the consumer push a line back into the buffer.
	        assert line is not NeedMoreData
	        self._lines.append(line)

	    def push(self, data):
	        """Push some new data into this object."""
	        # Handle any previous leftovers
	        data, self._partial = self._partial + data, ''
	        # Crack into lines, but preserve the newlines on the end of each
	        parts = NLCRE_crack.split(data)
	        # The *ahem* interesting behaviour of re.split when supplied grouping
	        # parentheses is that the last element of the resulting list is the
	        # data after the final RE.  In the case of a NL/CR terminated string,
	        # this is the empty string.
	        self._partial = parts.pop()
	        # GAN 29Mar09  bugs 1555570, 1721862  Confusion at 8K boundary ending
	        # with \r: is there a \n to follow later?  Hold the line back as the
	        # partial until the next push settles it.
	        if not self._partial and parts and parts[-1].endswith('\r'):
	            self._partial = parts.pop(-2) + parts.pop()
	        # parts is a list of strings, alternating between the line contents
	        # and the eol character(s).  Gather up a list of lines after
	        # re-attaching the newlines.
	        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
	        self.pushlines(lines)

	    def pushlines(self, lines):
	        """Queue complete *lines* behind the lines already buffered."""
	        # Reverse and insert at the front of the lines.
	        self._lines[:0] = lines[::-1]

	    def __iter__(self):
	        return self

	    def __next__(self):
	        # Iteration stops on '' (real or false EOF); NeedMoreData is passed
	        # through to the caller as an ordinary item.
	        line = self.readline()
	        if line == '':
	            raise StopIteration
	        return line

	137

	138

	class FeedParser(object):
	    """A feed-style parser of email."""

	    def __init__(self, _factory=message.Message, **_3to2kwargs):
	        """Create a parser that builds messages with *_factory*.

	        _factory is called to create each new message object.  It is first
	        tried with a ``policy`` keyword argument; if that raises TypeError it
	        is treated as an old-style factory and called with no arguments.

	        The policy keyword specifies a policy object that controls a number of
	        aspects of the parser's operation.  The default policy maintains
	        backward compatibility.

	        """
	        # "policy" is the only keyword consumed; the **kwargs signature is the
	        # 3to2 spelling of a keyword-only argument.
	        policy = _3to2kwargs.pop('policy', compat32)
	        self._factory = _factory
	        self.policy = policy
	        try:
	            _factory(policy=self.policy)
	            self._factory_kwds = lambda: {'policy': self.policy}
	        except TypeError:
	            # Assume this is an old-style factory
	            self._factory_kwds = lambda: {}
	        self._input = BufferedSubFile()
	        self._msgstack = []
	        # Bind the generator's "advance one step" method under either
	        # iterator protocol spelling.
	        if PY3:
	            self._parse = self._parsegen().__next__
	        else:
	            self._parse = self._parsegen().next
	        self._cur = None
	        self._last = None
	        self._headersonly = False

	    # Non-public interface for supporting Parser's headersonly flag
	    def _set_headersonly(self):
	        self._headersonly = True

	    def feed(self, data):
	        """Push more data into the parser."""
	        self._input.push(data)
	        self._call_parse()

	    def _call_parse(self):
	        # Resume the parsing generator; StopIteration only means it has
	        # already run to completion.
	        try:
	            self._parse()
	        except StopIteration:
	            pass

	    def close(self):
	        """Parse all remaining data and return the root message object."""
	        self._input.close()
	        self._call_parse()
	        root = self._pop_message()
	        assert not self._msgstack
	        # Look for final set of defects
	        if (root.get_content_maintype() == 'multipart'
	                and not root.is_multipart()):
	            defect = errors.MultipartInvariantViolationDefect()
	            self.policy.handle_defect(root, defect)
	        return root

	    def _new_message(self):
	        # Build a message with the factory, attach it to the message on top
	        # of the stack (if any), then make it the current and last message.
	        msg = self._factory(**self._factory_kwds())
	        if self._cur and self._cur.get_content_type() == 'multipart/digest':
	            msg.set_default_type('message/rfc822')
	        if self._msgstack:
	            self._msgstack[-1].attach(msg)
	        self._msgstack.append(msg)
	        self._cur = msg
	        self._last = msg

	    def _pop_message(self):
	        # Pop and return the top message; the new top (or None) becomes the
	        # current message.
	        retval = self._msgstack.pop()
	        if self._msgstack:
	            self._cur = self._msgstack[-1]
	        else:
	            self._cur = None
	        return retval

	    def _parsegen(self):
	        # Generator that parses one message (recursing for nested parts).  It
	        # yields NeedMoreData whenever the input buffer runs dry and resumes
	        # from the same point on the next call.
	        #
	        # Create a new message and start by parsing headers.
	        self._new_message()
	        headers = []
	        # Collect the headers, searching for a line that doesn't match the RFC
	        # 2822 header or continuation pattern (including an empty line).
	        for line in self._input:
	            if line is NeedMoreData:
	                yield NeedMoreData
	                continue
	            if not headerRE.match(line):
	                # If we saw the RFC defined header/body separator
	                # (i.e. newline), just throw it away. Otherwise the line is
	                # part of the body so push it back.
	                if not NLCRE.match(line):
	                    defect = errors.MissingHeaderBodySeparatorDefect()
	                    self.policy.handle_defect(self._cur, defect)
	                    self._input.unreadline(line)
	                break
	            headers.append(line)
	        # Done with the headers, so parse them and figure out what we're
	        # supposed to see in the body of the message.
	        self._parse_headers(headers)
	        # Headers-only parsing is a backwards compatibility hack, which was
	        # necessary in the older parser, which could raise errors.  All
	        # remaining lines in the input are thrown into the message body.
	        if self._headersonly:
	            lines = []
	            while True:
	                line = self._input.readline()
	                if line is NeedMoreData:
	                    yield NeedMoreData
	                    continue
	                if line == '':
	                    break
	                lines.append(line)
	            self._cur.set_payload(EMPTYSTRING.join(lines))
	            return
	        if self._cur.get_content_type() == 'message/delivery-status':
	            # message/delivery-status contains blocks of headers separated by
	            # a blank line.  We'll represent each header block as a separate
	            # nested message object, but the processing is a bit different
	            # than standard message/* types because there is no body for the
	            # nested messages.  A blank line separates the subparts.
	            while True:
	                self._input.push_eof_matcher(NLCRE.match)
	                for retval in self._parsegen():
	                    if retval is NeedMoreData:
	                        yield NeedMoreData
	                        continue
	                    break
	                self._pop_message()
	                # We need to pop the EOF matcher in order to tell if we're at
	                # the end of the current file, not the end of the last block
	                # of message headers.
	                self._input.pop_eof_matcher()
	                # The input stream must be sitting at the newline or at the
	                # EOF.  We want to see if we're at the end of this subpart, so
	                # first consume the blank line, then test the next line to see
	                # if we're at this subpart's EOF.
	                while True:
	                    line = self._input.readline()
	                    if line is NeedMoreData:
	                        yield NeedMoreData
	                        continue
	                    break
	                while True:
	                    line = self._input.readline()
	                    if line is NeedMoreData:
	                        yield NeedMoreData
	                        continue
	                    break
	                if line == '':
	                    break
	                # Not at EOF so this is a line we're going to need.
	                self._input.unreadline(line)
	            return
	        if self._cur.get_content_maintype() == 'message':
	            # The message claims to be a message/* type, then what follows is
	            # another RFC 2822 message.
	            for retval in self._parsegen():
	                if retval is NeedMoreData:
	                    yield NeedMoreData
	                    continue
	                break
	            self._pop_message()
	            return
	        if self._cur.get_content_maintype() == 'multipart':
	            boundary = self._cur.get_boundary()
	            if boundary is None:
	                # The message /claims/ to be a multipart but it has not
	                # defined a boundary.  That's a problem which we'll handle by
	                # reading everything until the EOF and marking the message as
	                # defective.
	                defect = errors.NoBoundaryInMultipartDefect()
	                self.policy.handle_defect(self._cur, defect)
	                lines = []
	                for line in self._input:
	                    if line is NeedMoreData:
	                        yield NeedMoreData
	                        continue
	                    lines.append(line)
	                self._cur.set_payload(EMPTYSTRING.join(lines))
	                return
	            # Make sure a valid content type was specified per RFC 2045:6.4.
	            if (self._cur.get('content-transfer-encoding', '8bit').lower()
	                    not in ('7bit', '8bit', 'binary')):
	                defect = errors.InvalidMultipartContentTransferEncodingDefect()
	                self.policy.handle_defect(self._cur, defect)
	            # Create a line match predicate which matches the inter-part
	            # boundary as well as the end-of-multipart boundary.  Don't push
	            # this onto the input stream until we've scanned past the
	            # preamble.
	            separator = '--' + boundary
	            boundaryre = re.compile(
	                '(?P<sep>' + re.escape(separator) +
	                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
	            capturing_preamble = True
	            preamble = []
	            linesep = False
	            close_boundary_seen = False
	            while True:
	                line = self._input.readline()
	                if line is NeedMoreData:
	                    yield NeedMoreData
	                    continue
	                if line == '':
	                    break
	                mo = boundaryre.match(line)
	                if mo:
	                    # If we're looking at the end boundary, we're done with
	                    # this multipart.  If there was a newline at the end of
	                    # the closing boundary, then we need to initialize the
	                    # epilogue with the empty string (see below).
	                    if mo.group('end'):
	                        close_boundary_seen = True
	                        linesep = mo.group('linesep')
	                        break
	                    # We saw an inter-part boundary.  Were we in the preamble?
	                    if capturing_preamble:
	                        if preamble:
	                            # According to RFC 2046, the last newline belongs
	                            # to the boundary.
	                            lastline = preamble[-1]
	                            eolmo = NLCRE_eol.search(lastline)
	                            if eolmo:
	                                preamble[-1] = lastline[:-len(eolmo.group(0))]
	                            self._cur.preamble = EMPTYSTRING.join(preamble)
	                        capturing_preamble = False
	                        self._input.unreadline(line)
	                        continue
	                    # We saw a boundary separating two parts.  Consume any
	                    # multiple boundary lines that may be following.  Our
	                    # interpretation of RFC 2046 BNF grammar does not produce
	                    # body parts within such double boundaries.
	                    while True:
	                        line = self._input.readline()
	                        if line is NeedMoreData:
	                            yield NeedMoreData
	                            continue
	                        mo = boundaryre.match(line)
	                        if not mo:
	                            self._input.unreadline(line)
	                            break
	                    # Recurse to parse this subpart; the input stream points
	                    # at the subpart's first line.
	                    self._input.push_eof_matcher(boundaryre.match)
	                    for retval in self._parsegen():
	                        if retval is NeedMoreData:
	                            yield NeedMoreData
	                            continue
	                        break
	                    # Because of RFC 2046, the newline preceding the boundary
	                    # separator actually belongs to the boundary, not the
	                    # previous subpart's payload (or epilogue if the previous
	                    # part is a multipart).
	                    if self._last.get_content_maintype() == 'multipart':
	                        epilogue = self._last.epilogue
	                        if epilogue == '':
	                            self._last.epilogue = None
	                        elif epilogue is not None:
	                            mo = NLCRE_eol.search(epilogue)
	                            if mo:
	                                end = len(mo.group(0))
	                                self._last.epilogue = epilogue[:-end]
	                    else:
	                        payload = self._last._payload
	                        if isinstance(payload, str):
	                            mo = NLCRE_eol.search(payload)
	                            if mo:
	                                payload = payload[:-len(mo.group(0))]
	                                self._last._payload = payload
	                    self._input.pop_eof_matcher()
	                    self._pop_message()
	                    # Set the multipart up for newline cleansing, which will
	                    # happen if we're in a nested multipart.
	                    self._last = self._cur
	                else:
	                    # I think we must be in the preamble
	                    assert capturing_preamble
	                    preamble.append(line)
	            # We've seen either the EOF or the end boundary.  If we're still
	            # capturing the preamble, we never saw the start boundary.  Note
	            # that as a defect and store the captured text as the payload.
	            if capturing_preamble:
	                defect = errors.StartBoundaryNotFoundDefect()
	                self.policy.handle_defect(self._cur, defect)
	                self._cur.set_payload(EMPTYSTRING.join(preamble))
	                # The remaining input is consumed but not stored, so the
	                # epilogue below is always the empty string.
	                epilogue = []
	                for line in self._input:
	                    if line is NeedMoreData:
	                        yield NeedMoreData
	                        continue
	                self._cur.epilogue = EMPTYSTRING.join(epilogue)
	                return
	            # If we're not processing the preamble, then we might have seen
	            # EOF without seeing that end boundary...that is also a defect.
	            if not close_boundary_seen:
	                defect = errors.CloseBoundaryNotFoundDefect()
	                self.policy.handle_defect(self._cur, defect)
	                return
	            # Everything from here to the EOF is epilogue.  If the end boundary
	            # ended in a newline, we'll need to make sure the epilogue isn't
	            # None
	            if linesep:
	                epilogue = ['']
	            else:
	                epilogue = []
	            for line in self._input:
	                if line is NeedMoreData:
	                    yield NeedMoreData
	                    continue
	                epilogue.append(line)
	            # Any CRLF at the front of the epilogue is not technically part of
	            # the epilogue.  Also, watch out for an empty string epilogue,
	            # which means a single newline.
	            if epilogue:
	                firstline = epilogue[0]
	                bolmo = NLCRE_bol.match(firstline)
	                if bolmo:
	                    epilogue[0] = firstline[len(bolmo.group(0)):]
	            self._cur.epilogue = EMPTYSTRING.join(epilogue)
	            return
	        # Otherwise, it's some non-multipart type, so the entire rest of the
	        # file contents becomes the payload.
	        lines = []
	        for line in self._input:
	            if line is NeedMoreData:
	                yield NeedMoreData
	                continue
	            lines.append(line)
	        self._cur.set_payload(EMPTYSTRING.join(lines))

	    def _parse_headers(self, lines):
	        # Passed a list of lines that make up the headers for the current msg
	        lastheader = ''
	        lastvalue = []
	        for lineno, line in enumerate(lines):
	            # Check for continuation
	            if line[0] in ' \t':
	                if not lastheader:
	                    # The first line of the headers was a continuation.  This
	                    # is illegal, so let's note the defect, store the illegal
	                    # line, and ignore it for purposes of headers.
	                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
	                    self.policy.handle_defect(self._cur, defect)
	                    continue
	                lastvalue.append(line)
	                continue
	            if lastheader:
	                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
	                lastheader, lastvalue = '', []
	            # Check for envelope header, i.e. unix-from
	            if line.startswith('From '):
	                if lineno == 0:
	                    # Strip off the trailing newline
	                    mo = NLCRE_eol.search(line)
	                    if mo:
	                        line = line[:-len(mo.group(0))]
	                    self._cur.set_unixfrom(line)
	                    continue
	                elif lineno == len(lines) - 1:
	                    # Something looking like a unix-from at the end - it's
	                    # probably the first line of the body, so push back the
	                    # line and stop.
	                    self._input.unreadline(line)
	                    return
	                else:
	                    # Weirdly placed unix-from line.  Note this as a defect
	                    # and ignore it.  Unlike the other defects here, it is
	                    # appended to the message's defect list directly rather
	                    # than routed through policy.handle_defect().
	                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
	                    self._cur.defects.append(defect)
	                    continue
	            # Split the line on the colon separating field name from value.
	            # There will always be a colon, because if there wasn't the part of
	            # the parser that calls us would have started parsing the body.
	            i = line.find(':')
	            assert i>0, "_parse_headers fed line with no : and no leading WS"
	            lastheader = line[:i]
	            lastvalue = [line]
	        # Done with all the lines, so handle the last header.
	        if lastheader:
	            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))

	519

	520

	class BytesFeedParser(FeedParser):
	    """Like FeedParser, but feed accepts bytes."""

	    def feed(self, data):
	        """Decode *data* and hand the resulting text to FeedParser.feed().

	        The bytes are decoded as ASCII with the 'surrogateescape' error
	        handler.
	        """
	        text = data.decode('ascii', 'surrogateescape')
	        super().feed(text)

OLD	NEW