third_party/google-endpoints/future/backports/html/parser.py - Issue 2666783008: Add google-endpoints to third_party/.

Side by Side Diff: third_party/google-endpoints/future/backports/html/parser.py

Issue 2666783008: Add google-endpoints to third_party/. (Closed)

Patch Set: Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/google-endpoints/future/backports/html/entities.py ('k') | third_party/google-endpoints/future/backports/http/__init__.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 """A parser for HTML and XHTML.

	2

	3 Backported for python-future from Python 3.3.

	4 """

	5

	6 # This file is based on sgmllib.py, but the API is slightly different.

	7

	8 # XXX There should be a way to distinguish between PCDATA (parsed

	9 # character data -- the normal case), RCDATA (replaceable character

	10 # data -- only char and entity references and end tags are special)

	11 # and CDATA (character data -- only end tags are special).

	12

	13 from __future__ import (absolute_import, division,

	14 print_function, unicode_literals)

	15 from future.builtins import *

	16 from future.backports import _markupbase

	17 import re

	18 import warnings

	19

	20 # Regular expressions used for parsing

	21

	22 interesting_normal = re.compile('[&<]')

	23 incomplete = re.compile('&[a-zA-Z#]')

	24

	25 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')

	26 charref = re.compile('&#(?:[0-9]+\|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

	27

	28 starttagopen = re.compile('<[a-zA-Z]')

	29 piclose = re.compile('>')

	30 commentclose = re.compile(r'--\s*>')

	31 tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_])(?:\s\|/(?!>))')

	32 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state

	33 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state

	34 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

	35 # Note:

	36 # 1) the strict attrfind isn't really strict, but we can't make it

	37 # correctly strict without breaking backward compatibility;

	38 # 2) if you change attrfind remember to update locatestarttagend too;

	39 # 3) if you change attrfind and/or locatestarttagend the parser will

	40 # explode, so don't do it.

	41 attrfind = re.compile(

	42 r'\s([a-zA-Z_][-.:a-zA-Z_0-9])(\s=\s'

	43 r'(\'[^\']\'\|"[^"]"\|[^\s"\'=<>`]*))?')

	44 attrfind_tolerant = re.compile(

	45 r'((?<=[\'"\s/])[^\s/>][^\s/=>])(\s=+\s*'

	46 r'(\'[^\']\'\|"[^"]"\|(?![\'"])[^>\s]))?(?:\s\|/(?!>))')

	47 locatestarttagend = re.compile(r"""

	48 <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name

	49 (?:\s+ # whitespace before attribute name

	50 (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name

	51 (?:\s=\s # value indicator

	52 (?:'[^']*' # LITA-enclosed value

	53 \|\"[^\"]*\" # LIT-enclosed value

	54 \|[^'\">\s]+ # bare value

	55 )

	56 )?

	57 )

	58 )*

	59 \s* # trailing whitespace

	60 """, re.VERBOSE)

	61 locatestarttagend_tolerant = re.compile(r"""

	62 <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name

	63 (?:[\s/]* # optional whitespace before attribute name

	64 (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name

	65 (?:\s=+\s # value indicator

	66 (?:'[^']*' # LITA-enclosed value

	67 \|"[^"]*" # LIT-enclosed value

	68 \|(?!['"])[^>\s]* # bare value

	69 )

	70 (?:\s,) # possibly followed by a comma

	71 )?(?:\s\|/(?!>))*

	72 )*

	73 )?

	74 \s* # trailing whitespace

	75 """, re.VERBOSE)

	76 endendtag = re.compile('>')

	77 # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between

	78 # </ and the tag name, so maybe this should be fixed

	79 endtagfind = re.compile('</\s([a-zA-Z][-.a-zA-Z0-9:_])\s*>')

	80

	81

	82 class HTMLParseError(Exception):

	83 """Exception raised for all parse errors."""

	84

	85 def __init__(self, msg, position=(None, None)):

	86 assert msg

	87 self.msg = msg

	88 self.lineno = position[0]

	89 self.offset = position[1]

	90

	91 def __str__(self):

	92 result = self.msg

	93 if self.lineno is not None:

	94 result = result + ", at line %d" % self.lineno

	95 if self.offset is not None:

	96 result = result + ", column %d" % (self.offset + 1)

	97 return result

	98

	99

	100 class HTMLParser(_markupbase.ParserBase):

	101 """Find tags and other markup and call handler functions.

	102

	103 Usage:

	104 p = HTMLParser()

	105 p.feed(data)

	106 ...

	107 p.close()

	108

	109 Start tags are handled by calling self.handle_starttag() or

	110 self.handle_startendtag(); end tags by self.handle_endtag(). The

	111 data between tags is passed from the parser to the derived class

	112 by calling self.handle_data() with the data as argument (the data

	113 may be split up in arbitrary chunks). Entity references are

	114 passed by calling self.handle_entityref() with the entity

	115 reference as the argument. Numeric character references are

	116 passed to self.handle_charref() with the string containing the

	117 reference as the argument.

	118 """

	119

	120 CDATA_CONTENT_ELEMENTS = ("script", "style")

	121

	122 def __init__(self, strict=False):

	123 """Initialize and reset this instance.

	124

	125 If strict is set to False (the default) the parser will parse invalid

	126 markup, otherwise it will raise an error. Note that the strict mode

	127 is deprecated.

	128 """

	129 if strict:

	130 warnings.warn("The strict mode is deprecated.",

	131 DeprecationWarning, stacklevel=2)

	132 self.strict = strict

	133 self.reset()

	134

	135 def reset(self):

	136 """Reset this instance. Loses all unprocessed data."""

	137 self.rawdata = ''

	138 self.lasttag = '???'

	139 self.interesting = interesting_normal

	140 self.cdata_elem = None

	141 _markupbase.ParserBase.reset(self)

	142

	143 def feed(self, data):

	144 r"""Feed data to the parser.

	145

	146 Call this as often as you want, with as little or as much text

	147 as you want (may include '\n').

	148 """

	149 self.rawdata = self.rawdata + data

	150 self.goahead(0)

	151

	152 def close(self):

	153 """Handle any buffered data."""

	154 self.goahead(1)

	155

	156 def error(self, message):

	157 raise HTMLParseError(message, self.getpos())

	158

	159 __starttag_text = None

	160

	161 def get_starttag_text(self):

	162 """Return full source of start tag: '<...>'."""

	163 return self.__starttag_text

	164

	165 def set_cdata_mode(self, elem):

	166 self.cdata_elem = elem.lower()

	167 self.interesting = re.compile(r'</\s%s\s>' % self.cdata_elem, re.I)

	168

	169 def clear_cdata_mode(self):

	170 self.interesting = interesting_normal

	171 self.cdata_elem = None

	172

	173 # Internal -- handle data as far as reasonable. May leave state

	174 # and data to be processed by a subsequent call. If 'end' is

	175 # true, force handling all data as if followed by EOF marker.

	176 def goahead(self, end):

	177 rawdata = self.rawdata

	178 i = 0

	179 n = len(rawdata)

	180 while i < n:

	181 match = self.interesting.search(rawdata, i) # < or &

	182 if match:

	183 j = match.start()

	184 else:

	185 if self.cdata_elem:

	186 break

	187 j = n

	188 if i < j: self.handle_data(rawdata[i:j])

	189 i = self.updatepos(i, j)

	190 if i == n: break

	191 startswith = rawdata.startswith

	192 if startswith('<', i):

	193 if starttagopen.match(rawdata, i): # < + letter

	194 k = self.parse_starttag(i)

	195 elif startswith("</", i):

	196 k = self.parse_endtag(i)

	197 elif startswith("<!--", i):

	198 k = self.parse_comment(i)

	199 elif startswith("<?", i):

	200 k = self.parse_pi(i)

	201 elif startswith("<!", i):

	202 if self.strict:

	203 k = self.parse_declaration(i)

	204 else:

	205 k = self.parse_html_declaration(i)

	206 elif (i + 1) < n:

	207 self.handle_data("<")

	208 k = i + 1

	209 else:

	210 break

	211 if k < 0:

	212 if not end:

	213 break

	214 if self.strict:

	215 self.error("EOF in middle of construct")

	216 k = rawdata.find('>', i + 1)

	217 if k < 0:

	218 k = rawdata.find('<', i + 1)

	219 if k < 0:

	220 k = i + 1

	221 else:

	222 k += 1

	223 self.handle_data(rawdata[i:k])

	224 i = self.updatepos(i, k)

	225 elif startswith("&#", i):

	226 match = charref.match(rawdata, i)

	227 if match:

	228 name = match.group()[2:-1]

	229 self.handle_charref(name)

	230 k = match.end()

	231 if not startswith(';', k-1):

	232 k = k - 1

	233 i = self.updatepos(i, k)

	234 continue

	235 else:

	236 if ";" in rawdata[i:]: #bail by consuming &#

	237 self.handle_data(rawdata[0:2])

	238 i = self.updatepos(i, 2)

	239 break

	240 elif startswith('&', i):

	241 match = entityref.match(rawdata, i)

	242 if match:

	243 name = match.group(1)

	244 self.handle_entityref(name)

	245 k = match.end()

	246 if not startswith(';', k-1):

	247 k = k - 1

	248 i = self.updatepos(i, k)

	249 continue

	250 match = incomplete.match(rawdata, i)

	251 if match:

	252 # match.group() will contain at least 2 chars

	253 if end and match.group() == rawdata[i:]:

	254 if self.strict:

	255 self.error("EOF in middle of entity or char ref")

	256 else:

	257 if k <= i:

	258 k = n

	259 i = self.updatepos(i, i + 1)

	260 # incomplete

	261 break

	262 elif (i + 1) < n:

	263 # not the end of the buffer, and can't be confused

	264 # with some other construct

	265 self.handle_data("&")

	266 i = self.updatepos(i, i + 1)

	267 else:

	268 break

	269 else:

	270 assert 0, "interesting.search() lied"

	271 # end while

	272 if end and i < n and not self.cdata_elem:

	273 self.handle_data(rawdata[i:n])

	274 i = self.updatepos(i, n)

	275 self.rawdata = rawdata[i:]

	276

	277 # Internal -- parse html declarations, return length or -1 if not terminated

	278 # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state

	279 # See also parse_declaration in _markupbase

	280 def parse_html_declaration(self, i):

	281 rawdata = self.rawdata

	282 assert rawdata[i:i+2] == '<!', ('unexpected call to '

	283 'parse_html_declaration()')

	284 if rawdata[i:i+4] == '<!--':

	285 # this case is actually already handled in goahead()

	286 return self.parse_comment(i)

	287 elif rawdata[i:i+3] == '<![':

	288 return self.parse_marked_section(i)

	289 elif rawdata[i:i+9].lower() == '<!doctype':

	290 # find the closing >

	291 gtpos = rawdata.find('>', i+9)

	292 if gtpos == -1:

	293 return -1

	294 self.handle_decl(rawdata[i+2:gtpos])

	295 return gtpos+1

	296 else:

	297 return self.parse_bogus_comment(i)

	298

	299 # Internal -- parse bogus comment, return length or -1 if not terminated

	300 # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state

	301 def parse_bogus_comment(self, i, report=1):

	302 rawdata = self.rawdata

	303 assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '

	304 'parse_comment()')

	305 pos = rawdata.find('>', i+2)

	306 if pos == -1:

	307 return -1

	308 if report:

	309 self.handle_comment(rawdata[i+2:pos])

	310 return pos + 1

	311

	312 # Internal -- parse processing instr, return end or -1 if not terminated

	313 def parse_pi(self, i):

	314 rawdata = self.rawdata

	315 assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'

	316 match = piclose.search(rawdata, i+2) # >

	317 if not match:

	318 return -1

	319 j = match.start()

	320 self.handle_pi(rawdata[i+2: j])

	321 j = match.end()

	322 return j

	323

	324 # Internal -- handle starttag, return end or -1 if not terminated

	325 def parse_starttag(self, i):

	326 self.__starttag_text = None

	327 endpos = self.check_for_whole_start_tag(i)

	328 if endpos < 0:

	329 return endpos

	330 rawdata = self.rawdata

	331 self.__starttag_text = rawdata[i:endpos]

	332

	333 # Now parse the data between i+1 and j into a tag and attrs

	334 attrs = []

	335 match = tagfind.match(rawdata, i+1)

	336 assert match, 'unexpected call to parse_starttag()'

	337 k = match.end()

	338 self.lasttag = tag = match.group(1).lower()

	339 while k < endpos:

	340 if self.strict:

	341 m = attrfind.match(rawdata, k)

	342 else:

	343 m = attrfind_tolerant.match(rawdata, k)

	344 if not m:

	345 break

	346 attrname, rest, attrvalue = m.group(1, 2, 3)

	347 if not rest:

	348 attrvalue = None

	349 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \

	350 attrvalue[:1] == '"' == attrvalue[-1:]:

	351 attrvalue = attrvalue[1:-1]

	352 if attrvalue:

	353 attrvalue = self.unescape(attrvalue)

	354 attrs.append((attrname.lower(), attrvalue))

	355 k = m.end()

	356

	357 end = rawdata[k:endpos].strip()

	358 if end not in (">", "/>"):

	359 lineno, offset = self.getpos()

	360 if "\n" in self.__starttag_text:

	361 lineno = lineno + self.__starttag_text.count("\n")

	362 offset = len(self.__starttag_text) \

	363 - self.__starttag_text.rfind("\n")

	364 else:

	365 offset = offset + len(self.__starttag_text)

	366 if self.strict:

	367 self.error("junk characters in start tag: %r"

	368 % (rawdata[k:endpos][:20],))

	369 self.handle_data(rawdata[i:endpos])

	370 return endpos

	371 if end.endswith('/>'):

	372 # XHTML-style empty tag: <span attr="value" />

	373 self.handle_startendtag(tag, attrs)

	374 else:

	375 self.handle_starttag(tag, attrs)

	376 if tag in self.CDATA_CONTENT_ELEMENTS:

	377 self.set_cdata_mode(tag)

	378 return endpos

	379

	380 # Internal -- check to see if we have a complete starttag; return end

	381 # or -1 if incomplete.

	382 def check_for_whole_start_tag(self, i):

	383 rawdata = self.rawdata

	384 if self.strict:

	385 m = locatestarttagend.match(rawdata, i)

	386 else:

	387 m = locatestarttagend_tolerant.match(rawdata, i)

	388 if m:

	389 j = m.end()

	390 next = rawdata[j:j+1]

	391 if next == ">":

	392 return j + 1

	393 if next == "/":

	394 if rawdata.startswith("/>", j):

	395 return j + 2

	396 if rawdata.startswith("/", j):

	397 # buffer boundary

	398 return -1

	399 # else bogus input

	400 if self.strict:

	401 self.updatepos(i, j + 1)

	402 self.error("malformed empty start tag")

	403 if j > i:

	404 return j

	405 else:

	406 return i + 1

	407 if next == "":

	408 # end of input

	409 return -1

	410 if next in ("abcdefghijklmnopqrstuvwxyz=/"

	411 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):

	412 # end of input in or before attribute value, or we have the

	413 # '/' from a '/>' ending

	414 return -1

	415 if self.strict:

	416 self.updatepos(i, j)

	417 self.error("malformed start tag")

	418 if j > i:

	419 return j

	420 else:

	421 return i + 1

	422 raise AssertionError("we should not get here!")

	423

	424 # Internal -- parse endtag, return end or -1 if incomplete

	425 def parse_endtag(self, i):

	426 rawdata = self.rawdata

	427 assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"

	428 match = endendtag.search(rawdata, i+1) # >

	429 if not match:

	430 return -1

	431 gtpos = match.end()

	432 match = endtagfind.match(rawdata, i) # </ + tag + >

	433 if not match:

	434 if self.cdata_elem is not None:

	435 self.handle_data(rawdata[i:gtpos])

	436 return gtpos

	437 if self.strict:

	438 self.error("bad end tag: %r" % (rawdata[i:gtpos],))

	439 # find the name: w3.org/TR/html5/tokenization.html#tag-name-state

	440 namematch = tagfind_tolerant.match(rawdata, i+2)

	441 if not namematch:

	442 # w3.org/TR/html5/tokenization.html#end-tag-open-state

	443 if rawdata[i:i+3] == '</>':

	444 return i+3

	445 else:

	446 return self.parse_bogus_comment(i)

	447 tagname = namematch.group().lower()

	448 # consume and ignore other stuff between the name and the >

	449 # Note: this is not 100% correct, since we might have things like

	450 # </tag attr=">">, but looking for > after tha name should cover

	451 # most of the cases and is much simpler

	452 gtpos = rawdata.find('>', namematch.end())

	453 self.handle_endtag(tagname)

	454 return gtpos+1

	455

	456 elem = match.group(1).lower() # script or style

	457 if self.cdata_elem is not None:

	458 if elem != self.cdata_elem:

	459 self.handle_data(rawdata[i:gtpos])

	460 return gtpos

	461

	462 self.handle_endtag(elem.lower())

	463 self.clear_cdata_mode()

	464 return gtpos

	465

	466 # Overridable -- finish processing of start+end tag: <tag.../>

	467 def handle_startendtag(self, tag, attrs):

	468 self.handle_starttag(tag, attrs)

	469 self.handle_endtag(tag)

	470

	471 # Overridable -- handle start tag

	472 def handle_starttag(self, tag, attrs):

	473 pass

	474

	475 # Overridable -- handle end tag

	476 def handle_endtag(self, tag):

	477 pass

	478

	479 # Overridable -- handle character reference

	480 def handle_charref(self, name):

	481 pass

	482

	483 # Overridable -- handle entity reference

	484 def handle_entityref(self, name):

	485 pass

	486

	487 # Overridable -- handle data

	488 def handle_data(self, data):

	489 pass

	490

	491 # Overridable -- handle comment

	492 def handle_comment(self, data):

	493 pass

	494

	495 # Overridable -- handle declaration

	496 def handle_decl(self, decl):

	497 pass

	498

	499 # Overridable -- handle processing instruction

	500 def handle_pi(self, data):

	501 pass

	502

	503 def unknown_decl(self, data):

	504 if self.strict:

	505 self.error("unknown declaration: %r" % (data,))

	506

	507 # Internal -- helper to remove special character quoting

	508 def unescape(self, s):

	509 if '&' not in s:

	510 return s

	511 def replaceEntities(s):

	512 s = s.groups()[0]

	513 try:

	514 if s[0] == "#":

	515 s = s[1:]

	516 if s[0] in ['x','X']:

	517 c = int(s[1:].rstrip(';'), 16)

	518 else:

	519 c = int(s.rstrip(';'))

	520 return chr(c)

	521 except ValueError:

	522 return '&#' + s

	523 else:

	524 from future.backports.html.entities import html5

	525 if s in html5:

	526 return html5[s]

	527 elif s.endswith(';'):

	528 return '&' + s

	529 for x in range(2, len(s)):

	530 if s[:x] in html5:

	531 return html5[s[:x]] + s[x:]

	532 else:

	533 return '&' + s

	534

	535 return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;\|\w{1,32};?))",

	536 replaceEntities, s)

	537

OLD	NEW