third_party/google-endpoints/future/backports/_markupbase.py - Issue 2666783008: Add google-endpoints to third_party/.

Side by Side Diff: third_party/google-endpoints/future/backports/_markupbase.py

Issue 2666783008: Add google-endpoints to third_party/. (Closed)

Patch Set: Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 """Shared support for scanning document type declarations in HTML and XHTML.

	2

	3 Backported for python-future from Python 3.3. Reason: ParserBase is an

	4 old-style class in the Python 2.7 source of markupbase.py, which I suspect

	5 might be the cause of sporadic unit-test failures on travis-ci.org with

	6 test_htmlparser.py. The test failures look like this:

	7

	8 ======================================================================

	9

	10 ERROR: test_attr_entity_replacement (future.tests.test_htmlparser.AttributesStrictTestCase)

	11

	12 ----------------------------------------------------------------------

	13

	14 Traceback (most recent call last):

	15 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 661, in test_attr_entity_replacement

	16 [("starttag", "a", [("b", "&><\"'")])])

	17 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 93, in _run_check

	18 collector = self.get_collector()

	19 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 617, in get_collector

	20 return EventCollector(strict=True)

	21 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparser.py", line 27, in __init__

	22 html.parser.HTMLParser.__init__(self, *args, **kw)

	23 File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 135, in __init__

	24 self.reset()

	25 File "/home/travis/build/edschofield/python-future/future/backports/html/parser.py", line 143, in reset

	26 _markupbase.ParserBase.reset(self)

	27

	28 TypeError: unbound method reset() must be called with ParserBase instance as first argument (got EventCollector instance instead)

	29

	30 This module is used as a foundation for the html.parser module. It has no

	31 documented public API and should not be used directly.

	32

	33 """

	34

	35 import re

	36

	37 _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]\s').match

	38 _declstringlit_match = re.compile(r'(\'[^\']\'\|"[^"]")\s*').match

	39 _commentclose = re.compile(r'--\s*>')

	40 _markedsectionclose = re.compile(r']\s]\s>')

	41

	42 # An analysis of the MS-Word extensions is available at

	43 # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

	44

	45 _msmarkedsectionclose = re.compile(r']\s*>')

	46

	47 del re

	48

	49

	50 class ParserBase(object):

	51 """Parser base class which provides some common support methods used

	52 by the SGML/HTML and XHTML parsers."""

	53

	54 def __init__(self):

	55 if self.__class__ is ParserBase:

	56 raise RuntimeError(

	57 "_markupbase.ParserBase must be subclassed")

	58

	59 def error(self, message):

	60 raise NotImplementedError(

	61 "subclasses of ParserBase must override error()")

	62

	63 def reset(self):

	64 self.lineno = 1

	65 self.offset = 0

	66

	67 def getpos(self):

	68 """Return current line number and offset."""

	69 return self.lineno, self.offset

	70

	71 # Internal -- update line number and offset. This should be

	72 # called for each piece of data exactly once, in order -- in other

	73 # words the concatenation of all the input strings to this

	74 # function should be exactly the entire input.

	75 def updatepos(self, i, j):

	76 if i >= j:

	77 return j

	78 rawdata = self.rawdata

	79 nlines = rawdata.count("\n", i, j)

	80 if nlines:

	81 self.lineno = self.lineno + nlines

	82 pos = rawdata.rindex("\n", i, j) # Should not fail

	83 self.offset = j-(pos+1)

	84 else:

	85 self.offset = self.offset + j-i

	86 return j

	87

	88 _decl_otherchars = ''

	89

	90 # Internal -- parse declaration (for use by subclasses).

	91 def parse_declaration(self, i):

	92 # This is some sort of declaration; in "HTML as

	93 # deployed," this should only be the document type

	94 # declaration ("<!DOCTYPE html...>").

	95 # ISO 8879:1986, however, has more complex

	96 # declaration syntax for elements in <!...>, including:

	97 # --comment--

	98 # [marked section]

	99 # name in the following list: ENTITY, DOCTYPE, ELEMENT,

	100 # ATTLIST, NOTATION, SHORTREF, USEMAP,

	101 # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM

	102 rawdata = self.rawdata

	103 j = i + 2

	104 assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"

	105 if rawdata[j:j+1] == ">":

	106 # the empty comment <!>

	107 return j + 1

	108 if rawdata[j:j+1] in ("-", ""):

	109 # Start of comment followed by buffer boundary,

	110 # or just a buffer boundary.

	111 return -1

	112 # A simple, practical version could look like: ((name\|stringlit) S*) + ' >'

	113 n = len(rawdata)

	114 if rawdata[j:j+2] == '--': #comment

	115 # Locate --.*-- as the body of the comment

	116 return self.parse_comment(i)

	117 elif rawdata[j] == '[': #marked section

	118 # Locate [statusWord [...arbitrary SGML...]] as the body of the mark ed section

	119 # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA

	120 # Note that this is extended by Microsoft Office "Save as Web" funct ion

	121 # to include [if...] and [endif].

	122 return self.parse_marked_section(i)

	123 else: #all other declaration elements

	124 decltype, j = self._scan_name(j, i)

	125 if j < 0:

	126 return j

	127 if decltype == "doctype":

	128 self._decl_otherchars = ''

	129 while j < n:

	130 c = rawdata[j]

	131 if c == ">":

	132 # end of declaration syntax

	133 data = rawdata[i+2:j]

	134 if decltype == "doctype":

	135 self.handle_decl(data)

	136 else:

	137 # According to the HTML5 specs sections "8.2.4.44 Bogus

	138 # comment state" and "8.2.4.45 Markup declaration open

	139 # state", a comment token should be emitted.

	140 # Calling unknown_decl provides more flexibility though.

	141 self.unknown_decl(data)

	142 return j + 1

	143 if c in "\"'":

	144 m = _declstringlit_match(rawdata, j)

	145 if not m:

	146 return -1 # incomplete

	147 j = m.end()

	148 elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":

	149 name, j = self._scan_name(j, i)

	150 elif c in self._decl_otherchars:

	151 j = j + 1

	152 elif c == "[":

	153 # this could be handled in a separate doctype parser

	154 if decltype == "doctype":

	155 j = self._parse_doctype_subset(j + 1, i)

	156 elif decltype in set(["attlist", "linktype", "link", "element"]) :

	157 # must tolerate []'d groups in a content model in an element declaration

	158 # also in data attribute specifications of attlist declarati on

	159 # also link type declaration subsets in linktype declaration s

	160 # also link attribute specification lists in link declaratio ns

	161 self.error("unsupported '[' char in %s declaration" % declty pe)

	162 else:

	163 self.error("unexpected '[' char in declaration")

	164 else:

	165 self.error(

	166 "unexpected %r char in declaration" % rawdata[j])

	167 if j < 0:

	168 return j

	169 return -1 # incomplete

	170

	171 # Internal -- parse a marked section

	172 # Override this to handle MS-word extension syntax <![if word]>content<![end if]>

	173 def parse_marked_section(self, i, report=1):

	174 rawdata= self.rawdata

	175 assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section ()"

	176 sectName, j = self._scan_name( i+3, i )

	177 if j < 0:

	178 return j

	179 if sectName in set(["temp", "cdata", "ignore", "include", "rcdata"]):

	180 # look for standard ]]> ending

	181 match= _markedsectionclose.search(rawdata, i+3)

	182 elif sectName in set(["if", "else", "endif"]):

	183 # look for MS Office ]> ending

	184 match= _msmarkedsectionclose.search(rawdata, i+3)

	185 else:

	186 self.error('unknown status keyword %r in marked section' % rawdata[i +3:j])

	187 if not match:

	188 return -1

	189 if report:

	190 j = match.start(0)

	191 self.unknown_decl(rawdata[i+3: j])

	192 return match.end(0)

	193

	194 # Internal -- parse comment, return length or -1 if not terminated

	195 def parse_comment(self, i, report=1):

	196 rawdata = self.rawdata

	197 if rawdata[i:i+4] != '<!--':

	198 self.error('unexpected call to parse_comment()')

	199 match = _commentclose.search(rawdata, i+4)

	200 if not match:

	201 return -1

	202 if report:

	203 j = match.start(0)

	204 self.handle_comment(rawdata[i+4: j])

	205 return match.end(0)

	206

	207 # Internal -- scan past the internal subset in a <!DOCTYPE declaration,

	208 # returning the index just past any whitespace following the trailing ']'.

	209 def _parse_doctype_subset(self, i, declstartpos):

	210 rawdata = self.rawdata

	211 n = len(rawdata)

	212 j = i

	213 while j < n:

	214 c = rawdata[j]

	215 if c == "<":

	216 s = rawdata[j:j+2]

	217 if s == "<":

	218 # end of buffer; incomplete

	219 return -1

	220 if s != "<!":

	221 self.updatepos(declstartpos, j + 1)

	222 self.error("unexpected char in internal subset (in %r)" % s)

	223 if (j + 2) == n:

	224 # end of buffer; incomplete

	225 return -1

	226 if (j + 4) > n:

	227 # end of buffer; incomplete

	228 return -1

	229 if rawdata[j:j+4] == "<!--":

	230 j = self.parse_comment(j, report=0)

	231 if j < 0:

	232 return j

	233 continue

	234 name, j = self._scan_name(j + 2, declstartpos)

	235 if j == -1:

	236 return -1

	237 if name not in set(["attlist", "element", "entity", "notation"]) :

	238 self.updatepos(declstartpos, j + 2)

	239 self.error(

	240 "unknown declaration %r in internal subset" % name)

	241 # handle the individual names

	242 meth = getattr(self, "_parse_doctype_" + name)

	243 j = meth(j, declstartpos)

	244 if j < 0:

	245 return j

	246 elif c == "%":

	247 # parameter entity reference

	248 if (j + 1) == n:

	249 # end of buffer; incomplete

	250 return -1

	251 s, j = self._scan_name(j + 1, declstartpos)

	252 if j < 0:

	253 return j

	254 if rawdata[j] == ";":

	255 j = j + 1

	256 elif c == "]":

	257 j = j + 1

	258 while j < n and rawdata[j].isspace():

	259 j = j + 1

	260 if j < n:

	261 if rawdata[j] == ">":

	262 return j

	263 self.updatepos(declstartpos, j)

	264 self.error("unexpected char after internal subset")

	265 else:

	266 return -1

	267 elif c.isspace():

	268 j = j + 1

	269 else:

	270 self.updatepos(declstartpos, j)

	271 self.error("unexpected char %r in internal subset" % c)

	272 # end of buffer reached

	273 return -1

	274

	275 # Internal -- scan past <!ELEMENT declarations

	276 def _parse_doctype_element(self, i, declstartpos):

	277 name, j = self._scan_name(i, declstartpos)

	278 if j == -1:

	279 return -1

	280 # style content model; just skip until '>'

	281 rawdata = self.rawdata

	282 if '>' in rawdata[j:]:

	283 return rawdata.find(">", j) + 1

	284 return -1

	285

	286 # Internal -- scan past <!ATTLIST declarations

	287 def _parse_doctype_attlist(self, i, declstartpos):

	288 rawdata = self.rawdata

	289 name, j = self._scan_name(i, declstartpos)

	290 c = rawdata[j:j+1]

	291 if c == "":

	292 return -1

	293 if c == ">":

	294 return j + 1

	295 while 1:

	296 # scan a series of attribute descriptions; simplified:

	297 # name type [value] [#constraint]

	298 name, j = self._scan_name(j, declstartpos)

	299 if j < 0:

	300 return j

	301 c = rawdata[j:j+1]

	302 if c == "":

	303 return -1

	304 if c == "(":

	305 # an enumerated type; look for ')'

	306 if ")" in rawdata[j:]:

	307 j = rawdata.find(")", j) + 1

	308 else:

	309 return -1

	310 while rawdata[j:j+1].isspace():

	311 j = j + 1

	312 if not rawdata[j:]:

	313 # end of buffer, incomplete

	314 return -1

	315 else:

	316 name, j = self._scan_name(j, declstartpos)

	317 c = rawdata[j:j+1]

	318 if not c:

	319 return -1

	320 if c in "'\"":

	321 m = _declstringlit_match(rawdata, j)

	322 if m:

	323 j = m.end()

	324 else:

	325 return -1

	326 c = rawdata[j:j+1]

	327 if not c:

	328 return -1

	329 if c == "#":

	330 if rawdata[j:] == "#":

	331 # end of buffer

	332 return -1

	333 name, j = self._scan_name(j + 1, declstartpos)

	334 if j < 0:

	335 return j

	336 c = rawdata[j:j+1]

	337 if not c:

	338 return -1

	339 if c == '>':

	340 # all done

	341 return j + 1

	342

	343 # Internal -- scan past <!NOTATION declarations

	344 def _parse_doctype_notation(self, i, declstartpos):

	345 name, j = self._scan_name(i, declstartpos)

	346 if j < 0:

	347 return j

	348 rawdata = self.rawdata

	349 while 1:

	350 c = rawdata[j:j+1]

	351 if not c:

	352 # end of buffer; incomplete

	353 return -1

	354 if c == '>':

	355 return j + 1

	356 if c in "'\"":

	357 m = _declstringlit_match(rawdata, j)

	358 if not m:

	359 return -1

	360 j = m.end()

	361 else:

	362 name, j = self._scan_name(j, declstartpos)

	363 if j < 0:

	364 return j

	365

	366 # Internal -- scan past <!ENTITY declarations

	367 def _parse_doctype_entity(self, i, declstartpos):

	368 rawdata = self.rawdata

	369 if rawdata[i:i+1] == "%":

	370 j = i + 1

	371 while 1:

	372 c = rawdata[j:j+1]

	373 if not c:

	374 return -1

	375 if c.isspace():

	376 j = j + 1

	377 else:

	378 break

	379 else:

	380 j = i

	381 name, j = self._scan_name(j, declstartpos)

	382 if j < 0:

	383 return j

	384 while 1:

	385 c = self.rawdata[j:j+1]

	386 if not c:

	387 return -1

	388 if c in "'\"":

	389 m = _declstringlit_match(rawdata, j)

	390 if m:

	391 j = m.end()

	392 else:

	393 return -1 # incomplete

	394 elif c == ">":

	395 return j + 1

	396 else:

	397 name, j = self._scan_name(j, declstartpos)

	398 if j < 0:

	399 return j

	400

	401 # Internal -- scan a name token and the new position and the token, or

	402 # return -1 if we've reached the end of the buffer.

	403 def _scan_name(self, i, declstartpos):

	404 rawdata = self.rawdata

	405 n = len(rawdata)

	406 if i == n:

	407 return None, -1

	408 m = _declname_match(rawdata, i)

	409 if m:

	410 s = m.group()

	411 name = s.strip()

	412 if (i + len(s)) == n:

	413 return None, -1 # end of buffer

	414 return name.lower(), m.end()

	415 else:

	416 self.updatepos(declstartpos, i)

	417 self.error("expected name token at %r"

	418 % rawdata[declstartpos:declstartpos+20])

	419

	420 # To be overridden -- handlers for unknown objects

	421 def unknown_decl(self, data):

	422 pass

OLD	NEW