OLD | NEW |
(Empty) | |
| 1 """Shared support for scanning document type declarations in HTML and XHTML. |
| 2 |
| 3 Backported for python-future from Python 3.3. Reason: ParserBase is an |
| 4 old-style class in the Python 2.7 source of markupbase.py, which I suspect |
| 5 might be the cause of sporadic unit-test failures on travis-ci.org with |
| 6 test_htmlparser.py. The test failures look like this: |
| 7 |
| 8 ====================================================================== |
| 9 |
| 10 ERROR: test_attr_entity_replacement (future.tests.test_htmlparser.AttributesStri
ctTestCase) |
| 11 |
| 12 ---------------------------------------------------------------------- |
| 13 |
| 14 Traceback (most recent call last): |
| 15 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparse
r.py", line 661, in test_attr_entity_replacement |
| 16 [("starttag", "a", [("b", "&><\"'")])]) |
| 17 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparse
r.py", line 93, in _run_check |
| 18 collector = self.get_collector() |
| 19 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparse
r.py", line 617, in get_collector |
| 20 return EventCollector(strict=True) |
| 21 File "/home/travis/build/edschofield/python-future/future/tests/test_htmlparse
r.py", line 27, in __init__ |
| 22 html.parser.HTMLParser.__init__(self, *args, **kw) |
| 23 File "/home/travis/build/edschofield/python-future/future/backports/html/parse
r.py", line 135, in __init__ |
| 24 self.reset() |
| 25 File "/home/travis/build/edschofield/python-future/future/backports/html/parse
r.py", line 143, in reset |
| 26 _markupbase.ParserBase.reset(self) |
| 27 |
| 28 TypeError: unbound method reset() must be called with ParserBase instance as fir
st argument (got EventCollector instance instead) |
| 29 |
| 30 This module is used as a foundation for the html.parser module. It has no |
| 31 documented public API and should not be used directly. |
| 32 |
| 33 """ |
| 34 |
| 35 import re |
| 36 |
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

_msmarkedsectionclose = re.compile(r']\s*>')

del re

# Keyword tables, built once at import time instead of on every call.
# Names returned by ParserBase._scan_name() are already lower-cased.
_STANDARD_SECTION_KEYWORDS = frozenset(
    ["temp", "cdata", "ignore", "include", "rcdata"])
_MSOFFICE_SECTION_KEYWORDS = frozenset(["if", "else", "endif"])
_BRACKET_UNSUPPORTED_DECLS = frozenset(
    ["attlist", "linktype", "link", "element"])
_INTERNAL_SUBSET_DECLS = frozenset(
    ["attlist", "element", "entity", "notation"])
_ASCII_LETTERS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"


class ParserBase(object):
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods below read ``self.rawdata`` (the text being scanned) and call
    ``self.handle_decl()`` and ``self.handle_comment()``; none of these are
    defined here, so subclasses have to supply them.

    Scanning methods return the index just past the construct they consumed,
    or a negative number when the buffer ends before the construct does.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Set the tracked position back to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." construct starting at rawdata[i].

        Comments and marked sections are delegated to parse_comment() and
        parse_marked_section().  For any other declaration the text between
        "<!" and ">" is passed to handle_decl() when the declaration name is
        "doctype", and to unknown_decl() otherwise.

        Returns the index just past the closing ">", or a negative number if
        the declaration is incomplete.  A negative number is also returned if
        error() was called and returned instead of raising.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+2] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.  Two characters are inspected so
            # that a complete "--" still reaches the comment branch below.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the
            # marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web"
            # function to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in _ASCII_LETTERS:
                _, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in _BRACKET_UNSUPPORTED_DECLS:
                    # must tolerate []'d groups in a content model in an
                    # element declaration
                    # also in data attribute specifications of attlist
                    # declaration
                    # also link type declaration subsets in linktype
                    # declarations
                    # also link attribute specification lists in link
                    # declarations
                    self.error(
                        "unsupported '[' char in %s declaration" % decltype)
                    # error() returned: j did not move, so stop here rather
                    # than loop forever on the same character.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1  # see above
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                return -1  # see above
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax
    # <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the "<![" marked section starting at rawdata[i].

        When *report* is true, the text between "<![" and the closing
        delimiter is passed to unknown_decl().  Returns the index just past
        the closing delimiter, or a negative number if the section is
        incomplete or its status keyword was rejected by a non-raising
        error().
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', \
            "unexpected call to parse_marked_section()"
        sect_name, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sect_name in _STANDARD_SECTION_KEYWORDS:
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sect_name in _MSOFFICE_SECTION_KEYWORDS:
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section'
                       % rawdata[i+3:j])
            # error() returned: there is no closing pattern to search for.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the "<!--" comment starting at rawdata[i].

        When *report* is true, the comment body is passed to
        handle_comment().  Returns the index just past the closing "-->", or
        -1 if the comment is not terminated within the buffer.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset whose content starts at i.

        *declstartpos* is the index of the enclosing declaration; it is only
        used to update the reported position before error() is called.
        Returns the index of the ">" that follows the closing "]", or -1 if
        the subset is incomplete or error() returned without raising.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error(
                        "unexpected char in internal subset (in %r)" % s)
                    return -1  # error() returned; cannot continue
                if (j + 2) == n:
                    # end of buffer; incomplete
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in _INTERNAL_SUBSET_DECLS:
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # error() returned; there is no handler for this name.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                    return -1  # error() returned; cannot continue
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # error() returned: j did not move, so stop here rather
                # than loop forever on the same character.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        _, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            _, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or a negative number if
        incomplete.
        """
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading "%" followed by whitespace is skipped before the name.
        Returns the index just past the closing ">", or a negative number if
        incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        _, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the lower-cased token and the
    # position just past it (including trailing whitespace), or (None, -1)
    # if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan the name token at rawdata[i].

        Returns ``(name, end)`` where *name* is lower-cased and *end* is the
        index after the name and any whitespace following it.  Returns
        ``(None, -1)`` when the token runs up to the end of the buffer (it
        may be truncated), and also when no name is present and error()
        returned instead of raising.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])
            # Callers unpack a 2-tuple, so never fall through to None.
            return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Hook for declarations with no dedicated handler; does nothing."""
        pass
OLD | NEW |