| OLD | NEW |
| (Empty) | |
| 1 """A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser. |
| 2 |
| 3 Examples |
| 4 |
| 5 This program extracts all links from a document. It will print one |
| 6 line for each link, containing the URL and the textual description |
| 7 between the <A>...</A> tags: |
| 8 |
| 9 import pullparser, sys |
| 10 f = file(sys.argv[1]) |
| 11 p = pullparser.PullParser(f) |
| 12 for token in p.tags("a"): |
| 13 if token.type == "endtag": continue |
| 14 url = dict(token.attrs).get("href", "-") |
| 15 text = p.get_compressed_text(endat=("endtag", "a")) |
| 16 print "%s\t%s" % (url, text) |
| 17 |
| 18 This program extracts the <TITLE> from the document: |
| 19 |
| 20 import pullparser, sys |
| 21 f = file(sys.argv[1]) |
| 22 p = pullparser.PullParser(f) |
| 23 if p.get_tag("title"): |
| 24 title = p.get_compressed_text() |
| 25 print "Title: %s" % title |
| 26 |
| 27 |
| 28 Copyright 2003-2006 John J. Lee <jjl@pobox.com> |
| 29 Copyright 1998-2001 Gisle Aas (original libwww-perl code) |
| 30 |
| 31 This code is free software; you can redistribute it and/or modify it |
| 32 under the terms of the BSD or ZPL 2.1 licenses. |
| 33 |
| 34 """ |
| 35 |
| 36 import re, htmlentitydefs |
| 37 import _sgmllib_copy as sgmllib |
| 38 import HTMLParser |
| 39 from xml.sax import saxutils |
| 40 |
| 41 from _html import unescape, unescape_charref |
| 42 |
| 43 |
class NoMoreTokensError(Exception):
    """Raised when the parser's token stream has been exhausted."""
| 45 |
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        # Tuple-style unpacking: type, data, attrs = token
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        # Compare against any 3-sequence (e.g. a plain tuple).
        type, data, attrs = other
        return (self.type == type and
                self.data == data and
                self.attrs == attrs)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%r, %r, %r)" % (
            self.__class__.__name__, self.type, self.data, self.attrs)

    def __str__(self):
        """
        >>> print Token("starttag", "br")
        <br>
        >>> print Token("starttag", "a",
        ...     [("href", "http://www.python.org/"), ("alt", '"foo"')])
        <a href="http://www.python.org/" alt='"foo"'>
        >>> print Token("startendtag", "br")
        <br />
        >>> print Token("startendtag", "br", [("spam", "eggs")])
        <br spam="eggs" />
        >>> print Token("endtag", "p")
        </p>
        >>> print Token("charref", "38")
        &#38;
        >>> print Token("entityref", "amp")
        &amp;
        >>> print Token("data", "foo\\nbar")
        foo
        bar
        >>> print Token("comment", "Life is a bowl\\nof cherries.")
        <!--Life is a bowl
        of cherries.-->
        >>> print Token("decl", "decl")
        <!decl>
        >>> print Token("pi", "pi")
        <?pi>
        """
        if self.attrs is None:
            attrs = ""
        else:
            # quoteattr chooses appropriate quoting for attribute values.
            attrs = "".join([" %s=%s" % (key, saxutils.quoteattr(value))
                             for key, value in self.attrs])
        # Map token type to its serialized form; only the tag types use
        # the attribute string.
        templates = {
            "starttag": "<%(data)s%(attrs)s>",
            "startendtag": "<%(data)s%(attrs)s />",
            "endtag": "</%(data)s>",
            "charref": "&#%(data)s;",
            "entityref": "&%(data)s;",
            "data": "%(data)s",
            "comment": "<!--%(data)s-->",
            "decl": "<!%(data)s>",
            "pi": "<?%(data)s>",
            }
        assert self.type in templates
        return templates[self.type] % {"data": self.data, "attrs": attrs}
| 142 |
| 143 |
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield fn(*args, **kwds) repeatedly until it raises `exception`.

    The exception is swallowed and simply terminates the iteration.

    Note: we must end the generator with ``return`` rather than
    ``raise StopIteration`` -- under PEP 479 (Python 3.7+) a StopIteration
    raised inside a generator body is converted to RuntimeError.
    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            return
| 150 |
| 151 |
class _AbstractParser:
    """Shared pull-parsing machinery for PullParser and TolerantPullParser.

    The mixed-in HTML parser class calls the handle_* methods below, which
    push Token objects onto an internal FIFO; the public get_* methods pop
    tokens off, feeding more data from the underlying file as required.
    """
    # Number of bytes read from fh each time the token stack runs dry.
    chunk = 1024
    # Used by get_compressed_text() to collapse runs of whitespace.
    compress_re = re.compile(r"\s+")
    # Default textify mapping.  A fresh copy is taken per instance in
    # __init__ so that mutating one parser's public .textify attribute
    # cannot leak into the default seen by later instances (this used to
    # be a shared mutable default argument).
    _DEFAULT_TEXTIFY = {"img": "alt", "applet": "alt"}

    def __init__(self, fh, textify=None, encoding="ascii", entitydefs=None):
        """
        fh: file-like object (only a .read() method is required) from which to
        read HTML to be parsed
        textify: mapping used by .get_text() and .get_compressed_text() methods
        to represent opening tags as text; defaults to
        {"img": "alt", "applet": "alt"}
        encoding: encoding used to encode numeric character references by
        .get_text() and .get_compressed_text() ("ascii" by default)

        entitydefs: mapping like {"amp": "&", ...} containing HTML entity
        definitions (a sensible default is used). This is used to unescape
        entities in .get_text() (and .get_compressed_text()) and attribute
        values. If the encoding can not represent the character, the entity
        reference is left unescaped. Note that entity references (both
        numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
        unescaped in attribute values and the return value of .get_text(), but
        not in data outside of tags. Instead, entity references outside of
        tags are represented as tokens. This is a bit odd, it's true :-/

        If the element name of an opening tag matches a key in the textify
        mapping then that tag is converted to text. The corresponding value is
        used to specify which tag attribute to obtain the text from. textify
        maps from element names to either:

        - an HTML attribute name, in which case the HTML attribute value is
          used as its text value along with the element name in square
          brackets (e.g. "alt text goes here[IMG]", or, if the alt attribute
          were missing, just "[IMG]")
        - a callable object (e.g. a function) which takes a Token and returns
          the string to be used as its text value

        If textify has no key for an element name, nothing is substituted for
        the opening tag.

        Public attributes:

        encoding and textify: see above

        """
        self._fh = fh
        self._tokenstack = []  # FIFO of parsed-but-unconsumed Tokens
        if textify is None:
            # Copy so instances never share (and never mutate) the default.
            textify = self._DEFAULT_TEXTIFY.copy()
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._entitydefs = entitydefs

    def __iter__(self): return self

    def tags(self, *names):
        """Iterate over tag Tokens, optionally restricted to given names."""
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        """Iterate over Tokens, optionally restricted to given token types."""
        return iter_until_exception(self.get_token, NoMoreTokensError,
                                    *tokentypes)

    def next(self):
        # Python 2 iterator protocol: translate token exhaustion into
        # end-of-iteration.
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens representing other elements will be
        skipped.  Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                # No tokentypes given means "any token type".
                if not tokentypes or token.type in tokentypes:
                    return token
            # Stack exhausted: feed the parser another chunk of input.
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which
        the caller is interested: tags representing other elements will be
        skipped.  Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ("starttag", "endtag", "startendtag"):
                continue
            if not names or tok.data in names:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
        returned text); endtag is a tuple (type, name) where type is
        "starttag", "endtag" or "startendtag", and name is the element name of
        the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is
        raised).  Note that .get_text() includes the text representation (if
        any) of the opening tag, but pushes the opening tag back onto the
        stack.  As a result, if you want to call .get_text() again, you need
        to call .get_tag() first (unless you want an empty string returned
        when you next call .get_text()).

        Entity references are translated using the value of the entitydefs
        constructor argument (a mapping from names to characters like that
        provided by the standard module htmlentitydefs).  Named entity
        references that are not in this mapping are left unchanged.

        The textify attribute is used to translate opening tags into text:
        see the class docstring.

        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get)
                if tok: self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data, self._entitydefs,
                             self.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, self.encoding)
                text.append(t)
            elif tok.type in ("starttag", "endtag", "startendtag"):
                tag_name = tok.data
                if tok.type in ("starttag", "startendtag"):
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            # Use the named attribute's value (if present),
                            # then the element name in square brackets.
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            text.append("[%s]" % tag_name.upper())
                # Stop at any tag when endat is None, else only at the
                # requested (type, name) pair; the tag is pushed back so
                # the caller can still retrieve it.
                if endat is None or endat == (tok.type, tag_name):
                    self.unget_token(tok)
                    break
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to
        a single space character, and removes all initial and trailing
        whitespace.

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)

    # Parser callbacks: each one wraps the event in a Token and appends it
    # to the FIFO for later consumption by get_token() and friends.
    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))
    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))
    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))
    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))
    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))
    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))
    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))
    def unknown_decl(self, data):
        # XXX should this call self.error instead of silently treating the
        # unknown declaration as an ordinary "decl" token?
        self._tokenstack.append(Token("decl", data))
    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        """Unescape entity/char references in a single attribute value."""
        return unescape(name, self._entitydefs, self.encoding)

    def unescape_attrs(self, attrs):
        """Return attrs with every value passed through unescape_attr()."""
        unescaped_attrs = []
        for key, val in attrs:
            unescaped_attrs.append((key, self.unescape_attr(val)))
        return unescaped_attrs
| 365 |
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    # Pull parser built on the stdlib HTMLParser.HTMLParser.
    # NOTE(review): presumably stricter about malformed markup than the
    # sgmllib-based TolerantPullParser below -- confirm against callers.
    def __init__(self, *args, **kwds):
        # Initialise the HTML parser machinery first, then the shared
        # token-stack state; both base-class __init__s must run.
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unescape(self, name):
        # Use the entitydefs passed into constructor, not
        # HTMLParser.HTMLParser's entitydefs.
        return self.unescape_attr(name)
| 374 |
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser built on sgmllib.SGMLParser."""

    def __init__(self, *args, **kwds):
        # Both base-class initialisers must run: SGMLParser sets up the
        # parsing machinery, _AbstractParser the token stack.
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unknown_starttag(self, tag, attrs):
        # SGMLParser reports unrecognised tags here; unescape attribute
        # values before stacking the token.
        self._tokenstack.append(
            Token("starttag", tag, self.unescape_attrs(attrs)))

    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
| 384 |
| 385 |
def _test():
    """Run the doctests embedded in this module's docstrings."""
    import doctest
    import _pullparser
    return doctest.testmod(_pullparser)


if __name__ == "__main__":
    _test()
| OLD | NEW |