Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(25)

Side by Side Diff: Tools/Scripts/webkitpy/thirdparty/mechanize/_pullparser.py

Issue 18418010: Check in the thirdparty libs needed for webkitpy. (Closed) Base URL: svn://svn.chromium.org/blink/trunk
Patch Set: Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 """A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
2
3 Examples
4
5 This program extracts all links from a document. It will print one
6 line for each link, containing the URL and the textual description
7 between the <A>...</A> tags:
8
9 import pullparser, sys
10 f = file(sys.argv[1])
11 p = pullparser.PullParser(f)
12 for token in p.tags("a"):
13 if token.type == "endtag": continue
14 url = dict(token.attrs).get("href", "-")
15 text = p.get_compressed_text(endat=("endtag", "a"))
16 print "%s\t%s" % (url, text)
17
18 This program extracts the <TITLE> from the document:
19
20 import pullparser, sys
21 f = file(sys.argv[1])
22 p = pullparser.PullParser(f)
23 if p.get_tag("title"):
24 title = p.get_compressed_text()
25 print "Title: %s" % title
26
27
28 Copyright 2003-2006 John J. Lee <jjl@pobox.com>
29 Copyright 1998-2001 Gisle Aas (original libwww-perl code)
30
31 This code is free software; you can redistribute it and/or modify it
32 under the terms of the BSD or ZPL 2.1 licenses.
33
34 """
35
36 import re, htmlentitydefs
37 import _sgmllib_copy as sgmllib
38 import HTMLParser
39 from xml.sax import saxutils
40
41 from _html import unescape, unescape_charref
42
43
class NoMoreTokensError(Exception):
    """Raised by token-fetching methods when the input is exhausted."""
45
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
    "data", "comment", "decl", "pi", after the corresponding methods of
    HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
    tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
    (or None if token does not represent an opening tag)

    """

    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        # Iterating a Token yields the same (type, data, attrs) triple
        # that the token compares equal to.
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        # Compare against any 3-sequence (typically a tuple).
        type, data, attrs = other
        return (self.type == type and
                self.data == data and
                self.attrs == attrs)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%r, %r, %r)" % (
            self.__class__.__name__, self.type, self.data, self.attrs)

    def __str__(self):
        """
        >>> print Token("starttag", "br")
        <br>
        >>> print Token("starttag", "a",
        ... [("href", "http://www.python.org/"), ("alt", '"foo"')])
        <a href="http://www.python.org/" alt='"foo"'>
        >>> print Token("startendtag", "br")
        <br />
        >>> print Token("startendtag", "br", [("spam", "eggs")])
        <br spam="eggs" />
        >>> print Token("endtag", "p")
        </p>
        >>> print Token("charref", "38")
        &#38;
        >>> print Token("entityref", "amp")
        &amp;
        >>> print Token("data", "foo\\nbar")
        foo
        bar
        >>> print Token("comment", "Life is a bowl\\nof cherries.")
        <!--Life is a bowl
        of cherries.-->
        >>> print Token("decl", "decl")
        <!decl>
        >>> print Token("pi", "pi")
        <?pi>
        """
        # Render attributes once; only opening tags carry them.
        if self.attrs is None:
            attrs = ""
        else:
            attrs = "".join([" %s=%s" % (name, saxutils.quoteattr(value))
                             for name, value in self.attrs])
        if self.type == "starttag":
            return "<%s%s>" % (self.data, attrs)
        if self.type == "startendtag":
            return "<%s%s />" % (self.data, attrs)
        if self.type == "data":
            return self.data
        # Remaining token types wrap .data in a fixed template.
        templates = {
            "endtag": "</%s>",
            "charref": "&#%s;",
            "entityref": "&%s;",
            "comment": "<!--%s-->",
            "decl": "<!%s>",
            "pi": "<?%s>",
        }
        if self.type in templates:
            return templates[self.type] % self.data
        assert False
142
143
def iter_until_exception(fn, exception, *args, **kwds):
    """Generate successive fn(*args, **kwds) results until `exception`.

    fn: callable invoked once per iteration
    exception: exception class (or tuple of classes) that terminates the
    iteration; the exception itself is swallowed

    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            # Plain `return`, not `raise StopIteration`: under PEP 479
            # (Python 3.7+) raising StopIteration inside a generator body
            # is converted to RuntimeError. `return` ends the generator
            # identically on all versions.
            return
150
151
152 class _AbstractParser:
153 chunk = 1024
154 compress_re = re.compile(r"\s+")
155 def __init__(self, fh, textify={"img": "alt", "applet": "alt"},
156 encoding="ascii", entitydefs=None):
157 """
158 fh: file-like object (only a .read() method is required) from which to
159 read HTML to be parsed
160 textify: mapping used by .get_text() and .get_compressed_text() methods
161 to represent opening tags as text
162 encoding: encoding used to encode numeric character references by
163 .get_text() and .get_compressed_text() ("ascii" by default)
164
165 entitydefs: mapping like {"amp": "&", ...} containing HTML entity
166 definitions (a sensible default is used). This is used to unescape
167 entities in .get_text() (and .get_compressed_text()) and attribute
168 values. If the encoding can not represent the character, the entity
169 reference is left unescaped. Note that entity references (both
170 numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
171 unescaped in attribute values and the return value of .get_text(), but
172 not in data outside of tags. Instead, entity references outside of
173 tags are represented as tokens. This is a bit odd, it's true :-/
174
175 If the element name of an opening tag matches a key in the textify
176 mapping then that tag is converted to text. The corresponding value is
177 used to specify which tag attribute to obtain the text from. textify
178 maps from element names to either:
179
180 - an HTML attribute name, in which case the HTML attribute value is
181 used as its text value along with the element name in square
182 brackets (e.g. "alt text goes here[IMG]", or, if the alt attribute
183 were missing, just "[IMG]")
184 - a callable object (e.g. a function) which takes a Token and returns
185 the string to be used as its text value
186
187 If textify has no key for an element name, nothing is substituted for
188 the opening tag.
189
190 Public attributes:
191
192 encoding and textify: see above
193
194 """
195 self._fh = fh
196 self._tokenstack = [] # FIFO
197 self.textify = textify
198 self.encoding = encoding
199 if entitydefs is None:
200 entitydefs = htmlentitydefs.name2codepoint
201 self._entitydefs = entitydefs
202
203 def __iter__(self): return self
204
205 def tags(self, *names):
206 return iter_until_exception(self.get_tag, NoMoreTokensError, *names)
207
208 def tokens(self, *tokentypes):
209 return iter_until_exception(self.get_token, NoMoreTokensError,
210 *tokentypes)
211
212 def next(self):
213 try:
214 return self.get_token()
215 except NoMoreTokensError:
216 raise StopIteration()
217
218 def get_token(self, *tokentypes):
219 """Pop the next Token object from the stack of parsed tokens.
220
221 If arguments are given, they are taken to be token types in which the
222 caller is interested: tokens representing other elements will be
223 skipped. Element names must be given in lower case.
224
225 Raises NoMoreTokensError.
226
227 """
228 while 1:
229 while self._tokenstack:
230 token = self._tokenstack.pop(0)
231 if tokentypes:
232 if token.type in tokentypes:
233 return token
234 else:
235 return token
236 data = self._fh.read(self.chunk)
237 if not data:
238 raise NoMoreTokensError()
239 self.feed(data)
240
241 def unget_token(self, token):
242 """Push a Token back onto the stack."""
243 self._tokenstack.insert(0, token)
244
245 def get_tag(self, *names):
246 """Return the next Token that represents an opening or closing tag.
247
248 If arguments are given, they are taken to be element names in which the
249 caller is interested: tags representing other elements will be skipped.
250 Element names must be given in lower case.
251
252 Raises NoMoreTokensError.
253
254 """
255 while 1:
256 tok = self.get_token()
257 if tok.type not in ["starttag", "endtag", "startendtag"]:
258 continue
259 if names:
260 if tok.data in names:
261 return tok
262 else:
263 return tok
264
265 def get_text(self, endat=None):
266 """Get some text.
267
268 endat: stop reading text at this tag (the tag is included in the
269 returned text); endtag is a tuple (type, name) where type is
270 "starttag", "endtag" or "startendtag", and name is the element name of
271 the tag (element names must be given in lower case)
272
273 If endat is not given, .get_text() will stop at the next opening or
274 closing tag, or when there are no more tokens (no exception is raised).
275 Note that .get_text() includes the text representation (if any) of the
276 opening tag, but pushes the opening tag back onto the stack. As a
277 result, if you want to call .get_text() again, you need to call
278 .get_tag() first (unless you want an empty string returned when you
279 next call .get_text()).
280
281 Entity references are translated using the value of the entitydefs
282 constructor argument (a mapping from names to characters like that
283 provided by the standard module htmlentitydefs). Named entity
284 references that are not in this mapping are left unchanged.
285
286 The textify attribute is used to translate opening tags into text: see
287 the class docstring.
288
289 """
290 text = []
291 tok = None
292 while 1:
293 try:
294 tok = self.get_token()
295 except NoMoreTokensError:
296 # unget last token (not the one we just failed to get)
297 if tok: self.unget_token(tok)
298 break
299 if tok.type == "data":
300 text.append(tok.data)
301 elif tok.type == "entityref":
302 t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding)
303 text.append(t)
304 elif tok.type == "charref":
305 t = unescape_charref(tok.data, self.encoding)
306 text.append(t)
307 elif tok.type in ["starttag", "endtag", "startendtag"]:
308 tag_name = tok.data
309 if tok.type in ["starttag", "startendtag"]:
310 alt = self.textify.get(tag_name)
311 if alt is not None:
312 if callable(alt):
313 text.append(alt(tok))
314 elif tok.attrs is not None:
315 for k, v in tok.attrs:
316 if k == alt:
317 text.append(v)
318 text.append("[%s]" % tag_name.upper())
319 if endat is None or endat == (tok.type, tag_name):
320 self.unget_token(tok)
321 break
322 return "".join(text)
323
324 def get_compressed_text(self, *args, **kwds):
325 """
326 As .get_text(), but collapses each group of contiguous whitespace to a
327 single space character, and removes all initial and trailing
328 whitespace.
329
330 """
331 text = self.get_text(*args, **kwds)
332 text = text.strip()
333 return self.compress_re.sub(" ", text)
334
335 def handle_startendtag(self, tag, attrs):
336 self._tokenstack.append(Token("startendtag", tag, attrs))
337 def handle_starttag(self, tag, attrs):
338 self._tokenstack.append(Token("starttag", tag, attrs))
339 def handle_endtag(self, tag):
340 self._tokenstack.append(Token("endtag", tag))
341 def handle_charref(self, name):
342 self._tokenstack.append(Token("charref", name))
343 def handle_entityref(self, name):
344 self._tokenstack.append(Token("entityref", name))
345 def handle_data(self, data):
346 self._tokenstack.append(Token("data", data))
347 def handle_comment(self, data):
348 self._tokenstack.append(Token("comment", data))
349 def handle_decl(self, decl):
350 self._tokenstack.append(Token("decl", decl))
351 def unknown_decl(self, data):
352 # XXX should this call self.error instead?
353 #self.error("unknown declaration: " + `data`)
354 self._tokenstack.append(Token("decl", data))
355 def handle_pi(self, data):
356 self._tokenstack.append(Token("pi", data))
357
358 def unescape_attr(self, name):
359 return unescape(name, self._entitydefs, self.encoding)
360 def unescape_attrs(self, attrs):
361 escaped_attrs = []
362 for key, val in attrs:
363 escaped_attrs.append((key, self.unescape_attr(val)))
364 return escaped_attrs
365
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser backed by the (strict) HTMLParser.HTMLParser."""

    def __init__(self, *args, **kwds):
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unescape(self, name):
        # Override HTMLParser.HTMLParser.unescape so attribute values are
        # unescaped with the entitydefs passed into our constructor, not
        # with HTMLParser.HTMLParser's own entitydefs.
        return self.unescape_attr(name)
374
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser backed by the more forgiving sgmllib.SGMLParser."""

    def __init__(self, *args, **kwds):
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unknown_starttag(self, tag, attrs):
        # SGMLParser hands us raw attribute values; unescape them before
        # queueing the token.
        self._tokenstack.append(
            Token("starttag", tag, self.unescape_attrs(attrs)))

    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
384
385
def _test():
    """Run this module's doctests and return the doctest results."""
    import doctest
    import _pullparser
    return doctest.testmod(_pullparser)


if __name__ == "__main__":
    _test()
OLDNEW
« no previous file with comments | « Tools/Scripts/webkitpy/thirdparty/mechanize/_opener.py ('k') | Tools/Scripts/webkitpy/thirdparty/mechanize/_request.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698