OLD | NEW |
---|---|
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 import re | 5 from HTMLParser import HTMLParser |
6 | |
6 from path_utils import FormatKey | 7 from path_utils import FormatKey |
7 from third_party.handlebar import Handlebar | 8 from third_party.handlebar import Handlebar |
8 | 9 |
10 class _IntroParser(HTMLParser): | |
11 """ An HTML parser which will parse table of contents and page title info out | |
12 of an intro. | |
13 """ | |
14 def init(self): | |
15 """ This method is needed because HTMLParser is an old style class that does | |
16 not inherit from |object|, so the super constructor cannot be called in | |
17 |__init__|. | |
18 """ | |
not at google - send to devlin
2012/07/23 12:47:36
I think the pattern is like
def __init__(self):
cduvall
2012/07/23 20:24:21
Done.
| |
19 self.toc = [] | |
20 self.page_title = '' | |
not at google - send to devlin
2012/07/23 12:47:36
None not empty string? We may want to test for the
cduvall
2012/07/23 20:24:21
I looked all throughout the internet for this, and
| |
21 self._recent_tag = None | |
22 self._current = {} | |
not at google - send to devlin
2012/07/23 12:47:36
current_what?
cduvall
2012/07/23 20:24:21
Done.
| |
23 | |
24 def handle_starttag(self, tag, attrs): | |
25 id_ = '' | |
26 self._recent_tag = tag | |
27 for attr in attrs: | |
28 if attr[0] == 'id': | |
29 id_ = attr[1] | |
30 if tag == 'h2': | |
31 self._current = { 'link': id_, 'subheadings': [] } | |
32 self.toc.append(self._current) | |
33 elif tag == 'h3': | |
34 self._current = { 'link': id_ } | |
35 self.toc[-1]['subheadings'].append(self._current) | |
36 | |
37 def handle_data(self, data): | |
38 if data.isspace(): | |
39 return | |
not at google - send to devlin
2012/07/23 12:47:36
why is this needed?
cduvall
2012/07/23 20:24:21
Not needed anymore.
| |
40 if self._recent_tag == 'h1': | |
41 self.page_title = data | |
42 elif self._recent_tag in ['h2', 'h3']: | |
43 self._current['title'] = data | |
not at google - send to devlin
2012/07/23 12:47:36
Note that this won't handle cases like
<h2>This h
cduvall
2012/07/23 20:24:21
I ended up not using a stack, but the new version
not at google - send to devlin
2012/07/23 23:30:41
sgtm. I realise I started micro-managing you a bit
cduvall
2012/07/23 23:58:33
Np, I would much rather learn to do it the right way
| |
44 | |
9 class IntroDataSource(object): | 45 class IntroDataSource(object): |
10 """This class fetches the intros for a given API. From this intro, a table | 46 """This class fetches the intros for a given API. From this intro, a table |
11 of contents dictionary is created, which contains the headings in the intro. | 47 of contents dictionary is created, which contains the headings in the intro. |
12 """ | 48 """ |
13 def __init__(self, cache_builder, base_paths): | 49 def __init__(self, cache_builder, base_paths): |
14 self._cache = cache_builder.build(self._MakeIntroDict) | 50 self._cache = cache_builder.build(self._MakeIntroDict) |
15 self._base_paths = base_paths | 51 self._base_paths = base_paths |
16 | 52 |
17 def _MakeIntroDict(self, intro): | 53 def _MakeIntroDict(self, intro): |
18 h1s = re.findall('<h1.*>(.+)</h1>', intro) | 54 parser = _IntroParser() |
19 if len(h1s) > 0: | 55 parser.init() |
20 page_title = h1s[0] | 56 parser.feed(intro) |
21 else: | 57 return { |
22 page_title = '' | 58 'intro': Handlebar(intro), |
23 headings = re.findall('<h([23]) id\="(.+)">(.+)</h[23]>', intro) | 59 'toc': parser.toc, |
24 toc = [] | 60 'title': parser.page_title |
25 for heading in headings: | 61 } |
26 level, link, title = heading | |
27 if level == '2': | |
28 toc.append({ 'link': link, 'title': title, 'subheadings': [] }) | |
29 else: | |
30 toc[-1]['subheadings'].append({ 'link': link, 'title': title }) | |
31 return { 'intro': Handlebar(intro), 'toc': toc , 'title': page_title } | |
32 | 62 |
33 def __getitem__(self, key): | 63 def __getitem__(self, key): |
34 return self.get(key) | 64 return self.get(key) |
35 | 65 |
36 def get(self, key): | 66 def get(self, key): |
37 real_path = FormatKey(key) | 67 real_path = FormatKey(key) |
38 for base_path in self._base_paths: | 68 for base_path in self._base_paths: |
39 try: | 69 try: |
40 return self._cache.GetFromFile(base_path + '/' + real_path) | 70 return self._cache.GetFromFile(base_path + '/' + real_path) |
41 except Exception: | 71 except Exception as e: |
42 pass | 72 pass |
43 return None | 73 return None |
OLD | NEW |