Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import re | 5 from HTMLParser import HTMLParser |
| 6 | |
| 6 from path_utils import FormatKey | 7 from path_utils import FormatKey |
| 7 from third_party.handlebar import Handlebar | 8 from third_party.handlebar import Handlebar |
| 8 | 9 |
| 10 class _IntroParser(HTMLParser): | |
| 11 """ An HTML parser which will parse table of contents and page title info out | |
| 12 of an intro. | |
| 13 """ | |
| 14 def init(self): | |
| 15 """ This method is needed because HTMLParser is an old style class that does | |
| 16 not inherit from |object|, so the super constructor cannot be called in | |
| 17 |__init__|. | |
| 18 """ | |
|
not at google - send to devlin
2012/07/23 12:47:36
I think the pattern is like
def __init__(self):
cduvall
2012/07/23 20:24:21
Done.
| |
| 19 self.toc = [] | |
| 20 self.page_title = '' | |
|
not at google - send to devlin
2012/07/23 12:47:36
None not empty string? We may want to test for the...
cduvall
2012/07/23 20:24:21
I looked all throughout the internet for this, and...
| |
| 21 self._recent_tag = None | |
| 22 self._current = {} | |
|
not at google - send to devlin
2012/07/23 12:47:36
current_what?
cduvall
2012/07/23 20:24:21
Done.
| |
| 23 | |
| 24 def handle_starttag(self, tag, attrs): | |
| 25 id_ = '' | |
| 26 self._recent_tag = tag | |
| 27 for attr in attrs: | |
| 28 if attr[0] == 'id': | |
| 29 id_ = attr[1] | |
| 30 if tag == 'h2': | |
| 31 self._current = { 'link': id_, 'subheadings': [] } | |
| 32 self.toc.append(self._current) | |
| 33 elif tag == 'h3': | |
| 34 self._current = { 'link': id_ } | |
| 35 self.toc[-1]['subheadings'].append(self._current) | |
| 36 | |
| 37 def handle_data(self, data): | |
| 38 if data.isspace(): | |
| 39 return | |
|
not at google - send to devlin
2012/07/23 12:47:36
why is this needed?
cduvall
2012/07/23 20:24:21
Not needed anymore.
| |
| 40 if self._recent_tag == 'h1': | |
| 41 self.page_title = data | |
| 42 elif self._recent_tag in ['h2', 'h3']: | |
| 43 self._current['title'] = data | |
|
not at google - send to devlin
2012/07/23 12:47:36
Note that this won't handle cases like
<h2>This h...
cduvall
2012/07/23 20:24:21
I ended up not using a stack, but the new version...
not at google - send to devlin
2012/07/23 23:30:41
sgtm. I realise I started micro-managing you a bit...
cduvall
2012/07/23 23:58:33
Np, I would much rather learn to do it the right w...
| |
| 44 | |
| 9 class IntroDataSource(object): | 45 class IntroDataSource(object): |
| 10 """This class fetches the intros for a given API. From this intro, a table | 46 """This class fetches the intros for a given API. From this intro, a table |
| 11 of contents dictionary is created, which contains the headings in the intro. | 47 of contents dictionary is created, which contains the headings in the intro. |
| 12 """ | 48 """ |
| 13 def __init__(self, cache_builder, base_paths): | 49 def __init__(self, cache_builder, base_paths): |
| 14 self._cache = cache_builder.build(self._MakeIntroDict) | 50 self._cache = cache_builder.build(self._MakeIntroDict) |
| 15 self._base_paths = base_paths | 51 self._base_paths = base_paths |
| 16 | 52 |
| 17 def _MakeIntroDict(self, intro): | 53 def _MakeIntroDict(self, intro): |
| 18 h1s = re.findall('<h1.*>(.+)</h1>', intro) | 54 parser = _IntroParser() |
| 19 if len(h1s) > 0: | 55 parser.init() |
| 20 page_title = h1s[0] | 56 parser.feed(intro) |
| 21 else: | 57 return { |
| 22 page_title = '' | 58 'intro': Handlebar(intro), |
| 23 headings = re.findall('<h([23]) id\="(.+)">(.+)</h[23]>', intro) | 59 'toc': parser.toc, |
| 24 toc = [] | 60 'title': parser.page_title |
| 25 for heading in headings: | 61 } |
| 26 level, link, title = heading | |
| 27 if level == '2': | |
| 28 toc.append({ 'link': link, 'title': title, 'subheadings': [] }) | |
| 29 else: | |
| 30 toc[-1]['subheadings'].append({ 'link': link, 'title': title }) | |
| 31 return { 'intro': Handlebar(intro), 'toc': toc , 'title': page_title } | |
| 32 | 62 |
| 33 def __getitem__(self, key): | 63 def __getitem__(self, key): |
| 34 return self.get(key) | 64 return self.get(key) |
| 35 | 65 |
| 36 def get(self, key): | 66 def get(self, key): |
| 37 real_path = FormatKey(key) | 67 real_path = FormatKey(key) |
| 38 for base_path in self._base_paths: | 68 for base_path in self._base_paths: |
| 39 try: | 69 try: |
| 40 return self._cache.GetFromFile(base_path + '/' + real_path) | 70 return self._cache.GetFromFile(base_path + '/' + real_path) |
| 41 except Exception: | 71 except Exception as e: |
| 42 pass | 72 pass |
| 43 return None | 73 return None |
| OLD | NEW |