Index: chrome/common/extensions/docs/server2/intro_data_source.py |
diff --git a/chrome/common/extensions/docs/server2/intro_data_source.py b/chrome/common/extensions/docs/server2/intro_data_source.py |
index 1f42eec755152b71a995d32bf7f572abda963f82..68bddeca7d49c16e58e98797e5a05d34437cb37f 100644 |
--- a/chrome/common/extensions/docs/server2/intro_data_source.py |
+++ b/chrome/common/extensions/docs/server2/intro_data_source.py |
@@ -2,10 +2,46 @@ |
# Use of this source code is governed by a BSD-style license that can be |
# found in the LICENSE file. |
-import re |
+from HTMLParser import HTMLParser |
+ |
from path_utils import FormatKey |
from third_party.handlebar import Handlebar |
+class _IntroParser(HTMLParser): |
+ """ An HTML parser which will parse table of contents and page title info out |
+ of an intro. |
+ """ |
+ def init(self): |
+ """ This method is needed because HTMLParser is an old style class that does |
+ not inherit from |object|, so the super constructor cannot be called in |
+ |__init__|. |
+ """ |
not at google - send to devlin
2012/07/23 12:47:36
I think the pattern is like
def __init__(self):
cduvall
2012/07/23 20:24:21
Done.
|
+ self.toc = [] |
+ self.page_title = '' |
not at google - send to devlin
2012/07/23 12:47:36
None not empty string? We may want to test for the
cduvall
2012/07/23 20:24:21
I looked all throughout the internet for this, and
|
+ self._recent_tag = None |
+ self._current = {} |
not at google - send to devlin
2012/07/23 12:47:36
current_what?
cduvall
2012/07/23 20:24:21
Done.
|
+ |
+ def handle_starttag(self, tag, attrs): |
+ id_ = '' |
+ self._recent_tag = tag |
+ for attr in attrs: |
+ if attr[0] == 'id': |
+ id_ = attr[1] |
+ if tag == 'h2': |
+ self._current = { 'link': id_, 'subheadings': [] } |
+ self.toc.append(self._current) |
+ elif tag == 'h3': |
+ self._current = { 'link': id_ } |
+ self.toc[-1]['subheadings'].append(self._current) |
+ |
+ def handle_data(self, data): |
+ if data.isspace(): |
+ return |
not at google - send to devlin
2012/07/23 12:47:36
why is this needed?
cduvall
2012/07/23 20:24:21
Not needed anymore.
|
+ if self._recent_tag == 'h1': |
+ self.page_title = data |
+ elif self._recent_tag in ['h2', 'h3']: |
+ self._current['title'] = data |
not at google - send to devlin
2012/07/23 12:47:36
Note that this won't handle cases like
<h2>This h
cduvall
2012/07/23 20:24:21
I ended up not using a stack, but the new version
not at google - send to devlin
2012/07/23 23:30:41
sgtm. I realise I started micro-managing you a bit
cduvall
2012/07/23 23:58:33
Np, I would much rather learn to do it the right w
|
+ |
class IntroDataSource(object): |
"""This class fetches the intros for a given API. From this intro, a table |
of contents dictionary is created, which contains the headings in the intro. |
@@ -15,20 +51,14 @@ class IntroDataSource(object): |
self._base_paths = base_paths |
def _MakeIntroDict(self, intro): |
- h1s = re.findall('<h1.*>(.+)</h1>', intro) |
- if len(h1s) > 0: |
- page_title = h1s[0] |
- else: |
- page_title = '' |
- headings = re.findall('<h([23]) id\="(.+)">(.+)</h[23]>', intro) |
- toc = [] |
- for heading in headings: |
- level, link, title = heading |
- if level == '2': |
- toc.append({ 'link': link, 'title': title, 'subheadings': [] }) |
- else: |
- toc[-1]['subheadings'].append({ 'link': link, 'title': title }) |
- return { 'intro': Handlebar(intro), 'toc': toc , 'title': page_title } |
+ parser = _IntroParser() |
+ parser.init() |
+ parser.feed(intro) |
+ return { |
+ 'intro': Handlebar(intro), |
+ 'toc': parser.toc, |
+ 'title': parser.page_title |
+ } |
def __getitem__(self, key): |
return self.get(key) |
@@ -38,6 +68,6 @@ class IntroDataSource(object): |
for base_path in self._base_paths: |
try: |
return self._cache.GetFromFile(base_path + '/' + real_path) |
- except Exception: |
+ except Exception as e: |
pass |
return None |