Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(44)

Side by Side Diff: chrome/common/extensions/docs/server2/intro_data_source.py

Issue 10810047: Extensions Docs Server: HTML parser in IDS (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
OLDNEW
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be 2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file. 3 # found in the LICENSE file.
4 4
5 import re 5 from HTMLParser import HTMLParser
6
6 from path_utils import FormatKey 7 from path_utils import FormatKey
7 from third_party.handlebar import Handlebar 8 from third_party.handlebar import Handlebar
8 9
10 class _IntroParser(HTMLParser):
11 """ An HTML parser which will parse table of contents and page title info out
12 of an intro.
13 """
14 def init(self):
15 """ This method is needed because HTMLParser is an old style class that does
16 not inherit from |object|, so the super constructor cannot be called in
17 |__init__|.
18 """
not at google - send to devlin 2012/07/23 12:47:36 I think the pattern is like def __init__(self):
cduvall 2012/07/23 20:24:21 Done.
19 self.toc = []
20 self.page_title = ''
not at google - send to devlin 2012/07/23 12:47:36 None not empty string? We may want to test for the
cduvall 2012/07/23 20:24:21 I looked all throughout the internet for this, and
21 self._recent_tag = None
22 self._current = {}
not at google - send to devlin 2012/07/23 12:47:36 current_what?
cduvall 2012/07/23 20:24:21 Done.
23
24 def handle_starttag(self, tag, attrs):
25 id_ = ''
26 self._recent_tag = tag
27 for attr in attrs:
28 if attr[0] == 'id':
29 id_ = attr[1]
30 if tag == 'h2':
31 self._current = { 'link': id_, 'subheadings': [] }
32 self.toc.append(self._current)
33 elif tag == 'h3':
34 self._current = { 'link': id_ }
35 self.toc[-1]['subheadings'].append(self._current)
36
37 def handle_data(self, data):
38 if data.isspace():
39 return
not at google - send to devlin 2012/07/23 12:47:36 why is this needed?
cduvall 2012/07/23 20:24:21 Not needed anymore.
40 if self._recent_tag == 'h1':
41 self.page_title = data
42 elif self._recent_tag in ['h2', 'h3']:
43 self._current['title'] = data
not at google - send to devlin 2012/07/23 12:47:36 Note that this won't handle cases like <h2>This h
cduvall 2012/07/23 20:24:21 I ended up not using a stack, but the new version
not at google - send to devlin 2012/07/23 23:30:41 sgtm. I realise I started micro-managing you a bit
cduvall 2012/07/23 23:58:33 Np, I would much rather learn to do it the right w
44
9 class IntroDataSource(object): 45 class IntroDataSource(object):
10 """This class fetches the intros for a given API. From this intro, a table 46 """This class fetches the intros for a given API. From this intro, a table
11 of contents dictionary is created, which contains the headings in the intro. 47 of contents dictionary is created, which contains the headings in the intro.
12 """ 48 """
13 def __init__(self, cache_builder, base_paths): 49 def __init__(self, cache_builder, base_paths):
14 self._cache = cache_builder.build(self._MakeIntroDict) 50 self._cache = cache_builder.build(self._MakeIntroDict)
15 self._base_paths = base_paths 51 self._base_paths = base_paths
16 52
17 def _MakeIntroDict(self, intro): 53 def _MakeIntroDict(self, intro):
18 h1s = re.findall('<h1.*>(.+)</h1>', intro) 54 parser = _IntroParser()
19 if len(h1s) > 0: 55 parser.init()
20 page_title = h1s[0] 56 parser.feed(intro)
21 else: 57 return {
22 page_title = '' 58 'intro': Handlebar(intro),
23 headings = re.findall('<h([23]) id\="(.+)">(.+)</h[23]>', intro) 59 'toc': parser.toc,
24 toc = [] 60 'title': parser.page_title
25 for heading in headings: 61 }
26 level, link, title = heading
27 if level == '2':
28 toc.append({ 'link': link, 'title': title, 'subheadings': [] })
29 else:
30 toc[-1]['subheadings'].append({ 'link': link, 'title': title })
31 return { 'intro': Handlebar(intro), 'toc': toc , 'title': page_title }
32 62
33 def __getitem__(self, key): 63 def __getitem__(self, key):
34 return self.get(key) 64 return self.get(key)
35 65
36 def get(self, key): 66 def get(self, key):
37 real_path = FormatKey(key) 67 real_path = FormatKey(key)
38 for base_path in self._base_paths: 68 for base_path in self._base_paths:
39 try: 69 try:
40 return self._cache.GetFromFile(base_path + '/' + real_path) 70 return self._cache.GetFromFile(base_path + '/' + real_path)
41 except Exception: 71 except Exception as e:
42 pass 72 pass
43 return None 73 return None
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698