chrome/common/extensions/docs/server2/intro_data_source.py - Issue 10810047: Extensions Docs Server: HTML parser in IDS

Side by Side Diff: chrome/common/extensions/docs/server2/intro_data_source.py

Issue 10810047: Extensions Docs Server: HTML parser in IDS (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 # Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 # Use of this source code is governed by a BSD-style license that can be	2 # Use of this source code is governed by a BSD-style license that can be

3 # found in the LICENSE file.	3 # found in the LICENSE file.

4	4

5 import re	5 from HTMLParser import HTMLParser

	6

6 from path_utils import FormatKey	7 from path_utils import FormatKey

7 from third_party.handlebar import Handlebar	8 from third_party.handlebar import Handlebar

8	9

	10 class _IntroParser(HTMLParser):

	11 """ An HTML parser which will parse table of contents and page title info out

	12 of an intro.

	13 """

	14 def init(self):

	15 """ This method is needed because HTMLParser is an old style class that does

	16 not inherit from \|object\|, so the super constructor cannot be called in

	17 \|__init__\|.

	18 """
	not at google - send to devlin 2012/07/23 12:47:36
	I think the pattern is like

	  def __init__(self):
	    HTMLParser.__init__(self)
	    self.toc = []
	    ...

	cduvall 2012/07/23 20:24:21
	On 2012/07/23 12:47:36, kalman wrote:
	> I think the pattern is like
	>
	> def __init__(self):
	>   HTMLParser.__init__(self)
	>   self.toc = []
	>   ...
	Done.
	19 self.toc = []

	20 self.page_title = ''
	not at google - send to devlin 2012/07/23 12:47:36
	None, not empty string? We may want to test for the existence of a title, and this would make it a bit ambiguous.

	cduvall 2012/07/23 20:24:21
	On 2012/07/23 12:47:36, kalman wrote:
	> None not empty string? We may want to test for the existence of a title, and
	> this would make it a bit ambiguous.
	I looked all throughout the internet for this, and I only found the super(Foo, self).__init__() version. Thanks.
	21 self._recent_tag = None

	22 self._current = {}
	not at google - send to devlin 2012/07/23 12:47:36
	current_what?

	cduvall 2012/07/23 20:24:21
	On 2012/07/23 12:47:36, kalman wrote:
	> current_what?
	Done.
	23

	24 def handle_starttag(self, tag, attrs):

	25 id_ = ''

	26 self._recent_tag = tag

	27 for attr in attrs:

	28 if attr[0] == 'id':

	29 id_ = attr[1]

	30 if tag == 'h2':

	31 self._current = { 'link': id_, 'subheadings': [] }

	32 self.toc.append(self._current)

	33 elif tag == 'h3':

	34 self._current = { 'link': id_ }

	35 self.toc[-1]['subheadings'].append(self._current)

	36

	37 def handle_data(self, data):

	38 if data.isspace():

	39 return
	not at google - send to devlin 2012/07/23 12:47:36
	Why is this needed?

	cduvall 2012/07/23 20:24:21
	On 2012/07/23 12:47:36, kalman wrote:
	> why is this needed?
	Not needed anymore.
	40 if self._recent_tag == 'h1':

	41 self.page_title = data

	42 elif self._recent_tag in ['h2', 'h3']:

	43 self._current['title'] = data
	not at google - send to devlin 2012/07/23 12:47:36
	Note that this won't handle cases like

	  <h2>This heading is <b>extra important</b></h2>

	You might need to maintain a "header_tag_stack" and, rather than accessing recent_tag, access the top of that.

	cduvall 2012/07/23 20:24:21
	On 2012/07/23 12:47:36, kalman wrote:
	> Note that this won't handle cases like
	>
	> <h2>This heading is <b>extra important</b></h2>
	>
	> You might need to maintain a "header_tag_stack" and rather than accessing
	> recent_tag, access the top of that.
	I ended up not using a stack, but the new version handles tags in the headings. It doesn't handle headings inside of headings, but that should never happen. I can change it to use a stack instead if you would like.

	not at google - send to devlin 2012/07/23 23:30:41
	On 2012/07/23 20:24:21, cduvall wrote:
	> On 2012/07/23 12:47:36, kalman wrote:
	> > Note that this won't handle cases like
	> >
	> > <h2>This heading is <b>extra important</b></h2>
	> >
	> > You might need to maintain a "header_tag_stack" and rather than accessing
	> > recent_tag, access the top of that.
	>
	> I ended up not using a stack, but the new version handles tags in the headings.
	> It doesn't handle headings inside of headings, but that should never happen. I
	> can change it to use a stack instead if you would like.
	sgtm. I realise I started micro-managing you a bit recently, will try to stop :)

	cduvall 2012/07/23 23:58:33
	On 2012/07/23 23:30:41, kalman wrote:
	> On 2012/07/23 20:24:21, cduvall wrote:
	> > On 2012/07/23 12:47:36, kalman wrote:
	> > > Note that this won't handle cases like
	> > >
	> > > <h2>This heading is <b>extra important</b></h2>
	> > >
	> > > You might need to maintain a "header_tag_stack" and rather than accessing
	> > > recent_tag, access the top of that.
	> >
	> > I ended up not using a stack, but the new version handles tags in the
	> > headings. It doesn't handle headings inside of headings, but that should
	> > never happen. I can change it to use a stack instead if you would like.
	>
	> sgtm. I realise I started micro-managing you a bit recently, will try to stop :)
	Np, I would much rather learn to do it the right way :)
	44

9 class IntroDataSource(object):	45 class IntroDataSource(object):

10 """This class fetches the intros for a given API. From this intro, a table	46 """This class fetches the intros for a given API. From this intro, a table

11 of contents dictionary is created, which contains the headings in the intro.	47 of contents dictionary is created, which contains the headings in the intro.

12 """	48 """

13 def __init__(self, cache_builder, base_paths):	49 def __init__(self, cache_builder, base_paths):

14 self._cache = cache_builder.build(self._MakeIntroDict)	50 self._cache = cache_builder.build(self._MakeIntroDict)

15 self._base_paths = base_paths	51 self._base_paths = base_paths

16	52

17 def _MakeIntroDict(self, intro):	53 def _MakeIntroDict(self, intro):

18 h1s = re.findall('<h1.*>(.+)</h1>', intro)	54 parser = _IntroParser()

19 if len(h1s) > 0:	55 parser.init()

20 page_title = h1s[0]	56 parser.feed(intro)

21 else:	57 return {

22 page_title = ''	58 'intro': Handlebar(intro),

23 headings = re.findall('<h([23]) id\="(.+)">(.+)</h[23]>', intro)	59 'toc': parser.toc,

24 toc = []	60 'title': parser.page_title

25 for heading in headings:	61 }

26 level, link, title = heading

27 if level == '2':

28 toc.append({ 'link': link, 'title': title, 'subheadings': [] })

29 else:

30 toc[-1]['subheadings'].append({ 'link': link, 'title': title })

31 return { 'intro': Handlebar(intro), 'toc': toc , 'title': page_title }

32	62

33 def __getitem__(self, key):	63 def __getitem__(self, key):

34 return self.get(key)	64 return self.get(key)

35	65

36 def get(self, key):	66 def get(self, key):

37 real_path = FormatKey(key)	67 real_path = FormatKey(key)

38 for base_path in self._base_paths:	68 for base_path in self._base_paths:

39 try:	69 try:

40 return self._cache.GetFromFile(base_path + '/' + real_path)	70 return self._cache.GetFromFile(base_path + '/' + real_path)

41 except Exception:	71 except Exception as e:

42 pass	72 pass

43 return None	73 return None

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »