Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 # Copyright 2013 The Chromium Authors. All rights reserved. | 1 # Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 from HTMLParser import HTMLParser | 5 from HTMLParser import HTMLParser |
| 6 import logging | 6 import logging |
| 7 | 7 |
| 8 | 8 |
| 9 class ParseResult(object): | 9 class ParseResult(object): |
| 10 '''The result of |ParseDocument|: | 10 '''The result of |ParseDocument|: |
| (...skipping 30 matching lines...) | |
| 41 ''' | 41 ''' |
| 42 | 42 |
| 43 def __init__(self, tag, attributes): | 43 def __init__(self, tag, attributes): |
| 44 self.attributes = attributes | 44 self.attributes = attributes |
| 45 self.name = '' | 45 self.name = '' |
| 46 self.entries = [] | 46 self.entries = [] |
| 47 # Callers shouldn't care about the tag, but we need it for sanity checking, | 47 # Callers shouldn't care about the tag, but we need it for sanity checking, |
| 48 # so make it private. In particular we pretend that anything but the first | 48 # so make it private. In particular we pretend that anything but the first |
| 49 # h1 is an h2, and it'd be odd to expose that. | 49 # h1 is an h2, and it'd be odd to expose that. |
| 50 self._tag = tag | 50 self._tag = tag |
| 51 # Documents can override the name of the entry using title="". | |
| 52 self._has_explicit_name = False | |
| 51 | 53 |
| 52 def __repr__(self): | 54 def __repr__(self): |
| 53 return '<%s>%s</%s>' % (self._tag, self.name, self._tag) | 55 return '<%s>%s</%s>' % (self._tag, self.name, self._tag) |
| 54 | 56 |
| 55 def __str__(self): | 57 def __str__(self): |
| 56 return repr(self) | 58 return repr(self) |
| 57 | 59 |
| 58 | 60 |
| 59 def ParseDocument(document, expect_title=False): | 61 def ParseDocument(document, expect_title=False): |
| 60 '''Parses the title and a document structure from |document| and returns a | 62 '''Parses the title and a document structure from |document| and returns a |
| (...skipping 57 matching lines...) | |
| 118 return | 120 return |
| 119 | 121 |
| 120 if tag != 'h1' and tag not in _HEADER_TAGS: | 122 if tag != 'h1' and tag not in _HEADER_TAGS: |
| 121 return | 123 return |
| 122 | 124 |
| 123 if self._processing_entry is not None: | 125 if self._processing_entry is not None: |
| 124 self._WarnWithPosition('Found <%s> in the middle of processing a <%s>' % | 126 self._WarnWithPosition('Found <%s> in the middle of processing a <%s>' % |
| 125 (tag, self._processing_entry._tag)) | 127 (tag, self._processing_entry._tag)) |
| 126 return | 128 return |
| 127 | 129 |
| 128 self._processing_entry = DocumentStructureEntry(tag, dict(attrs)) | 130 attrs_dict = dict(attrs) |
| 131 self._processing_entry = DocumentStructureEntry(tag, attrs_dict) | |
| 132 | |
| 133 explicit_name = attrs_dict.pop('title', None) | |
| 134 if explicit_name == '': | |
| 135 # The document can specify title="" to ignore the TOC entry entirely. | |
|
> **Yoyo Zhou** (2013/12/10 23:48:23): The wording is a little confusing - how about some…
>
> **not at google - send to devlin** (2013/12/11 22:23:23): Done.
| |
| 136 return | |
| 137 if explicit_name is not None: | |
| 138 self._processing_entry.name = explicit_name | |
| 139 self._processing_entry._has_explicit_name = True | |
| 129 | 140 |
| 130 if tag == 'h1' and self._title_entry is not None: | 141 if tag == 'h1' and self._title_entry is not None: |
| 131 self._WarnWithPosition('Found multiple <h1> tags. Subsequent <h1> tags ' | 142 self._WarnWithPosition('Found multiple <h1> tags. Subsequent <h1> tags ' |
| 132 'will be classified as <h2> for the purpose of ' | 143 'will be classified as <h2> for the purpose of ' |
| 133 'the structure') | 144 'the structure') |
| 134 tag = 'h2' | 145 tag = 'h2' |
| 135 | 146 |
| 136 if tag == 'h1': | 147 if tag == 'h1': |
| 137 self._title_entry = self._processing_entry | 148 self._title_entry = self._processing_entry |
| 138 else: | 149 else: |
| (...skipping 23 matching lines...) | |
| 162 | 173 |
| 163 if self._processing_entry._tag != tag: | 174 if self._processing_entry._tag != tag: |
| 164 self._WarnWithPosition('Found closing </%s> while processing a <%s>' % | 175 self._WarnWithPosition('Found closing </%s> while processing a <%s>' % |
| 165 (tag, self._processing_entry._tag)) | 176 (tag, self._processing_entry._tag)) |
| 166 # Note: no early return, it's more likely that the mismatched header was | 177 # Note: no early return, it's more likely that the mismatched header was |
| 167 # a typo rather than a misplaced closing header tag. | 178 # a typo rather than a misplaced closing header tag. |
| 168 | 179 |
| 169 self._processing_entry = None | 180 self._processing_entry = None |
| 170 | 181 |
| 171 def handle_data(self, data): | 182 def handle_data(self, data): |
| 172 if self._processing_entry is not None: | 183 if (self._processing_entry is not None and |
| 184 not self._processing_entry._has_explicit_name): | |
| 173 # += is inefficient, but probably fine here because the chances of a | 185 # += is inefficient, but probably fine here because the chances of a |
| 174 # large number of nested tags within header tags is pretty low. | 186 # large number of nested tags within header tags is pretty low. |
| 175 self._processing_entry.name += data | 187 self._processing_entry.name += data |
| 176 | 188 |
| 177 def close(self): | 189 def close(self): |
| 178 HTMLParser.close(self) | 190 HTMLParser.close(self) |
| 179 | 191 |
| 180 self._OnSectionBoundary() | 192 self._OnSectionBoundary() |
| 181 | 193 |
| 182 if self._processing_entry is not None: | 194 if self._processing_entry is not None: |
| (...skipping 18 matching lines...) | |
| 201 | 213 |
| 202 def _OnSectionBoundary(self): | 214 def _OnSectionBoundary(self): |
| 203 # Only start a new section if the previous section was non-empty. | 215 # Only start a new section if the previous section was non-empty. |
| 204 if self._processing_section.structure: | 216 if self._processing_section.structure: |
| 205 self._sections.append(self._processing_section) | 217 self._sections.append(self._processing_section) |
| 206 self._processing_section = DocumentSection() | 218 self._processing_section = DocumentSection() |
| 207 | 219 |
| 208 def _WarnWithPosition(self, message): | 220 def _WarnWithPosition(self, message): |
| 209 line, col = self.getpos() | 221 line, col = self.getpos() |
| 210 self._warnings.append('%s (line %s, column %s)' % (message, line, col + 1)) | 222 self._warnings.append('%s (line %s, column %s)' % (message, line, col + 1)) |
| OLD | NEW |