Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 # Copyright 2013 The Chromium Authors. All rights reserved. | |
| 2 # Use of this source code is governed by a BSD-style license that can be | |
| 3 # found in the LICENSE file. | |
| 4 | |
| 5 from collections import defaultdict, deque, namedtuple | |
| 6 from HTMLParser import HTMLParser, HTMLParseError | |
| 7 import posixpath | |
| 8 from urlparse import urlsplit | |
| 9 | |
| 10 from cron_servlet import CreateURLsFromPaths | |
| 11 import svn_constants | |
| 12 | |
| 13 Page = namedtuple('Page', 'status, links, anchors, anchor_refs') | |
| 14 | |
| 15 class _ContentParser(HTMLParser): | |
| 16 ''' Find all the links inside a section of the document with an id of | |
|
not at google - send to devlin
2013/07/09 00:17:32
s/''' Find/'''Finds/
jshumway
2013/07/17 00:49:55
Done.
| |
| 17 |container_id|. Pull out all 'id' and 'name' attributes and add them to a set. | |
| 18 ''' | |
| 19 | |
| 20 def __init__(self, container_id, exclude): | |
|
not at google - send to devlin
2013/07/09 00:17:32
what is exclude?
jshumway
2013/07/17 00:49:55
It used to be a way to tell the parser to ignore c
| |
| 21 HTMLParser.__init__(self) | |
| 22 self._container_id = container_id | |
| 23 self._in_content = 0 | |
| 24 self._exclude = exclude | |
| 25 self.links = [] | |
| 26 self.anchors = set() | |
| 27 | |
| 28 def handle_starttag(self, tag, raw_attrs): | |
| 29 attrs = dict(raw_attrs) | |
| 30 | |
| 31 if tag == 'div': | |
| 32 if attrs.get('id', '') == self._container_id or self._in_content: | |
|
not at google - send to devlin
2013/07/09 00:17:32
the ", ''" part seems unnecessary.
jshumway
2013/07/17 00:49:55
indeed it is, removed.
| |
| 33 self._in_content += 1 | |
| 34 | |
|
not at google - send to devlin
2013/07/09 00:17:32
no blank line here
jshumway
2013/07/17 00:49:55
Done.
| |
| 35 elif self._in_content: | |
| 36 href = attrs.get('href') | |
| 37 if tag == 'a' and href and not urlsplit(href).scheme in ('http', 'https'): | |
|
not at google - send to devlin
2013/07/09 00:17:32
let's have:
if tag == 'a':
href = attrs.get('hr
jshumway
2013/07/17 00:49:55
Done.
| |
| 38 if href.endswith('.html') or '#' in href: | |
| 39 if all([e not in href for e in self._exclude]): | |
|
not at google - send to devlin
2013/07/09 00:17:32
[]s not needed, it can just be all(e not in href..
jshumway
2013/07/17 00:49:55
I created such a function.
There check for endswi
| |
| 40 self.links.append(href) | |
| 41 | |
| 42 if attrs.get('id'): | |
| 43 self.anchors.add(attrs['id']) | |
| 44 | |
|
not at google - send to devlin
2013/07/09 00:17:32
no blank line here or below (I know, it's just tha
jshumway
2013/07/17 00:49:55
Done.
| |
| 45 if attrs.get('name'): | |
| 46 self.anchors.add(attrs['name']) | |
| 47 | |
| 48 def handle_endtag(self, tag): | |
| 49 if self._in_content and tag == 'div': | |
| 50 self._in_content -= 1 | |
| 51 | |
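
As a rough usage sketch (not part of the change under review): feeding rendered HTML to `_ContentParser` collects same-server links and every `id`/`name` attribute seen inside the container div, while the `_in_content` counter keeps the parser from wandering outside that container. The HTML snippet and container id below are made-up examples.

```python
# Illustrative only; assumes _ContentParser from the module above is in scope.
html = '''
<div id="gc-pagecontent">
  <a href="tabs.html#method-create">create a tab</a>
  <a href="https://example.com/">external link, skipped by the scheme check</a>
  <h2 id="overview">Overview</h2>
</div>
'''

parser = _ContentParser('gc-pagecontent', exclude=())
parser.feed(html)

# Only the same-server link is kept, and the 'id' attribute seen inside the
# container becomes a known anchor.
assert parser.links == ['tabs.html#method-create']
assert parser.anchors == set(['overview'])
```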
| 52 class LinkErrorDetector(object): | |
|
not at google - send to devlin
2013/07/09 00:17:32
please write tests for this.
jshumway
2013/07/17 00:49:55
Done.
| |
| 53 ''' Find link errors on the doc server. These include broken links, whose | |
| 54 target page 404s or contains an anchor that doesn't exist, and pages that | |
| 55 have no links to them. | |
| 56 ''' | |
| 57 | |
| 58 def __init__(self, | |
| 59 file_system, | |
| 60 renderer, | |
| 61 public_path, | |
| 62 root_pages, | |
| 63 exclude=()): | |
|
not at google - send to devlin
2013/07/09 00:17:32
explain these arguments
jshumway
2013/07/17 00:49:55
Done.
| |
| 64 self._file_system = file_system | |
| 65 self._renderer = renderer | |
| 66 self._public_path = public_path | |
| 67 self._exclude = exclude | |
| 68 self._pages = defaultdict(lambda: Page(404, (), (), ())) | |
| 69 self._root_pages = frozenset(root_pages) | |
| 70 self._ignore_detached = frozenset(('apps/404.html', 'extensions/404.html')) | |
|
not at google - send to devlin
2013/07/09 00:17:32
always_detached?
jshumway
2013/07/17 00:49:55
Done.
| |
| 71 | |
| 72 self._RenderAllPages() | |
| 73 | |
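
A minimal sketch of the `_pages` lookup set up above: the defaultdict falls back to a 404 `Page` for any URL that was never rendered, so the later checks need no key-existence handling. (The real constructor also wires in a file system, a renderer and a public path, which are not modelled here; the example URLs are made up.)

```python
from collections import defaultdict, namedtuple

Page = namedtuple('Page', 'status, links, anchors, anchor_refs')

# Every URL that was never rendered resolves to this 404 placeholder, so the
# link checks never have to special-case missing keys.
pages = defaultdict(lambda: Page(404, (), (), ()))
pages['extensions/tabs.html'] = Page(200, ('extensions/index.html',),
                                     set(['method-create']), ())

assert pages['extensions/tabs.html'].status == 200
assert pages['no/such/page.html'].status == 404  # default entry, not a KeyError
```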
| 74 def _RenderAllPages(self): | |
| 75 ''' Traverse the public templates directory rendering each URL and | |
| 76 processing the resultant html to pull out all links and anchors. | |
| 77 ''' | |
| 78 | |
| 79 def process(path, content_from): | |
| 80 parser = _ContentParser(content_from, self._exclude) | |
| 81 response = self._renderer(path) | |
| 82 | |
| 83 if response.status != 200: | |
| 84 return Page(response.status, (), (), ()) | |
| 85 if not path.endswith('.html'): | |
| 86 return Page(200, (), (), ()) | |
| 87 | |
| 88 try: | |
| 89 parser.feed(str(response.content)) | |
| 90 except HTMLParseError: | |
| 91 return Page(200, (), (), ()) | |
| 92 | |
| 93 links, anchors = parser.links, parser.anchors | |
| 94 base, _ = path.rsplit('/', 1) | |
| 95 edges = [] | |
| 96 anchor_refs = [] | |
| 97 | |
| 98 # Convert relative links to absolute links and categorize links as edges | |
| 99 # or anchor_refs. | |
| 100 for link in links: | |
| 101 # Files like experimental_history.html are referred to with the URL | |
| 102 # experimental.history.html. | |
| 103 head, last = link.rsplit('/', 1) if '/' in link else ('', link) | |
| 104 if '#' in last: | |
| 105 last, anchor = last.split('#') | |
| 106 else: | |
| 107 anchor = '' | |
| 108 | |
| 109 if last.count('.') > 1: | |
| 110 last = last.replace('.', '_', last.count('.') - 1) | |
| 111 link = posixpath.join(head, last) | |
| 112 if anchor: | |
| 113 link = '%s#%s' % (link, anchor) | |
| 114 | |
| 115 if link.startswith('#'): | |
| 116 anchor_refs.append(link) | |
| 117 else: | |
| 118 if link.startswith('/'): | |
| 119 link = link[1:] | |
| 120 else: | |
| 121 link = posixpath.normpath('%s/%s' % (base, link)) | |
| 122 | |
| 123 if '#' in link: | |
| 124 anchor_refs.append(link) | |
| 125 else: | |
| 126 edges.append(link) | |
| 127 | |
| 128 return Page(200, edges, anchors, anchor_refs) | |
| 129 | |
| 130 top_level_directories = ( | |
| 131 (svn_constants.PUBLIC_TEMPLATE_PATH, ''), | |
| 132 (svn_constants.EXAMPLES_PATH, 'extensions/examples/'), | |
| 133 ) | |
| 134 | |
| 135 for dirpath, urlprefix in top_level_directories: | |
| 136 files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix) | |
| 137 for url, path in files: | |
| 138 self._pages[url] = process( | |
| 139 url, 'gc-container' if url in self._root_pages else 'gc-pagecontent') | |
|
not at google - send to devlin
2013/07/09 00:17:32
please tell me we can implement this without needi
jshumway
2013/07/17 00:49:55
As we talked about in IRC, we can implement this w
| |
| 140 | |
| 141 if self._pages[url].status != 200: | |
| 142 print (url, ', a url derived from the path', dirpath + | |
| 143 ', resulted in a', self._pages[url].status) | |
| 144 | |
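
For illustration, the relative-to-absolute link handling in the loop above can be restated as a standalone helper. This is a simplified sketch (same-page `#anchor` refs are categorized separately in the real code), and the helper name and example paths are made up.

```python
import posixpath

def normalize_link(base, link):
  # Simplified restatement of the loop above: returns the absolute page path
  # plus any anchor split off the href.
  head, last = link.rsplit('/', 1) if '/' in link else ('', link)
  last, _, anchor = last.partition('#')
  # Links may be written as experimental.history.html even though the file on
  # disk is experimental_history.html.
  if last.count('.') > 1:
    last = last.replace('.', '_', last.count('.') - 1)
  link = posixpath.join(head, last)
  if link.startswith('/'):
    link = link[1:]
  else:
    link = posixpath.normpath('%s/%s' % (base, link))
  return link, anchor

assert normalize_link('extensions', '../apps/about_apps.html') == (
    'apps/about_apps.html', '')
assert normalize_link('extensions', 'experimental.history.html#method-search') == (
    'extensions/experimental_history.html', 'method-search')
```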
| 145 def GetBrokenLinks(self): | |
| 146 ''' Find all broken links. A broken link is a link that leads to a page that | |
| 147 does not exist (404s when rendered) or that contains an anchor that does not | |
| 148 properly resolve. | |
| 149 | |
| 150 Returns a list of tuples, one tuple per broken link. The first item is the | |
| 151 page that a broken link occurred on; the second item is the value of the link; | |
| 152 and an optional third item carries some debugging information. | |
|
not at google - send to devlin
2013/07/09 00:17:32
Can we either always or never have a third argumen
jshumway
2013/07/17 00:49:55
The method now returns two lists, the first list c
| |
| 153 ''' | |
| 154 broken_links = [] | |
| 155 | |
| 156 for url in self._pages.keys(): | |
| 157 page = self._pages[url] | |
| 158 if page.status != 200: | |
| 159 continue | |
| 160 | |
| 161 # First test links without anchors. | |
| 162 for link in page.links: | |
| 163 if self._pages[link].status != 200: | |
| 164 broken_links.append((url, link)) | |
| 165 | |
| 166 # Then find broken links with an anchor component. | |
|
not at google - send to devlin
2013/07/09 00:17:32
use urlparse here?
jshumway
2013/07/17 00:49:55
Done.
| |
| 167 for ref in page.anchor_refs: | |
| 168 if ref.startswith('#'): | |
| 169 if not ref.lstrip('#') in page.anchors and ref != '#top': | |
| 170 broken_links.append((url, ref)) | |
| 171 else: | |
| 172 path, lref = ref.split('#') | |
| 173 target_page = self._pages[path] | |
| 174 | |
| 175 if target_page.status != 200: | |
| 176 broken_links.append((url, ref, 'target page does not exist')) | |
| 177 elif not lref in target_page.anchors: | |
| 178 broken_links.append((url, ref)) | |
| 179 | |
| 180 return broken_links | |
| 181 | |
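
A hedged sketch of the anchor check above, rewritten with `urldefrag` as the review suggests. `is_broken_anchor_ref` and the example `pages` dict are hypothetical names used only here, and `#top` is treated as always valid, which simplifies the original logic slightly.

```python
from collections import namedtuple
from urlparse import urldefrag  # Python 2; urllib.parse.urldefrag on Python 3

Page = namedtuple('Page', 'status, links, anchors, anchor_refs')

def is_broken_anchor_ref(pages, current_url, ref):
  # Split 'path#anchor' (or '#anchor') and check the anchor against the
  # target page's collected anchors.
  path, anchor = urldefrag(ref)
  target = pages.get(path) if path else pages.get(current_url)
  if target is None or target.status != 200:
    return True  # the target page itself is missing
  return anchor != 'top' and anchor not in target.anchors

pages = {'extensions/tabs.html': Page(200, (), set(['method-create']), ())}
assert not is_broken_anchor_ref(pages, 'extensions/tabs.html', '#method-create')
assert is_broken_anchor_ref(pages, 'extensions/tabs.html', '#no-such-anchor')
assert not is_broken_anchor_ref(
    pages, 'extensions/index.html', 'extensions/tabs.html#method-create')
```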
| 182 def GetOrphanPages(self, seed_urls=None): | |
|
not at google - send to devlin
2013/07/09 00:17:32
Orphaned, not Orphan
Orphan sounds so sad :(
jshumway
2013/07/17 00:49:55
Agreed.
| |
| 183 ''' Crawl the server to find all pages that are connected to the pages at | |
| 184 |seed_url|s. Return the links that are valid on the server but are not | |
| 185 part of the connected component containing the |seed_url|s. These pages | |
| 186 are orphans and cannot be reached simply by clicking through the server. | |
| 187 | |
| 188 If seed_urls is not provided, root_pages will be used instead. | |
| 189 ''' | |
| 190 seeds = seed_urls or self._root_pages | |
| 191 queue = deque(seeds) | |
|
not at google - send to devlin
2013/07/09 00:17:32
pages_to_check?
jshumway
2013/07/17 00:49:55
Done.
| |
| 192 connected = set(seeds) | |
|
not at google - send to devlin
2013/07/09 00:17:32
found?
jshumway
2013/07/17 00:49:55
Done.
| |
| 193 | |
| 194 while queue: | |
| 195 item = queue.popleft() | |
| 196 | |
|
not at google - send to devlin
2013/07/09 00:17:32
no blank line here
jshumway
2013/07/17 00:49:55
Done.
| |
| 197 for link in self._pages[item].links: | |
| 198 if link not in connected: | |
| 199 connected.add(link) | |
| 200 queue.append(link) | |
|
not at google - send to devlin
2013/07/09 00:17:32
if links were a set this would be a lot simpler.
jshumway
2013/07/17 00:49:55
Perhaps. Is it better that it resembles the pseudo
| |
| 201 | |
| 202 connected |= self._ignore_detached | |
|
not at google - send to devlin
2013/07/09 00:17:32
instead of this, subtract ignore_detached from the
jshumway
2013/07/17 00:49:55
I moved a bunch of stuff around and | the set into
| |
| 203 | |
| 204 return [p for p in set(self._pages.keys()) - connected if '.html' in p] | |
|
not at google - send to devlin
2013/07/09 00:17:32
when does "if '.html' in p" fail? and see comment
jshumway
2013/07/17 00:49:55
In many places. png, zip, jpg, gif, py, js, css, t
| |
| 205 | |
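
The traversal above is a plain breadth-first search. A standalone sketch, with a made-up graph and set-valued links as discussed in the review, might look like this:

```python
from collections import deque

def find_orphans(graph, seed_urls, always_detached=frozenset()):
  # Breadth-first search outward from the seed URLs, then report any .html
  # page that was never reached.
  found = set(seed_urls) | set(always_detached)
  pages_to_check = deque(seed_urls)
  while pages_to_check:
    for link in graph[pages_to_check.popleft()]:
      if link not in found:
        found.add(link)
        pages_to_check.append(link)
  return sorted(p for p in set(graph) - found if p.endswith('.html'))

graph = {
    'extensions/index.html': set(['extensions/tabs.html']),
    'extensions/tabs.html': set(),
    'extensions/lonely.html': set(),  # nothing links here, so it is orphaned
}
assert find_orphans(graph, ['extensions/index.html']) == ['extensions/lonely.html']
```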
| 206 def pprint_broken_links(links): | |
|
not at google - send to devlin
2013/07/09 00:17:32
move this to integration_test - and call it Pretty
jshumway
2013/07/17 00:49:55
Done. Moved as a function because it is called twi
| |
| 207 ''' Print out broken links in a more readable format. | |
| 208 ''' | |
| 209 headings = [ | |
| 210 'page a broken link occurred', | |
| 211 "broken link's href value", | |
| 212 'notes\n' | |
| 213 ] | |
| 214 | |
| 215 links.insert(0, headings) | |
| 216 | |
| 217 colw0 = max([len(i[0]) for i in links]) | |
| 218 colw1 = max([len(i[1]) for i in links]) | |
| 219 | |
| 220 for col in links: | |
| 221 # The first link, right aligned. | |
| 222 print '%*s ->' % (colw0, col[0]), | |
| 223 # The second link, left aligned. | |
| 224 print '%s%s' % (col[1], (colw1 - len(col[1])) * ' '), | |
| 225 # The note, if it exists. | |
| 226 print '%s' % col[2] if len(col) == 3 else '' | |
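
A possible shape for the `integration_test` version discussed above: the same right/left column alignment, but returning strings and leaving the input list untouched. `format_broken_links` and the sample rows are made up for illustration, and the final `print` statement is Python 2, matching the module.

```python
def format_broken_links(links):
  # Build the same aligned columns as above without mutating the caller's
  # list; short rows are padded with an empty notes column.
  rows = [('page', 'broken href', 'notes')]
  rows += [tuple(row) + ('',) * (3 - len(row)) for row in links]
  col0 = max(len(r[0]) for r in rows)
  col1 = max(len(r[1]) for r in rows)
  return ['%*s -> %-*s %s' % (col0, r[0], col1, r[1], r[2]) for r in rows]

for line in format_broken_links([
    ('extensions/tabs.html', 'windows.html#no-such-anchor'),
    ('apps/index.html', 'missing.html', 'target page does not exist')]):
  print line
```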