Chromium Code Reviews
| Index: chrome/common/extensions/docs/server2/link_error_detector.py |
| diff --git a/chrome/common/extensions/docs/server2/link_error_detector.py b/chrome/common/extensions/docs/server2/link_error_detector.py |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..9fe72535e9d0254dce2353b3d7721caa1f4322bb |
| --- /dev/null |
| +++ b/chrome/common/extensions/docs/server2/link_error_detector.py |
| @@ -0,0 +1,226 @@ |
| +# Copyright 2013 The Chromium Authors. All rights reserved. |
| +# Use of this source code is governed by a BSD-style license that can be |
| +# found in the LICENSE file. |
| + |
| +from collections import defaultdict, deque, namedtuple |
| +from HTMLParser import HTMLParser, HTMLParseError |
| +import posixpath |
| +from urlparse import urlsplit |
| + |
| +from cron_servlet import CreateURLsFromPaths |
| +import svn_constants |
| + |
| +Page = namedtuple('Page', 'status, links, anchors, anchor_refs') |
| + |
| +class _ContentParser(HTMLParser): |
| + ''' Find all the links inside a section of the document with an id of |
|
not at google - send to devlin
2013/07/09 00:17:32
s/''' Find/'''Finds/
jshumway
2013/07/17 00:49:55
Done.
|
| + |container_id|. Pull out all 'id' and 'name' attributes and add them to a set. |
| + ''' |
| + |
| + def __init__(self, container_id, exclude): |
|
not at google - send to devlin
2013/07/09 00:17:32
what is exclude?
jshumway
2013/07/17 00:49:55
It used to be a way to tell the parser to ignore c
|
| + HTMLParser.__init__(self) |
| + self._container_id = container_id |
| + self._in_content = 0 |
| + self._exclude = exclude |
| + self.links = [] |
| + self.anchors = set() |
| + |
| + def handle_starttag(self, tag, raw_attrs): |
| + attrs = dict(raw_attrs) |
| + |
| + if tag == 'div': |
| + if attrs.get('id', '') == self._container_id or self._in_content: |
|
not at google - send to devlin
2013/07/09 00:17:32
the ", ''" part seems unnecessary.
jshumway
2013/07/17 00:49:55
indeed it is, removed.
|
| + self._in_content += 1 |
| + |
|
not at google - send to devlin
2013/07/09 00:17:32
no blank line here
jshumway
2013/07/17 00:49:55
Done.
|
| + elif self._in_content: |
| + href = attrs.get('href') |
| + if tag == 'a' and href and not urlsplit(href).scheme in ('http', 'https'): |
|
not at google - send to devlin
2013/07/09 00:17:32
let's have:
if tag == 'a':
href = attrs.get('hr
jshumway
2013/07/17 00:49:55
Done.
|
| + if href.endswith('.html') or '#' in href: |
| + if all([e not in href for e in self._exclude]): |
|
not at google - send to devlin
2013/07/09 00:17:32
[]s not needed, it can just be all(e not in href..
jshumway
2013/07/17 00:49:55
I created such a function.
There check for endswi
|
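The reviewer's point here is that all() accepts any iterable, so the list comprehension can be a plain generator expression; a minimal sketch of the check written that way:

    # same test, without building an intermediate list
    if all(e not in href for e in self._exclude):
      self.links.append(href)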
| + self.links.append(href) |
| + |
| + if attrs.get('id'): |
| + self.anchors.add(attrs['id']) |
| + |
|
not at google - send to devlin
2013/07/09 00:17:32
no blank line here or below (I know, it's just tha
jshumway
2013/07/17 00:49:55
Done.
|
| + if attrs.get('name'): |
| + self.anchors.add(attrs['name']) |
| + |
| + def handle_endtag(self, tag): |
| + if self._in_content and tag == 'div': |
| + self._in_content -= 1 |
| + |
| +class LinkErrorDetector(object): |
|
not at google - send to devlin
2013/07/09 00:17:32
please write tests for this.
jshumway
2013/07/17 00:49:55
Done.
|
| + ''' Find link errors on the doc server. This includes broken links, i.e. links |
| + whose target page 404s or whose anchor doesn't exist, as well as pages that |
| + have no links to them. |
| + ''' |
| + |
| + def __init__(self, |
| + file_system, |
| + renderer, |
| + public_path, |
| + root_pages, |
| + exclude=()): |
|
not at google - send to devlin
2013/07/09 00:17:32
explain these arguments
jshumway
2013/07/17 00:49:55
Done.
|
| + self._file_system = file_system |
| + self._renderer = renderer |
| + self._public_path = public_path |
| + self._exclude = exclude |
| + self._pages = defaultdict(lambda: Page(404, (), (), ())) |
| + self._root_pages = frozenset(root_pages) |
| + self._ignore_detached = frozenset(('apps/404.html', 'extensions/404.html')) |
|
not at google - send to devlin
2013/07/09 00:17:32
always_detached?
jshumway
2013/07/17 00:49:55
Done.
|
| + |
| + self._RenderAllPages() |
| + |
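For context while reading __init__, a hedged sketch of how the detector might be wired up; file_system and renderer stand in for whatever the server provides (a file system object and a callable mapping a URL path to a response with .status and .content), and the root_pages/exclude values are purely illustrative:

    detector = LinkErrorDetector(
        file_system,   # used to enumerate the public template and example files
        renderer,      # called with a URL path, returns a response (.status, .content)
        svn_constants.PUBLIC_TEMPLATE_PATH,               # public_path
        root_pages=frozenset(['extensions/index.html']),  # crawl starting points
        exclude=())    # substrings; hrefs containing any of them are ignored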
| + def _RenderAllPages(self): |
| + ''' Traverse the public templates directory, rendering each URL and |
| + processing the resulting HTML to pull out all links and anchors. |
| + ''' |
| + |
| + def process(path, content_from): |
| + parser = _ContentParser(content_from, self._exclude) |
| + response = self._renderer(path) |
| + |
| + if response.status != 200: |
| + return Page(response.status, (), (), ()) |
| + if not path.endswith('.html'): |
| + return Page(200, (), (), ()) |
| + |
| + try: |
| + parser.feed(str(response.content)) |
| + except HTMLParseError: |
| + return Page(200, (), (), ()) |
| + |
| + links, anchors = parser.links, parser.anchors |
| + base, _ = path.rsplit('/', 1) |
| + edges = [] |
| + anchor_refs = [] |
| + |
| + # Convert relative links to absolute links and categorize links as edges |
| + # or anchor_refs. |
| + for link in links: |
| + # Files like experimental_history.html are referred to with the URL |
| + # experimental.history.html. |
| + head, last = link.rsplit('/', 1) if '/' in link else ('', link) |
| + if '#' in last: |
| + last, anchor = last.split('#') |
| + else: |
| + anchor = '' |
| + |
| + if last.count('.') > 1: |
| + last = last.replace('.', '_', last.count('.') - 1) |
| + link = posixpath.join(head, last) |
| + if anchor: |
| + link = '%s#%s' % (link, anchor) |
| + |
| + if link.startswith('#'): |
| + anchor_refs.append(link) |
| + else: |
| + if link.startswith('/'): |
| + link = link[1:] |
| + else: |
| + link = posixpath.normpath('%s/%s' % (base, link)) |
| + |
| + if '#' in link: |
| + anchor_refs.append(link) |
| + else: |
| + edges.append(link) |
| + |
| + return Page(200, edges, anchors, anchor_refs) |
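To make the normalization above concrete, this is what process computes for a few representative hrefs on a hypothetical page rendered at 'extensions/tabs.html' (base == 'extensions'; the hrefs themselves are invented, except the experimental.history one taken from the comment above):

    # href found on the page               categorized as
    # '#manifest'                       -> anchor_refs: '#manifest'
    # 'windows.html'                    -> edges:       'extensions/windows.html'
    # '/apps/about_apps.html'           -> edges:       'apps/about_apps.html'
    # 'experimental.history.html#open'  -> anchor_refs:
    #                                      'extensions/experimental_history.html#open'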
| + |
| + top_level_directories = ( |
| + (svn_constants.PUBLIC_TEMPLATE_PATH, ''), |
| + (svn_constants.EXAMPLES_PATH, 'extensions/examples/'), |
| + ) |
| + |
| + for dirpath, urlprefix in top_level_directories: |
| + files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix) |
| + for url, path in files: |
| + self._pages[url] = process( |
| + url, 'gc-container' if url in self._root_pages else 'gc-pagecontent') |
|
not at google - send to devlin
2013/07/09 00:17:32
please tell me we can implement this without needi
jshumway
2013/07/17 00:49:55
As we talked about in IRC, we can implement this w
|
| + |
| + if self._pages[url].status != 200: |
| + print '%s, a URL derived from the path %s, resulted in a %s' % ( |
| + url, dirpath, self._pages[url].status) |
| + |
| + def GetBrokenLinks(self): |
| + ''' Find all broken links. A broken link is a link that leads to a page that |
| + does not exist (404s when rendered) or that contains an anchor that does not |
| + properly resolve. |
| + |
| + Returns a list of tuples, one tuple per broken link. The first item is the |
| + page that the broken link occurred on; the second item is the value of the |
| + link; and an optional third item contains some debugging information. |
|
not at google - send to devlin
2013/07/09 00:17:32
Can we either always or never have a third argumen
jshumway
2013/07/17 00:49:55
The method now returns two lists, the first list c
|
| + ''' |
| + broken_links = [] |
| + |
| + for url in self._pages.keys(): |
| + page = self._pages[url] |
| + if page.status != 200: |
| + continue |
| + |
| + # First test links without anchors. |
| + for link in page.links: |
| + if self._pages[link].status != 200: |
| + broken_links.append((url, link)) |
| + |
| + # Then find broken links with an anchor component. |
|
not at google - send to devlin
2013/07/09 00:17:32
use urlparse here?
jshumway
2013/07/17 00:49:55
Done.
|
| + for ref in page.anchor_refs: |
| + if ref.startswith('#'): |
| + if not ref.lstrip('#') in page.anchors and ref != '#top': |
| + broken_links.append((url, ref)) |
| + else: |
| + path, lref = ref.split('#') |
| + target_page = self._pages[path] |
| + |
| + if target_page.status != 200: |
| + broken_links.append((url, ref, 'target page does not exist')) |
| + elif not lref in target_page.anchors: |
| + broken_links.append((url, ref)) |
| + |
| + return broken_links |
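For the urlparse suggestion in the thread above, a minimal sketch of what the anchor handling might look like with urlsplit doing the '#' splitting (an assumption about the eventual fix, not the code under review):

    from urlparse import urlsplit

    for ref in page.anchor_refs:
      split = urlsplit(ref)
      path, fragment = split.path, split.fragment
      if not path:
        # a same-page reference such as '#top'
        if fragment not in page.anchors and fragment != 'top':
          broken_links.append((url, ref))
      elif self._pages[path].status != 200:
        broken_links.append((url, ref, 'target page does not exist'))
      elif fragment not in self._pages[path].anchors:
        broken_links.append((url, ref))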
| + |
| + def GetOrphanPages(self, seed_urls=None): |
|
not at google - send to devlin
2013/07/09 00:17:32
Orphaned, not Orphan
Orphan sounds so sad :(
jshumway
2013/07/17 00:49:55
Agreed.
|
| + ''' Crawl the server to find all pages that are connected to the pages at |
| + |seed_urls|. Return the pages that are valid on the server but are not |
| + part of the connected component containing the |seed_urls|. These pages |
| + are orphans and cannot be reached simply by clicking through the server. |
| + |
| + If seed_urls is not provided, root_pages will be used instead. |
| + ''' |
| + seeds = seed_urls or self._root_pages |
| + queue = deque(seeds) |
|
not at google - send to devlin
2013/07/09 00:17:32
pages_to_check?
jshumway
2013/07/17 00:49:55
Done.
|
| + connected = set(seeds) |
|
not at google - send to devlin
2013/07/09 00:17:32
found?
jshumway
2013/07/17 00:49:55
Done.
|
| + |
| + while queue: |
| + item = queue.popleft() |
| + |
|
not at google - send to devlin
2013/07/09 00:17:32
no blank line here
jshumway
2013/07/17 00:49:55
Done.
|
| + for link in self._pages[item].links: |
| + if link not in connected: |
| + connected.add(link) |
| + queue.append(link) |
|
not at google - send to devlin
2013/07/09 00:17:32
if links were a set this would be a lot simpler.
jshumway
2013/07/17 00:49:55
Perhaps. Is it better that it resembles the pseudo
|
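A sketch of the set-flavoured loop the reviewer hints at, using the pages_to_check/found names adopted in the reply and assuming the links of a Page can be treated as a set:

    while pages_to_check:
      page = pages_to_check.popleft()
      new_links = set(self._pages[page].links) - found
      found |= new_links
      pages_to_check.extend(new_links)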
| + |
| + connected |= self._ignore_detached |
|
not at google - send to devlin
2013/07/09 00:17:32
instead of this, subtract ignore_detached from the
jshumway
2013/07/17 00:49:55
I moved a bunch of stuff around and | the set into
|
| + |
| + return [p for p in set(self._pages.keys()) - connected if '.html' in p] |
|
not at google - send to devlin
2013/07/09 00:17:32
when does "if '.html' in p" fail? and see comment
jshumway
2013/07/17 00:49:55
In many places. png, zip, jpg, gif, py, js, css, t
|
| + |
| +def pprint_broken_links(links): |
|
not at google - send to devlin
2013/07/09 00:17:32
move this to integration_test - and call it Pretty
jshumway
2013/07/17 00:49:55
Done. Moved as a function because it is called twi
|
| + ''' Print out broken links in a more readable format. |
| + ''' |
| + headings = [ |
| + 'page a broken link occurred on', |
| + "broken link's href value", |
| + 'notes\n' |
| + ] |
| + |
| + links.insert(0, headings) |
| + |
| + colw0 = max([len(i[0]) for i in links]) |
| + colw1 = max([len(i[1]) for i in links]) |
| + |
| + for col in links: |
| + # The first link, right aligned. |
| + print '%*s ->' % (colw0, col[0]), |
| + # The second link, left aligned. |
| + print '%s%s' % (col[1], (colw1 - len(col[1])) * ' '), |
| + # The note, if it exists. |
| + print '%s' % col[2] if len(col) == 3 else '' |
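For illustration, pprint_broken_links expects the same tuple shapes that GetBrokenLinks produces; a hypothetical call (the links are invented):

    pprint_broken_links([
        ('apps/first_app.html', 'missing.html'),
        ('extensions/tabs.html', 'windows.html#nope', 'target page does not exist'),
    ])

which prints a header row followed by one line per broken link, with the source page right-aligned, the href left-aligned, and the note (when present) in a third column.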