Chromium Code Reviews
Index: chrome/common/extensions/docs/server2/find_broken_links.py
diff --git a/chrome/common/extensions/docs/server2/find_broken_links.py b/chrome/common/extensions/docs/server2/find_broken_links.py
new file mode 100755
index 0000000000000000000000000000000000000000..cda5d18071e5762e4bf8dfe0d9c7da9f534a74de
--- /dev/null
+++ b/chrome/common/extensions/docs/server2/find_broken_links.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+# Copyright 2013 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+from collections import deque
+from functools import wraps
+from HTMLParser import HTMLParser, HTMLParseError
+import os
+from re import match
+
+from local_renderer import LocalRenderer
+
+usage = """\
+The following broken links exist throughout the docserver. All links have been
+made absolute.
+
+Page that contains a broken link -> Content of broken link's href
+"""
+
+def Memoize(func):
not at google - send to devlin  2013/07/01 23:16:21
can you use the implementation from json schema compiler …
jshumway  2013/07/05 18:48:48
Oh right. I changed how most of the code works and …
+  cache = {}
+  @wraps(func)
+  def wrap(*args):
+    if args not in cache:
+      cache[args] = func(*args)
+    return cache[args]
+  return wrap
+
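For context, a minimal sketch of how Memoize behaves (the Render function
below is hypothetical, not part of the patch):

  def Render(path):
    # Hypothetical expensive call that we only want to make once per path.
    print 'rendering', path
    return len(path)

  Render = Memoize(Render)
  Render('/apps')  # prints 'rendering /apps' and returns 5
  Render('/apps')  # cache hit: returns 5 without printing again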
+def PrintFunction(*args):
+  ''' Default |writer|: prints its arguments space-separated on one line. '''
+  for arg in args:
+    print arg,
+  print
+
+class _ContentParser(HTMLParser):
+  ''' Finds all links inside the section of the document whose id is
+  |container_id|, and collects every 'id' and 'name' attribute into a set of
+  potential anchors.
+  '''
+
+  def __init__(self, container_id, exclude):
+    HTMLParser.__init__(self)
+    self._container_id = container_id
+    self._in_content = 0
+    self._exclude = exclude
+    self.links = []
+    self.anchors = set()
+
+  def handle_starttag(self, tag, raw_attrs):
+    attrs = dict(raw_attrs)
+
+    if tag == 'div':
+      if attrs.get('id', '') == self._container_id or self._in_content:
+        self._in_content += 1
+
+    elif self._in_content:
+      href = attrs.get('href')
+      if tag == 'a' and href and not match(r'https?://', href):
+        if href.endswith('.html') or '#' in href:
+          if all(e not in href for e in self._exclude):
+            self.links.append(href)
+
+      if attrs.get('id'):
+        self.anchors.add(attrs['id'])
+
+      if attrs.get('name'):
+        self.anchors.add(attrs['name'])
+
+  def handle_endtag(self, tag):
+    if self._in_content and tag == 'div':
+      self._in_content -= 1
+
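To make the parser's contract concrete, a small sketch of feeding it a
fragment (the markup is made up for illustration):

  parser = _ContentParser('gc-pagecontent', exclude=('samples',))
  parser.feed('<div id="gc-pagecontent">'
              '<a href="tabs.html">Tabs</a>'
              '<h2 id="overview">Overview</h2>'
              '</div>')
  # parser.links   == ['tabs.html']
  # parser.anchors == set(['overview'])

Links containing an excluded substring ('samples' here) and external http(s)
URLs are skipped; only hrefs ending in '.html' or containing '#' are recorded.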
+def CreateProcessor(renderer, exclude=(), writer=PrintFunction):
+  ''' Create a function that renders and processes a page. The process can be
+  customized by providing a |renderer|: a callable that is passed a path and
+  is expected to return a response object.
+
+  The |exclude| list is used to ignore URLs that contain certain substrings:
+  if any string in the exclude list appears in a link, that link will not be
+  followed.
+  '''
+
+  def processor(path, content_from='gc-pagecontent'):
+    ''' Processes a page's content, pulling out relevant information: the
+    HTTP status code of the response; a set of all links on the page, made
+    absolute; a set of all ids and names on the page that are potential
+    anchors; and all the links on the page that contain an anchor id.
+    '''
+
+    parser = _ContentParser(content_from, exclude)
+    response = renderer(path)
+
+    if response.status != 200:
+      return (response.status, [], [], [])
+
+    if not path.endswith('.html'):
+      return (200, [], [], [])
+
+    try:
+      parser.feed(str(response.content))
+    except HTMLParseError:
+      if writer:
+        writer('Page at %s exists but could not be parsed.' % path)
+      return (200, [], [], [])
+
+    links, anchors = parser.links, parser.anchors
+    base, _ = path.rsplit('/', 1)
+    edges = set()
+    anchor_refs = []
+
+    # Convert relative links to absolute links and categorize links as edges
+    # or anchor_refs.
+    for link in links:
+      if link.startswith('#'):
+        anchor_refs.append(link)
+      else:
+        if not link.startswith('/'):
+          link = os.path.normpath(base + '/' + link)
+
+        if '#' in link:
+          anchor_refs.append(link)
+        else:
+          edges.add(link)
+
+    return (200, edges, anchors, anchor_refs)
+
+  return Memoize(processor)
+
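A sketch of the resulting processor against a stub renderer, showing the
early-exit path and the memoization (the stub response below is made up; the
real entry point is LocalRenderer.Render, used at the bottom of this file):

  class _StubResponse(object):
    status = 404
    content = ''

  process = CreateProcessor(lambda path: _StubResponse())
  print process('/extensions/tabs.html')  # (404, [], [], [])
  print process('/extensions/tabs.html')  # cached: renderer not called again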
+def FindBrokenLinks(processor, seed_paths, writer=PrintFunction):
+  ''' Crawl the doc server looking for broken links.
+
+  A link is broken if the page it links to 404s or if it contains an anchor
+  ref with no corresponding id or name to anchor at.
+
+  |seed_paths| is used to seed the breadth-first search.
+
+  If more specialized behavior than printing out the broken links is required,
+  a |writer| function can be supplied. This function will be called with a
+  varying number of arguments, all of which can be converted to strings.
not at google - send to devlin  2013/07/01 23:16:21
Just return the broken links in some format. The c…
jshumway  2013/07/05 18:48:48
Done.
+  '''
+
+  initial_nodes = set(seed_paths)
+
+  # Add the sidebar content for apps and extensions to the queue. The sidenav
+  # will not be processed again.
+  for path in seed_paths:
+    initial_nodes.update(processor(path, 'gc-sidebar')[1])
+
+  queue = deque(initial_nodes)
+  searched = initial_nodes
+
+  # Crawl the doc server.
+  while queue:
+    node = queue.popleft()
+
+    edges = processor(node)[1]
+    for edge in edges:
+      if edge not in searched:
+        searched.add(edge)
+        status = processor(edge)[0]
+
+        if status != 200:
+          writer(node, '->', edge)
+
+        queue.append(edge)
+
+  # Go back through links that contain an anchor.
+  for node in searched:
+    _, __, anchors, anchor_refs = processor(node)
+
+    for ref in anchor_refs:
+      if ref.startswith('#'):
+        if ref.lstrip('#') not in anchors and ref != '#top':
+          writer(node, '->', ref)
+      else:
+        path, lref = ref.split('#', 1)
+        status, _, lanchors, __ = processor(path)
+        if status == 404:
+          writer(node, '->', ref, '(target page does not exist)')
+        elif lref not in lanchors:
+          writer(node, '->', ref)
+
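Because |writer| is pluggable, the broken links can be collected rather than
printed, which is roughly what the earlier review comment asks for; a sketch:

  broken = []
  def CollectWriter(*args):
    broken.append(' '.join(str(arg) for arg in args))

  # FindBrokenLinks(processor, seed_paths, writer=CollectWriter)
  # 'broken' then holds one formatted entry per broken link.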
not at google - send to devlin  2013/07/01 23:16:21
I would find this file easier to follow if it were…
jshumway  2013/07/05 18:48:48
Restructured everything to be more object oriented…
+if __name__ == '__main__':
+  import logging
+
+  logging.disable(None)
+  print usage
+  FindBrokenLinks(
+      CreateProcessor(LocalRenderer.Render, exclude=('samples', 'examples')),
+      seed_paths=('/extensions/index.html', '/apps/about_apps.html'))