Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1869)

Unified Diff: chrome/common/extensions/docs/server2/find_broken_links.py

Issue 17816005: Doc server broken link detection (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Additional information for broken anchor links Created 7 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/common/extensions/docs/server2/find_broken_links.py
diff --git a/chrome/common/extensions/docs/server2/find_broken_links.py b/chrome/common/extensions/docs/server2/find_broken_links.py
new file mode 100755
index 0000000000000000000000000000000000000000..cda5d18071e5762e4bf8dfe0d9c7da9f534a74de
--- /dev/null
+++ b/chrome/common/extensions/docs/server2/find_broken_links.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+# Copyright 2013 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+from collections import deque
+from functools import wraps
+from HTMLParser import HTMLParser, HTMLParseError
+import os
+from re import match
+
+from local_renderer import LocalRenderer
+
# Preamble printed before the broken-link report when this file is run as a
# script (see __main__ below).
usage = """\
The following broken links exist throughout the docserver. All links have been
made absolute.

Page that contains a broken link -> Content of broken link's href
"""
+
def Memoize(func):
  ''' Decorator that caches the results of |func|, keyed on its positional
  arguments. Arguments must be hashable. Repeated calls with the same
  arguments return the cached value without re-invoking |func|.
  '''
  results = {}
  @wraps(func)
  def memoized(*args):
    try:
      return results[args]
    except KeyError:
      value = func(*args)
      results[args] = value
      return value
  return memoized
+
+def PrintFunction(*args):
+ for arg in args:
+ print arg,
+ print
+
+class _ContentParser(HTMLParser):
+ ''' Find all the links inside a section of the document with an id of
+ |container_id|. Pull out all 'id' and 'name' attributes and add them to a set.
+ '''
+
+ def __init__(self, container_id, exclude):
+ HTMLParser.__init__(self)
+ self._container_id = container_id
+ self._in_content = 0
+ self._exclude = exclude
+ self.links = []
+ self.anchors = set()
+
+ def handle_starttag(self, tag, raw_attrs):
+ attrs = dict(raw_attrs)
+
+ if tag == 'div':
+ if attrs.get('id', '') == self._container_id or self._in_content:
+ self._in_content += 1
+
+ elif self._in_content:
+ href = attrs.get('href')
+ if tag == 'a' and href and not match(r'https?://', href):
+ if href.endswith('.html') or '#' in href:
+ if all([e not in href for e in self._exclude]):
+ self.links.append(href)
+
+ if attrs.get('id'):
+ self.anchors.add(attrs['id'])
+
+ if attrs.get('name'):
+ self.anchors.add(attrs['name'])
+
+ def handle_endtag(self, tag):
+ if self._in_content and tag == 'div':
+ self._in_content -= 1
+
def CreateProcessor(renderer, exclude=(), writer=PrintFunction):
  ''' Create a memoized function that renders and processes a page.

  |renderer| is a callable that is passed a path and must return a response
  object with |status| and |content| attributes.

  Links whose href contains any string in |exclude| are ignored entirely;
  they are neither followed nor reported.

  |writer|, if not None, is called with a single diagnostic string when a
  page exists but cannot be parsed.
  '''

  def processor(path, content_from='gc-pagecontent'):
    ''' Process the page at |path|, pulling content from the element with id
    |content_from|. Returns a tuple (status, edges, anchors, anchor_refs):
    the http status code of the response; a set of all page links, made
    absolute; a set of all ids and names on the page that are potential
    anchors; and a list of all links on the page that contain an anchor id.
    '''

    response = renderer(path)

    # Failed responses have no content worth parsing.
    if response.status != 200:
      return (response.status, [], [], [])

    # Non-.html resources merely need to exist; they are not parsed for
    # links.
    if not path.endswith('.html'):
      return (200, [], [], [])

    # Only construct the parser once we know the page will actually be
    # parsed.
    parser = _ContentParser(content_from, exclude)
    try:
      parser.feed(str(response.content))
    except HTMLParseError:
      if writer:
        writer('Page at %s exists but could not be parsed.' % path)
      return (200, [], [], [])

    base, _ = path.rsplit('/', 1)
    edges = set()
    anchor_refs = []

    # Convert relative links to absolute links and categorize links as edges
    # or anchor_refs.
    for link in parser.links:
      if link.startswith('#'):
        anchor_refs.append(link)
        continue

      if not link.startswith('/'):
        link = os.path.normpath(base + '/' + link)

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.add(link)

    return (200, edges, parser.anchors, anchor_refs)

  return Memoize(processor)
+
def FindBrokenLinks(processor, seed_paths, writer=PrintFunction):
  ''' Crawl the doc server breadth-first looking for broken links, reporting
  each one through |writer|.

  A link is broken if the page it links to does not render successfully or if
  it contains an anchor ref with no corresponding id or name to anchor at.

  |processor| is a function as returned by CreateProcessor.

  |seed_paths| is used to seed the breadth first search.

  |writer| is called once per broken link with a varying number of arguments,
  all of which can be converted to strings.
  '''

  initial_nodes = set(seed_paths)

  # Add the sidebar content for apps and extensions to the queue. The sidenav
  # will not be processed again.
  for path in seed_paths:
    initial_nodes.update(processor(path, 'gc-sidebar')[1])

  queue = deque(initial_nodes)
  # Copy, so extending |searched| during the crawl does not also mutate
  # |initial_nodes|.
  searched = set(initial_nodes)

  # Crawl the doc server.
  while queue:
    node = queue.popleft()

    for edge in processor(node)[1]:
      if edge in searched:
        continue
      searched.add(edge)

      if processor(edge)[0] != 200:
        writer(node, '->', edge)

      queue.append(edge)

  # Go back through links that contain an anchor and check that the target
  # anchor actually exists.
  for node in searched:
    _, __, anchors, anchor_refs = processor(node)

    for ref in anchor_refs:
      if ref.startswith('#'):
        # Same-page anchor. '#top' is always valid.
        if not ref.lstrip('#') in anchors and ref != '#top':
          writer(node, '->', ref)
      else:
        # Split on the first '#' only so a stray extra '#' in the fragment
        # cannot raise ValueError.
        path, lref = ref.split('#', 1)
        status, _, lanchors, __ = processor(path)
        # Any non-200 status means the target page is unreachable; checking
        # its (empty) anchors would misreport this as a missing anchor.
        if status != 200:
          writer(node, '->', ref, '(target page does not exist)')
        elif not lref in lanchors:
          writer(node, '->', ref)
+
not at google - send to devlin 2013/07/01 23:16:21 I would find this file easier to follow if it were
jshumway 2013/07/05 18:48:48 Restructured everything to be more object oriented
+if __name__ == '__main__':
+ import logging
+
+ logging.disable(None)
+ print usage
+ FindBrokenLinks(
+ CreateProcessor(LocalRenderer.Render,exclude=('samples', 'examples')),
+ seed_paths=('/extensions/index.html', '/apps/about_apps.html'))
« no previous file with comments | « no previous file | chrome/common/extensions/docs/server2/handler.py » ('j') | chrome/common/extensions/docs/server2/handler.py » ('J')

Powered by Google App Engine
This is Rietveld 408576698