Chromium Code Reviews
Index: chrome/common/extensions/docs/server2/find_broken_links.py
diff --git a/chrome/common/extensions/docs/server2/find_broken_links.py b/chrome/common/extensions/docs/server2/find_broken_links.py
new file mode 100755
index 0000000000000000000000000000000000000000..cda5d18071e5762e4bf8dfe0d9c7da9f534a74de
--- /dev/null
+++ b/chrome/common/extensions/docs/server2/find_broken_links.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python
+# Copyright 2013 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+from collections import deque
+from functools import wraps
+from HTMLParser import HTMLParser, HTMLParseError
+import os
+from re import match
+
+from local_renderer import LocalRenderer
+
+usage = """\
+The following broken links exist throughout the docserver. All links have been
+made absolute.
+
+Page that contains a broken link -> Content of broken link's href
+"""
+
+def Memoize(func):
not at google - send to devlin  2013/07/01 23:16:21
can you use the implementation from json schema compiler …
jshumway  2013/07/05 18:48:48
Oh right. I changed how most of the code works and …
+  cache = {}
+  @wraps(func)
+  def wrap(*args):
+    if args not in cache:
+      cache[args] = func(*args)
+    return cache[args]
+  return wrap
+
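For context, a minimal sketch of how Memoize behaves (the Render function
below is hypothetical, not part of the patch):

  def Render(path):
    # Hypothetical expensive call that we only want to make once per path.
    print 'rendering', path
    return len(path)

  Render = Memoize(Render)
  Render('/apps')  # prints 'rendering /apps' and returns 5
  Render('/apps')  # cache hit: returns 5 without printing again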
+def PrintFunction(*args):
+  ''' Default |writer|: prints its arguments space-separated on one line. '''
+  for arg in args:
+    print arg,
+  print
+
+class _ContentParser(HTMLParser):
+  ''' Finds all links inside the section of the document whose id is
+  |container_id|, and collects every 'id' and 'name' attribute into a set of
+  potential anchors.
+  '''
+
+  def __init__(self, container_id, exclude):
+    HTMLParser.__init__(self)
+    self._container_id = container_id
+    self._in_content = 0
+    self._exclude = exclude
+    self.links = []
+    self.anchors = set()
+
+  def handle_starttag(self, tag, raw_attrs):
+    attrs = dict(raw_attrs)
+
+    if tag == 'div':
+      if attrs.get('id', '') == self._container_id or self._in_content:
+        self._in_content += 1
+
+    elif self._in_content:
+      href = attrs.get('href')
+      if tag == 'a' and href and not match(r'https?://', href):
+        if href.endswith('.html') or '#' in href:
+          if all(e not in href for e in self._exclude):
+            self.links.append(href)
+
+      if attrs.get('id'):
+        self.anchors.add(attrs['id'])
+
+      if attrs.get('name'):
+        self.anchors.add(attrs['name'])
+
+  def handle_endtag(self, tag):
+    if self._in_content and tag == 'div':
+      self._in_content -= 1
+
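To make the parser's contract concrete, a small sketch of feeding it a
fragment (the markup is made up for illustration):

  parser = _ContentParser('gc-pagecontent', exclude=('samples',))
  parser.feed('<div id="gc-pagecontent">'
              '<a href="tabs.html">Tabs</a>'
              '<h2 id="overview">Overview</h2>'
              '</div>')
  # parser.links   == ['tabs.html']
  # parser.anchors == set(['overview'])

Links containing an excluded substring ('samples' here) and external http(s)
URLs are skipped; only hrefs ending in '.html' or containing '#' are recorded.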
+def CreateProcessor(renderer, exclude=(), writer=PrintFunction):
+  ''' Create a function that renders and processes a page. The process can be
+  customized by providing a |renderer|: a callable that is passed a path and
+  is expected to return a response object.
+
+  The |exclude| list is used to ignore URLs that contain certain substrings:
+  if any string in the exclude list appears in a link, that link will not be
+  followed.
+  '''
+
+  def processor(path, content_from='gc-pagecontent'):
+    ''' Processes a page's content, pulling out relevant information: the
+    HTTP status code of the response; a set of all links on the page, made
+    absolute; a set of all ids and names on the page that are potential
+    anchors; and all the links on the page that contain an anchor id.
+    '''
+
+    parser = _ContentParser(content_from, exclude)
+    response = renderer(path)
+
+    if response.status != 200:
+      return (response.status, [], [], [])
+
+    if not path.endswith('.html'):
+      return (200, [], [], [])
+
+    try:
+      parser.feed(str(response.content))
+    except HTMLParseError:
+      if writer:
+        writer('Page at %s exists but could not be parsed.' % path)
+      return (200, [], [], [])
+
+    links, anchors = parser.links, parser.anchors
+    base, _ = path.rsplit('/', 1)
+    edges = set()
+    anchor_refs = []
+
+    # Convert relative links to absolute links and categorize links as edges
+    # or anchor_refs.
+    for link in links:
+      if link.startswith('#'):
+        anchor_refs.append(link)
+      else:
+        if not link.startswith('/'):
+          link = os.path.normpath(base + '/' + link)
+
+        if '#' in link:
+          anchor_refs.append(link)
+        else:
+          edges.add(link)
+
+    return (200, edges, anchors, anchor_refs)
+
+  return Memoize(processor)
+
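A sketch of the resulting processor against a stub renderer, showing the
early-exit path and the memoization (the stub response below is made up; the
real entry point is LocalRenderer.Render, used at the bottom of this file):

  class _StubResponse(object):
    status = 404
    content = ''

  process = CreateProcessor(lambda path: _StubResponse())
  print process('/extensions/tabs.html')  # (404, [], [], [])
  print process('/extensions/tabs.html')  # cached: renderer not called again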
+def FindBrokenLinks(processor, seed_paths, writer=PrintFunction):
+  ''' Crawl the doc server looking for broken links.
+
+  A link is broken if the page it links to 404s or if it contains an anchor
+  ref with no corresponding id or name to anchor at.
+
+  |seed_paths| is used to seed the breadth-first search.
+
+  If more specialized behavior than printing out the broken links is required,
+  a |writer| function can be supplied. This function will be called with a
+  varying number of arguments, all of which can be converted to strings.
not at google - send to devlin  2013/07/01 23:16:21
Just return the broken links in some format. The c…
jshumway  2013/07/05 18:48:48
Done.
+  '''
+
+  initial_nodes = set(seed_paths)
+
+  # Add the sidebar content for apps and extensions to the queue. The sidenav
+  # will not be processed again.
+  for path in seed_paths:
+    initial_nodes.update(processor(path, 'gc-sidebar')[1])
+
+  queue = deque(initial_nodes)
+  searched = initial_nodes
+
+  # Crawl the doc server.
+  while queue:
+    node = queue.popleft()
+
+    edges = processor(node)[1]
+    for edge in edges:
+      if edge not in searched:
+        searched.add(edge)
+        status = processor(edge)[0]
+
+        if status != 200:
+          writer(node, '->', edge)
+
+        queue.append(edge)
+
+  # Go back through links that contain an anchor.
+  for node in searched:
+    _, __, anchors, anchor_refs = processor(node)
+
+    for ref in anchor_refs:
+      if ref.startswith('#'):
+        if ref.lstrip('#') not in anchors and ref != '#top':
+          writer(node, '->', ref)
+      else:
+        path, lref = ref.split('#', 1)
+        status, _, lanchors, __ = processor(path)
+        if status == 404:
+          writer(node, '->', ref, '(target page does not exist)')
+        elif lref not in lanchors:
+          writer(node, '->', ref)
+
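Because |writer| is pluggable, the broken links can be collected rather than
printed, which is roughly what the earlier review comment asks for; a sketch:

  broken = []
  def CollectWriter(*args):
    broken.append(' '.join(str(arg) for arg in args))

  # FindBrokenLinks(processor, seed_paths, writer=CollectWriter)
  # 'broken' then holds one formatted entry per broken link.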
not at google - send to devlin  2013/07/01 23:16:21
I would find this file easier to follow if it were…
jshumway  2013/07/05 18:48:48
Restructured everything to be more object oriented…
+if __name__ == '__main__':
+  import logging
+
+  logging.disable(None)
+  print usage
+  FindBrokenLinks(
+      CreateProcessor(LocalRenderer.Render, exclude=('samples', 'examples')),
+      seed_paths=('/extensions/index.html', '/apps/about_apps.html'))