Chromium Code Reviews
#!/usr/bin/env python
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import deque
from functools import wraps
from HTMLParser import HTMLParser, HTMLParseError
import os
from re import match

from local_renderer import LocalRenderer

usage = """\
The following broken links exist throughout the docserver. All links have been
made absolute.

Page that contains a broken link -> Content of broken link's href
"""

def Memoize(func):
not at google - send to devlin, 2013/07/01 23:16:21:
  can you use the implementation from json schema co
jshumway, 2013/07/05 18:48:48:
  Oh right. I changed how most of the code works and
  ''' Decorator that caches a function's result, keyed by its positional
  arguments.
  '''
  cache = {}
  @wraps(func)
  def wrap(*args):
    if args not in cache:
      cache[args] = func(*args)
    return cache[args]
  return wrap
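
A minimal sketch of the decorator in use (FetchPage is a hypothetical name,
not part of this change); repeated calls with the same positional arguments
return the cached value instead of re-rendering:

    @Memoize
    def FetchPage(path):
      return LocalRenderer.Render(path)

    FetchPage('/apps/about_apps.html')  # rendered and cached
    FetchPage('/apps/about_apps.html')  # served from the cache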

def PrintFunction(*args):
  for arg in args:
    print arg,
  print

class _ContentParser(HTMLParser):
  ''' Find all the links inside a section of the document with an id of
  |container_id|. Pull out all 'id' and 'name' attributes and add them to a set.
  '''

  def __init__(self, container_id, exclude):
    HTMLParser.__init__(self)
    self._container_id = container_id
    self._in_content = 0
    self._exclude = exclude
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'div':
      if attrs.get('id', '') == self._container_id or self._in_content:
        self._in_content += 1

    elif self._in_content:
      href = attrs.get('href')
      if tag == 'a' and href and not match(r'https?://', href):
        if href.endswith('.html') or '#' in href:
          if all(e not in href for e in self._exclude):
            self.links.append(href)

      if attrs.get('id'):
        self.anchors.add(attrs['id'])

      if attrs.get('name'):
        self.anchors.add(attrs['name'])

  def handle_endtag(self, tag):
    if self._in_content and tag == 'div':
      self._in_content -= 1
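
A quick sketch of the parser on its own (the HTML snippet is made up for
illustration): relative links inside the container div are collected, along
with every id and name that could serve as an anchor target:

    parser = _ContentParser('gc-pagecontent', exclude=())
    parser.feed('<div id="gc-pagecontent">'
                '<a href="tabs.html#method-create">create</a>'
                '<h2 id="overview">Overview</h2>'
                '</div>')
    print parser.links    # ['tabs.html#method-create']
    print parser.anchors  # set(['overview'])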

def CreateProcessor(renderer, exclude=(), writer=PrintFunction):
  ''' Create a function that renders and processes a page. This process can be
  customized by providing a renderer, a callable that will be passed a path and
  is expected to return a response object.

  An exclude list can be used to ignore URLs with certain content: if any
  string in the exclude list appears in a link, that link will not be followed.
  '''

  def processor(path, content_from='gc-pagecontent'):
    ''' Processes a page's content, pulling out relevant information. Such
    information includes: the HTTP status code of the response; a set of all
    links on the page, made absolute; a set of all ids and names on the page
    that are potential anchors; and all the links on the page that contain an
    anchor ref.
    '''

    parser = _ContentParser(content_from, exclude)
    response = renderer(path)

    if response.status != 200:
      return (response.status, [], [], [])

    if not path.endswith('.html'):
      return (200, [], [], [])

    try:
      parser.feed(str(response.content))
    except HTMLParseError:
      if writer:
        writer('Page at %s exists but could not be parsed.' % path)
      return (200, [], [], [])

    links, anchors = parser.links, parser.anchors
    base, _ = path.rsplit('/', 1)
    edges = set()
    anchor_refs = []

    # Convert relative links to absolute links and categorize links as edges
    # or anchor_refs.
    for link in links:
      if link.startswith('#'):
        anchor_refs.append(link)
      else:
        if not link.startswith('/'):
          link = os.path.normpath(base + '/' + link)

        if '#' in link:
          anchor_refs.append(link)
        else:
          edges.add(link)

    return (200, edges, anchors, anchor_refs)

  return Memoize(processor)
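
Since |renderer| is just a callable that returns an object with |status| and
|content| attributes, the processor can be exercised against a stub; the
StubResponse class and the sample page below are illustrative, not part of
the server:

    class StubResponse(object):
      def __init__(self, status, content):
        self.status = status
        self.content = content

    pages = {
      '/apps/index.html': StubResponse(200,
          '<div id="gc-pagecontent"><a href="missing.html">gone</a></div>'),
    }

    process = CreateProcessor(
        lambda path: pages.get(path, StubResponse(404, '')))
    status, edges, anchors, anchor_refs = process('/apps/index.html')
    print edges  # set(['/apps/missing.html'])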

def FindBrokenLinks(processor, seed_paths, writer=PrintFunction):
  ''' Crawl the doc server looking for broken links.

  A link is broken if the page it links to 404s, or if it contains an anchor
  ref with no corresponding id or name to anchor to.

  |seed_paths| is used to seed the breadth first search.

  If more specialized behavior than printing out the broken links is required,
  a |writer| function can be supplied. This function will be called with a
  varying number of arguments, all of which can be converted to strings.
not at google - send to devlin, 2013/07/01 23:16:21:
  Just return the broken links in some format. The c
jshumway, 2013/07/05 18:48:48:
  Done.
  '''

  initial_nodes = set(seed_paths)

  # Add the sidebar content for apps and extensions to the queue. The sidenav
  # will not be processed again.
  for path in seed_paths:
    initial_nodes.update(processor(path, 'gc-sidebar')[1])

  queue = deque(initial_nodes)
  searched = initial_nodes

  # Crawl the doc server.
  while queue:
    node = queue.popleft()

    edges = processor(node)[1]
    for edge in edges:
      if edge not in searched:
        searched.add(edge)
        status = processor(edge)[0]

        if status != 200:
          writer(node, '->', edge)

        queue.append(edge)

  # Go back through links that contain an anchor.
  for node in searched:
    _, __, anchors, anchor_refs = processor(node)

    for ref in anchor_refs:
      if ref.startswith('#'):
        if ref.lstrip('#') not in anchors and ref != '#top':
          writer(node, '->', ref)
      else:
        path, lref = ref.split('#')
        status, _, lanchors, __ = processor(path)
        if status == 404:
          writer(node, '->', ref, '(target page does not exist)')
        elif lref not in lanchors:
          writer(node, '->', ref)

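Because |writer| only receives positional, string-convertible arguments, a
caller that wants the broken links back as data rather than printed output
can pass a collecting function; a sketch, not part of this change:

    broken = []
    FindBrokenLinks(
        CreateProcessor(LocalRenderer.Render),
        seed_paths=('/extensions/index.html',),
        writer=lambda *args: broken.append(' '.join(str(a) for a in args)))
    # |broken| now holds entries like '/extensions/index.html -> /missing.html'
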
not at google - send to devlin, 2013/07/01 23:16:21:
  I would find this file easier to follow if it were
jshumway, 2013/07/05 18:48:48:
  Restructured everything to be more object oriented
if __name__ == '__main__':
  import logging

  logging.disable(None)
  print usage
  FindBrokenLinks(
      CreateProcessor(LocalRenderer.Render, exclude=('samples', 'examples')),
      seed_paths=('/extensions/index.html', '/apps/about_apps.html'))