Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(177)

Side by Side Diff: chrome/common/extensions/docs/server2/find_broken_links.py

Issue 17816005: Doc server broken link detection (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Additional information for broken anchor links Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright 2013 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 from collections import deque
7 from functools import wraps
8 from HTMLParser import HTMLParser, HTMLParseError
9 import os
10 from re import match
11
12 from local_renderer import LocalRenderer
13
# Header printed before the broken-link report when this module is run as a
# script (see the __main__ block at the bottom of the file).
usage = """\
The following broken links exist throughout the docserver. All links have been
made absolute.

Page that contains a broken link -> Content of broken link's href
"""
20
def Memoize(func):
  ''' Decorator that caches |func|'s results, keyed by its tuple of
  positional arguments. Arguments must be hashable.
  '''
  results = {}
  @wraps(func)
  def memoized(*args):
    try:
      return results[args]
    except KeyError:
      value = func(*args)
      results[args] = value
      return value
  return memoized
29
def PrintFunction(*args):
  ''' Default writer: prints |args| space-separated on a single line,
  equivalent to the Python 2 statement `print arg1, arg2, ...`.
  '''
  # A single join is clearer than the per-argument print loop and behaves
  # identically under both Python 2 and Python 3.
  print(' '.join(str(arg) for arg in args))
34
35 class _ContentParser(HTMLParser):
36 ''' Find all the links inside a section of the document with an id of
37 |container_id|. Pull out all 'id' and 'name' attributes and add them to a set.
38 '''
39
40 def __init__(self, container_id, exclude):
41 HTMLParser.__init__(self)
42 self._container_id = container_id
43 self._in_content = 0
44 self._exclude = exclude
45 self.links = []
46 self.anchors = set()
47
48 def handle_starttag(self, tag, raw_attrs):
49 attrs = dict(raw_attrs)
50
51 if tag == 'div':
52 if attrs.get('id', '') == self._container_id or self._in_content:
53 self._in_content += 1
54
55 elif self._in_content:
56 href = attrs.get('href')
57 if tag == 'a' and href and not match(r'https?://', href):
58 if href.endswith('.html') or '#' in href:
59 if all([e not in href for e in self._exclude]):
60 self.links.append(href)
61
62 if attrs.get('id'):
63 self.anchors.add(attrs['id'])
64
65 if attrs.get('name'):
66 self.anchors.add(attrs['name'])
67
68 def handle_endtag(self, tag):
69 if self._in_content and tag == 'div':
70 self._in_content -= 1
71
def CreateProcessor(renderer, exclude=(), writer=PrintFunction):
  ''' Build a memoized function that renders and processes a page.

  |renderer| is a callable that takes a server path and returns a response
  object with |status| and |content| attributes. Links whose href contains
  any string in |exclude| are skipped. |writer|, if truthy, is called with a
  message when a page exists but cannot be parsed.
  '''

  def processor(path, content_from='gc-pagecontent'):
    ''' Render |path| and pull relevant information out of the section whose
    id is |content_from|.

    Returns a 4-tuple (status, edges, anchors, anchor_refs): the response's
    http status; a set of absolute anchor-free links; the set of all id/name
    attributes that are potential anchor targets; and a list of links that
    contain an anchor id.
    '''
    parser = _ContentParser(content_from, exclude)
    response = renderer(path)

    # Failed renders and non-HTML resources have no links worth extracting.
    if response.status != 200:
      return (response.status, [], [], [])
    if not path.endswith('.html'):
      return (200, [], [], [])

    try:
      parser.feed(str(response.content))
    except HTMLParseError:
      if writer:
        writer('Page at %s exists but could not be parsed.' % path)
      return (200, [], [], [])

    # Directory of the current page, used to absolutize relative links.
    base = path.rsplit('/', 1)[0]
    edges = set()
    anchor_refs = []

    for link in parser.links:
      # Same-page refs ('#...') and already-absolute links are untouched;
      # relative links are resolved against |base|. Links with an anchor go
      # to |anchor_refs|, the rest become crawl edges.
      if not (link.startswith('#') or link.startswith('/')):
        link = os.path.normpath(base + '/' + link)

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.add(link)

    return (200, edges, parser.anchors, anchor_refs)

  return Memoize(processor)
128
def FindBrokenLinks(processor, seed_paths, writer=PrintFunction):
  ''' Crawl the doc server breadth-first and report broken links via |writer|.

  A link is broken if the page it links to does not render with status 200,
  or if it contains an anchor ref with no corresponding id or name attribute
  on the target page.

  |processor| is a callable of the shape returned by CreateProcessor: it maps
  (path[, container_id]) to a (status, edges, anchors, anchor_refs) tuple.
  |seed_paths| seeds the breadth-first search. |writer| is called with a
  varying number of arguments, all convertible to strings, once per broken
  link found.
  '''

  initial_nodes = set(seed_paths)

  # Also seed with the sidebar links of each seed page, so the sidenav is
  # processed once here rather than on every page that embeds it.
  for path in seed_paths:
    initial_nodes.update(processor(path, 'gc-sidebar')[1])

  queue = deque(initial_nodes)
  # |searched| deliberately aliases |initial_nodes|: both name the one set of
  # paths already visited.
  searched = initial_nodes

  # Crawl the doc server, reporting pages that fail to render and enqueueing
  # each newly discovered page exactly once.
  while queue:
    node = queue.popleft()

    edges = processor(node)[1]
    for edge in edges:
      if edge not in searched:
        searched.add(edge)
        status = processor(edge)[0]

        if status != 200:
          writer(node, '->', edge)

        queue.append(edge)

  # Go back through links that contain an anchor.
  for node in searched:
    _, __, anchors, anchor_refs = processor(node)

    for ref in anchor_refs:
      if ref.startswith('#'):
        # Same-page anchor. '#top' is exempted: the code treats it as always
        # present.
        if ref.lstrip('#') not in anchors and ref != '#top':
          writer(node, '->', ref)
      else:
        # Split on the first '#' only, so an href containing several '#'
        # characters no longer raises ValueError.
        path, lref = ref.split('#', 1)
        status, _, lanchors, __ = processor(path)
        if status == 404:
          writer(node, '->', ref, '(target page does not exist)')
        elif lref not in lanchors:
          writer(node, '->', ref)
182
not at google - send to devlin 2013/07/01 23:16:21 I would find this file easier to follow if it were
jshumway 2013/07/05 18:48:48 Restructured everything to be more object oriented
183 if __name__ == '__main__':
184 import logging
185
186 logging.disable(None)
187 print usage
188 FindBrokenLinks(
189 CreateProcessor(LocalRenderer.Render,exclude=('samples', 'examples')),
190 seed_paths=('/extensions/index.html', '/apps/about_apps.html'))
OLDNEW
« no previous file with comments | « no previous file | chrome/common/extensions/docs/server2/handler.py » ('j') | chrome/common/extensions/docs/server2/handler.py » ('J')

Powered by Google App Engine
This is Rietveld 408576698