Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(746)

Side by Side Diff: chrome/common/extensions/docs/server2/link_error_detector.py

Issue 17816005: Doc server broken link detection (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: patch rewrite Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 # Copyright 2013 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
4
5 from collections import defaultdict, deque, namedtuple
6 from HTMLParser import HTMLParser, HTMLParseError
7 import posixpath
8 from urlparse import urlsplit
9
10 from cron_servlet import CreateURLsFromPaths
11 import svn_constants
12
# A rendered page: its HTTP |status|, outgoing page |links|, the anchor
# ids/names defined on it, and the anchor references ('#'-links) found on it.
Page = namedtuple('Page', ['status', 'links', 'anchors', 'anchor_refs'])
14
15 class _ContentParser(HTMLParser):
16 ''' Find all the links inside a section of the document with an id of
not at google - send to devlin 2013/07/09 00:17:32 s/''' Find/'''Finds/
jshumway 2013/07/17 00:49:55 Done.
17 |container_id|. Pull out all 'id' and 'name' attributes and add them to a set.
18 '''
19
20 def __init__(self, container_id, exclude):
not at google - send to devlin 2013/07/09 00:17:32 what is exclude?
jshumway 2013/07/17 00:49:55 It used to be a way to tell the parser to ignore c
21 HTMLParser.__init__(self)
22 self._container_id = container_id
23 self._in_content = 0
24 self._exclude = exclude
25 self.links = []
26 self.anchors = set()
27
28 def handle_starttag(self, tag, raw_attrs):
29 attrs = dict(raw_attrs)
30
31 if tag == 'div':
32 if attrs.get('id', '') == self._container_id or self._in_content:
not at google - send to devlin 2013/07/09 00:17:32 the ", ''" part seems unnecessary.
jshumway 2013/07/17 00:49:55 indeed it is, removed.
33 self._in_content += 1
34
not at google - send to devlin 2013/07/09 00:17:32 no blank line here
jshumway 2013/07/17 00:49:55 Done.
35 elif self._in_content:
36 href = attrs.get('href')
37 if tag == 'a' and href and not urlsplit(href).scheme in ('http', 'https'):
not at google - send to devlin 2013/07/09 00:17:32 let's have: if tag == 'a': href = attrs.get('hr
jshumway 2013/07/17 00:49:55 Done.
38 if href.endswith('.html') or '#' in href:
39 if all([e not in href for e in self._exclude]):
not at google - send to devlin 2013/07/09 00:17:32 []s not needed, it can just be all(e not in href..
jshumway 2013/07/17 00:49:55 I created such a function. There check for endswi
40 self.links.append(href)
41
42 if attrs.get('id'):
43 self.anchors.add(attrs['id'])
44
not at google - send to devlin 2013/07/09 00:17:32 no blank line here or below (I know, it's just tha
jshumway 2013/07/17 00:49:55 Done.
45 if attrs.get('name'):
46 self.anchors.add(attrs['name'])
47
48 def handle_endtag(self, tag):
49 if self._in_content and tag == 'div':
50 self._in_content -= 1
51
class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links — links
  whose target page 404s or whose anchor does not resolve — and orphaned
  pages, which no other page on the server links to.
  '''

  def __init__(self,
               file_system,
               renderer,
               public_path,
               root_pages,
               exclude=()):
    '''Args:
      file_system: the file system used to enumerate the template paths.
      renderer: a callable mapping a URL to a response object exposing
          .status and .content.
      public_path: the path to the directory of public templates.
      root_pages: iterable of URLs used as crawl seeds for orphan detection
          (rendered with the 'gc-container' section).
      exclude: iterable of substrings; links containing any are ignored.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    self._exclude = exclude
    # Maps URL -> Page. Unknown URLs default to a 404 page with no links.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages that legitimately have no inbound links; never report as orphans.
    self._ignore_detached = frozenset(('apps/404.html', 'extensions/404.html'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory, rendering each URL and
    processing the resulting html to pull out all links and anchors.
    '''
    def process(path, content_from):
      # Render |path| and parse the links/anchors out of the section with id
      # |content_from|, returning a Page record.
      parser = _ContentParser(content_from, self._exclude)
      response = self._renderer(path)

      if response.status != 200:
        return Page(response.status, (), (), ())
      if not path.endswith('.html'):
        return Page(200, (), (), ())

      try:
        parser.feed(str(response.content))
      except HTMLParseError:
        return Page(200, (), (), ())

      links, anchors = parser.links, parser.anchors
      base, _ = path.rsplit('/', 1)
      edges = []
      anchor_refs = []

      # Convert relative links to absolute links and categorize them as
      # plain page links (edges) or links with an anchor component.
      for link in links:
        # Files like experimental_history.html are referred to with the URL
        # experimental.history.html.
        head, last = link.rsplit('/', 1) if '/' in link else ('', link)
        if '#' in last:
          # Split on the first '#' only so malformed multi-'#' hrefs do not
          # raise; everything after the first '#' is the anchor.
          last, anchor = last.split('#', 1)
        else:
          anchor = ''

        if last.count('.') > 1:
          # All dots but the final one (the extension separator) map to '_'.
          last = last.replace('.', '_', last.count('.') - 1)
          link = posixpath.join(head, last)
          if anchor:
            link = '%s#%s' % (link, anchor)

        if link.startswith('#'):
          anchor_refs.append(link)
        else:
          if link.startswith('/'):
            link = link[1:]
          else:
            link = posixpath.normpath('%s/%s' % (base, link))

          if '#' in link:
            anchor_refs.append(link)
          else:
            edges.append(link)

      return Page(200, edges, anchors, anchor_refs)

    top_level_directories = (
      (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
      (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        # Root pages are rendered inside 'gc-container'; everything else is
        # inside 'gc-pagecontent'.
        self._pages[url] = process(
            url, 'gc-container' if url in self._root_pages else 'gc-pagecontent')

        if self._pages[url].status != 200:
          print (url, ', a url derived from the path', dirpath +
              ', resulted in a', self._pages[url].status)

  def GetBrokenLinks(self):
    '''Finds all broken links. A broken link is a link that leads to a page
    that does not exist (404s when rendered) or that contains an anchor that
    does not properly resolve.

    Returns a list of tuples, one per broken link: the page the broken link
    occurred on, the link's href value, and an optional third item with
    debugging information.
    '''
    broken_links = []

    # Snapshot the urls with keys(): looking up missing link targets below
    # inserts default 404 entries into the defaultdict while we iterate.
    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue

      # First test links without anchors.
      for link in page.links:
        if self._pages[link].status != 200:
          broken_links.append((url, link))

      # Then find broken links with an anchor component.
      for ref in page.anchor_refs:
        if ref.startswith('#'):
          # Same-page reference; '#top' is always valid.
          if ref[1:] not in page.anchors and ref != '#top':
            broken_links.append((url, ref))
        else:
          path, lref = ref.split('#', 1)
          target_page = self._pages[path]

          if target_page.status != 200:
            broken_links.append((url, ref, 'target page does not exist'))
          elif lref not in target_page.anchors:
            broken_links.append((url, ref))

    return broken_links

  def GetOrphanPages(self, seed_urls=None):
    '''Crawls the server to find all pages connected to the pages at
    |seed_urls| (the root pages when not provided). Returns the pages that
    are valid on the server but are not part of the connected component
    containing the |seed_urls|; these orphans cannot be reached simply by
    clicking through the server.
    '''
    seeds = seed_urls or self._root_pages
    queue = deque(seeds)
    connected = set(seeds)

    # Breadth-first traversal of the link graph.
    while queue:
      item = queue.popleft()
      for link in self._pages[item].links:
        if link not in connected:
          connected.add(link)
          queue.append(link)

    # Pages expected to have no inbound links are never orphans.
    connected |= self._ignore_detached

    # Only html pages count; other resources (images, zips, scripts) are not
    # expected to be linked from page content.
    return [p for p in set(self._pages.keys()) - connected if '.html' in p]
205
206 def pprint_broken_links(links):
not at google - send to devlin 2013/07/09 00:17:32 move this to integration_test - and call it Pretty
jshumway 2013/07/17 00:49:55 Done. Moved as a function because it is called twi
207 ''' Print out broken links in a more readable format.
208 '''
209 headings = [
210 'page a broken link occurred',
211 "broken link's href value",
212 'notes\n'
213 ]
214
215 links.insert(0, headings)
216
217 colw0 = max([len(i[0]) for i in links])
218 colw1 = max([len(i[1]) for i in links])
219
220 for col in links:
221 # The first link, right aligned.
222 print '%*s ->' % (colw0, col[0]),
223 # The second link, left aligned.
224 print '%s%s' % (col[1], (colw1 - len(col[1])) * ' '),
225 # The note, if it exists.
226 print '%s' % col[2] if len(col) == 3 else ''
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698