Chromium Code Reviews

Side by Side Diff: chrome/common/extensions/docs/server2/link_error_detector.py

Issue 17816005: Doc server broken link detection (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: finalization Created 7 years, 5 months ago
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
import svn_constants

Page = namedtuple('Page', 'status, links, anchors, anchor_refs')

def _SplitAnchor(url):
  components = urlsplit(url)
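  # urlsplit separates the fragment, e.g. 'page.html#section' splits into
  # path 'page.html' and fragment 'section', and '#top' into '' and 'top'.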
  return components.path, components.fragment

def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the hrefs of all the links that occurred on the page,
  all of the anchors on the page (ids and names), and all links that contain
  an anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  base, _ = path.rsplit('/', 1)
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
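      # E.g. a link to 'experimental.history.html' becomes
      # 'experimental_history.html' here, matching the underlying file name.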
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)

def _CategorizeBrokenLinks(url, page, pages):
  '''Find all the broken links on a page and categorize them as either
  broken_links, which link to a page that 404s, or broken_anchors, which point
  at an anchor that does not exist. |page| is the page to search at |url|;
  |pages| is a callable that takes a path and returns a Page. Returns two
  lists: the first of all the broken_links, the second of all the
  broken_anchors.
  '''
  broken_links = []
  broken_anchors = []

  # First test links without anchors.
  for link in page.links:
    if pages(link).status != 200:
      broken_links.append((url, link))

  # Then find broken links with an anchor component.
  for ref in page.anchor_refs:
    path, anchor = _SplitAnchor(ref)

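    # A ref with an empty path (e.g. just '#usage') refers to an anchor on the
    # current page; the anchor 'top' is never reported as broken.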
    if path == '':
      if not anchor in page.anchors and anchor != 'top':
        broken_anchors.append((url, ref))
    else:
      target_page = pages(path)
      if target_page.status != 200:
        broken_links.append((url, ref))
      elif not anchor in target_page.anchors:
        broken_anchors.append((url, ref))

  return broken_links, broken_anchors

class _ContentParser(HTMLParser):
  '''Parse an html file pulling out all links and anchor_refs, where an
  anchor_ref is a link that contains an anchor.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Ignore hrefs that are empty or only whitespace, are just a '.'
      # (period), contain python templating code, are an absolute http(s)
      # url, point at a zip file, or execute javascript on the page.
      href = attrs.get('href', '').strip()
      if href and not href == '.' and not '{{' in href:
        if not urlsplit(href).scheme in ('http', 'https'):
          if not href.endswith('.zip') and not 'javascript:' in href:
            self.links.append(href)

    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])

class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links (links
  whose target page 404s or that contain an anchor that doesn't exist) and
  pages that have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    self._always_detached = frozenset(('apps/404.html', 'extensions/404.html'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public template, static, and examples directories,
    rendering each URL and processing the resultant html to pull out all
    links and anchors.
    '''
    top_level_directories = (
      (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
      (svn_constants.STATIC_PATH, 'static/'),
      (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
    )
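    # Each (directory, url prefix) pair maps a source directory to the prefix
    # its files are served under; CreateURLsFromPaths yields (url, path) pairs
    # for every file beneath the directory.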

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print('%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status))

  def GetBrokenLinks(self):
    '''Finds all broken links. A broken link is a link that leads to a page
    that does not exist (404s when rendered) or that contains an anchor that
    does not properly resolve.

    Returns a pair of lists: the first contains all of the links that lead to
    a non-existent page, the second all of the links that contain a broken
    anchor. Each item in the lists is a tuple of the page a broken link
    occurred on and the href of the broken link.
    '''
    broken_links = []
    broken_anchors = []

    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      links, anchors = _CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x])

      broken_links.extend(links)
      broken_anchors.extend(anchors)

    return broken_links, broken_anchors

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages that are connected to the pages in
    |root_pages|. Returns the URLs that are valid on the server but are not
    part of the connected component containing |root_pages|. These pages are
    orphans and cannot be reached simply by clicking through the server.
    '''
    pages_to_check = deque(self._root_pages)
    found = set(self._root_pages) | self._always_detached

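    # Breadth-first search over the link graph, starting from the root pages;
    # any renderable .html page never reached is reported as an orphan.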
    while pages_to_check:
      item = pages_to_check.popleft()
      for link in self._pages[item].links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        [url for url, page in self._pages.iteritems() if page.status == 200])

    return [url for url in all_urls - found if url.endswith('.html')]
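
A minimal sketch of how the detector might be driven, for context. The file
system, the Render callable, and the specific root page URLs below are
illustrative placeholders, not part of this patch; the server wires in its own
equivalents.

# Hypothetical wiring; file_system, Render and the root pages are placeholders
# for whatever the doc server actually provides.
detector = LinkErrorDetector(
    file_system,
    Render,
    svn_constants.PUBLIC_TEMPLATE_PATH,
    root_pages=('extensions/index.html', 'apps/about_apps.html'))

broken_links, broken_anchors = detector.GetBrokenLinks()
for source_page, href in broken_links + broken_anchors:
  print('%s has a broken link: %s' % (source_page, href))

for orphan in detector.GetOrphanedPages():
  print('%s is not reachable from the root pages' % orphan)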
