Chromium Code Reviews

Unified Diff: chrome/common/extensions/docs/server2/link_error_detector.py

Issue 17816005: Doc server broken link detection (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: patch rewrite Created 7 years, 5 months ago
Index: chrome/common/extensions/docs/server2/link_error_detector.py
diff --git a/chrome/common/extensions/docs/server2/link_error_detector.py b/chrome/common/extensions/docs/server2/link_error_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fe72535e9d0254dce2353b3d7721caa1f4322bb
--- /dev/null
+++ b/chrome/common/extensions/docs/server2/link_error_detector.py
@@ -0,0 +1,226 @@
+# Copyright 2013 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+from collections import defaultdict, deque, namedtuple
+from HTMLParser import HTMLParser, HTMLParseError
+import posixpath
+from urlparse import urlsplit
+
+from cron_servlet import CreateURLsFromPaths
+import svn_constants
+
+Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
+
+class _ContentParser(HTMLParser):
+ ''' Find all the links inside a section of the document with an id of
not at google - send to devlin 2013/07/09 00:17:32 s/''' Find/'''Finds/
jshumway 2013/07/17 00:49:55 Done.
+ |container_id|. Pull out all 'id' and 'name' attributes and add them to a set.
+ '''
+
+ def __init__(self, container_id, exclude):
not at google - send to devlin 2013/07/09 00:17:32 what is exclude?
jshumway 2013/07/17 00:49:55 It used to be a way to tell the parser to ignore certain links.
+ HTMLParser.__init__(self)
+ self._container_id = container_id
+ self._in_content = 0
+ self._exclude = exclude
+ self.links = []
+ self.anchors = set()
+
+ def handle_starttag(self, tag, raw_attrs):
+ attrs = dict(raw_attrs)
+
+ if tag == 'div':
+ if attrs.get('id', '') == self._container_id or self._in_content:
not at google - send to devlin 2013/07/09 00:17:32 the ", ''" part seems unnecessary.
jshumway 2013/07/17 00:49:55 indeed it is, removed.
+ self._in_content += 1
+
not at google - send to devlin 2013/07/09 00:17:32 no blank line here
jshumway 2013/07/17 00:49:55 Done.
+ elif self._in_content:
+ href = attrs.get('href')
+ if tag == 'a' and href and not urlsplit(href).scheme in ('http', 'https'):
not at google - send to devlin 2013/07/09 00:17:32 let's have: if tag == 'a': href = attrs.get('href') …
jshumway 2013/07/17 00:49:55 Done.
+ if href.endswith('.html') or '#' in href:
+ if all([e not in href for e in self._exclude]):
not at google - send to devlin 2013/07/09 00:17:32 []s not needed, it can just be all(e not in href for e in self._exclude)
jshumway 2013/07/17 00:49:55 I created such a function. The check for endswith …
+ self.links.append(href)
+
+ if attrs.get('id'):
+ self.anchors.add(attrs['id'])
+
not at google - send to devlin 2013/07/09 00:17:32 no blank line here or below (I know, it's just that …)
jshumway 2013/07/17 00:49:55 Done.
+ if attrs.get('name'):
+ self.anchors.add(attrs['name'])
+
+ def handle_endtag(self, tag):
+ if self._in_content and tag == 'div':
+ self._in_content -= 1
+
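
For reference, a minimal sketch of how this parser is driven (not part of the patch; the HTML snippet and container id are invented):

    parser = _ContentParser('gc-pagecontent', exclude=())
    parser.feed('<div id="gc-pagecontent">'
                '<a href="tabs.html#method-query">query</a>'
                '<h2 id="method-query">query</h2>'
                '</div>')
    print parser.links    # ['tabs.html#method-query']
    print parser.anchors  # set(['method-query'])
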
+class LinkErrorDetector(object):
not at google - send to devlin 2013/07/09 00:17:32 please write tests for this.
jshumway 2013/07/17 00:49:55 Done.
+ ''' Find link errors on the doc server. These include broken links: links
+ whose target page 404s or contains an anchor that doesn't exist; and pages
+ that have no links to them.
+ '''
+
+ def __init__(self,
+ file_system,
+ renderer,
+ public_path,
+ root_pages,
+ exclude=()):
not at google - send to devlin 2013/07/09 00:17:32 explain these arguments
jshumway 2013/07/17 00:49:55 Done.
+ self._file_system = file_system
+ self._renderer = renderer
+ self._public_path = public_path
+ self._exclude = exclude
+ self._pages = defaultdict(lambda: Page(404, (), (), ()))
+ self._root_pages = frozenset(root_pages)
+ self._ignore_detached = frozenset(('apps/404.html', 'extensions/404.html'))
not at google - send to devlin 2013/07/09 00:17:32 always_detached?
jshumway 2013/07/17 00:49:55 Done.
+
+ self._RenderAllPages()
+
+ def _RenderAllPages(self):
+ ''' Traverse the public templates directory rendering each URL and
+ processing the resultant html to pull out all links and anchors.
+ '''
+
+ def process(path, content_from):
+ parser = _ContentParser(content_from, self._exclude)
+ response = self._renderer(path)
+
+ if response.status != 200:
+ return Page(response.status, (), (), ())
+ if not path.endswith('.html'):
+ return Page(200, (), (), ())
+
+ try:
+ parser.feed(str(response.content))
+ except HTMLParseError:
+ return Page(200, (), (), ())
+
+ links, anchors = parser.links, parser.anchors
+ base, _ = path.rsplit('/', 1)
+ edges = []
+ anchor_refs = []
+
+ # Convert relative links to absolute links and categorize links as edges
+ # or anchor_refs.
+ for link in links:
+ # Files like experimental_history.html are referred to with the URL
+ # experimental.history.html.
+ head, last = link.rsplit('/', 1) if '/' in link else ('', link)
+ if '#' in last:
+ last, anchor = last.split('#')
+ else:
+ anchor = ''
+
+ if last.count('.') > 1:
+ last = last.replace('.', '_', last.count('.') - 1)
+ link = posixpath.join(head, last)
+ if anchor:
+ link = '%s#%s' % (link, anchor)
+
+ if link.startswith('#'):
+ anchor_refs.append(link)
+ else:
+ if link.startswith('/'):
+ link = link[1:]
+ else:
+ link = posixpath.normpath('%s/%s' % (base, link))
+
+ if '#' in link:
+ anchor_refs.append(link)
+ else:
+ edges.append(link)
+
+ return Page(200, edges, anchors, anchor_refs)
+
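
The dot-to-underscore fixup is easier to follow on a concrete input; this doctest-style trace uses an invented path:

    >>> import posixpath
    >>> base, link = 'extensions', '../apps/experimental.history.html#frag'
    >>> head, last = link.rsplit('/', 1)
    >>> last, anchor = last.split('#')
    >>> last = last.replace('.', '_', last.count('.') - 1)
    >>> posixpath.normpath('%s/%s' % (base, posixpath.join(head, last)))
    'apps/experimental_history.html'

The '#frag' suffix is then re-attached, so the link is categorized as an anchor_ref rather than an edge.
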
+ top_level_directories = (
+ (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
+ (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
+ )
+
+ for dirpath, urlprefix in top_level_directories:
+ files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
+ for url, path in files:
+ self._pages[url] = process(
+ url, 'gc-container' if url in self._root_pages else 'gc-pagecontent')
not at google - send to devlin 2013/07/09 00:17:32 please tell me we can implement this without needing …
jshumway 2013/07/17 00:49:55 As we talked about in IRC, we can implement this without …
+
+ if self._pages[url].status != 200:
+ print '%s, a url derived from the path %s, resulted in a %s' % (
+ url, dirpath, self._pages[url].status)
+
+ def GetBrokenLinks(self):
+ ''' Find all broken links. A broken link is a link that leads to a page that
+ does not exist (404s when rendered) or that contains an anchor that does not
+ properly resolve.
+
+ Returns a list of tuples, one tuple per broken link. The first item is the
+ page the broken link occurred on; the second is the value of the link; the
+ optional third item carries some debugging information.
not at google - send to devlin 2013/07/09 00:17:32 Can we either always or never have a third argument?
jshumway 2013/07/17 00:49:55 The method now returns two lists, the first list contains …
+ '''
+ broken_links = []
+
+ for url in self._pages.keys():
+ page = self._pages[url]
+ if page.status != 200:
+ continue
+
+ # First test links without anchors.
+ for link in page.links:
+ if self._pages[link].status != 200:
+ broken_links.append((url, link))
+
+ # Then find broken links with an anchor component.
not at google - send to devlin 2013/07/09 00:17:32 use urlparse here?
jshumway 2013/07/17 00:49:55 Done.
+ for ref in page.anchor_refs:
+ if ref.startswith('#'):
+ if not ref.lstrip('#') in page.anchors and ref != '#top':
+ broken_links.append((url, ref))
+ else:
+ path, lref = ref.split('#')
+ target_page = self._pages[path]
+
+ if target_page.status != 200:
+ broken_links.append((url, ref, 'target page does not exist'))
+ elif not lref in target_page.anchors:
+ broken_links.append((url, ref))
+
+ return broken_links
+
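
For illustration, the kind of list GetBrokenLinks produces, with invented URLs; the mix of 2-tuples and 3-tuples is what the review thread above asks about:

    # [('extensions/tabs.html', 'extensions/tabz.html'),
    #  ('apps/about.html', 'apps/manifest.html#no-such-anchor'),
    #  ('apps/about.html', 'gone.html#x', 'target page does not exist')]
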
+ def GetOrphanPages(self, seed_urls=None):
not at google - send to devlin 2013/07/09 00:17:32 Orphaned, not Orphan. Orphan sounds so sad :(
jshumway 2013/07/17 00:49:55 Agreed.
+ ''' Crawl the server to find all pages that are connected to the pages at
+ |seed_url|s. Return the links that are valid on the server but are not
+ part of the connected component containing the |seed_url|s. These pages
+ are orphans and cannot be reached simply by clicking through the server.
+
+ If seed_urls is not provided, root_pages will be used instead.
+ '''
+ seeds = seed_urls or self._root_pages
+ queue = deque(seeds)
not at google - send to devlin 2013/07/09 00:17:32 pages_to_check?
jshumway 2013/07/17 00:49:55 Done.
+ connected = set(seeds)
not at google - send to devlin 2013/07/09 00:17:32 found?
jshumway 2013/07/17 00:49:55 Done.
+
+ while queue:
+ item = queue.popleft()
+
not at google - send to devlin 2013/07/09 00:17:32 no blank line here
jshumway 2013/07/17 00:49:55 Done.
+ for link in self._pages[item].links:
+ if link not in connected:
+ connected.add(link)
+ queue.append(link)
not at google - send to devlin 2013/07/09 00:17:32 if links were a set this would be a lot simpler.
jshumway 2013/07/17 00:49:55 Perhaps. Is it better that it resembles the pseudocode …
+
+ connected |= self._ignore_detached
not at google - send to devlin 2013/07/09 00:17:32 instead of this, subtract ignore_detached from the result.
jshumway 2013/07/17 00:49:55 I moved a bunch of stuff around and |'d the set into connected.
+
+ return [p for p in set(self._pages.keys()) - connected if '.html' in p]
not at google - send to devlin 2013/07/09 00:17:32 when does "if '.html' in p" fail? and see comment …
jshumway 2013/07/17 00:49:55 In many places: png, zip, jpg, gif, py, js, css, t…
+
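
The set suggestion above amounts to computing reachability with set difference; a sketch of that alternative (same semantics, assuming Page.links held sets; the helper name is invented):

    def _FindConnected(self, seeds):
      # Breadth-first reachability: expand the frontier one hop at a time,
      # dropping pages that have already been found.
      found = set(seeds)
      frontier = set(seeds)
      while frontier:
        frontier = set().union(
            *(set(self._pages[p].links) for p in frontier)) - found
        found |= frontier
      return found
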
+def pprint_broken_links(links):
not at google - send to devlin 2013/07/09 00:17:32 move this to integration_test - and call it Pretty…
jshumway 2013/07/17 00:49:55 Done. Moved as a function because it is called twice.
+ ''' Print out broken links in a more readable format.
+ '''
+ headings = [
'page a broken link occurred on',
+ "broken link's href value",
+ 'notes\n'
+ ]
+
+ links.insert(0, headings)
+
+ colw0 = max([len(i[0]) for i in links])
+ colw1 = max([len(i[1]) for i in links])
+
+ for col in links:
+ # The first link, right aligned.
+ print '%*s ->' % (colw0, col[0]),
+ # The second link, left aligned.
+ print '%s%s' % (col[1], (colw1 - len(col[1])) * ' '),
+ # The note, if it exists.
+ print '%s' % col[2] if len(col) == 3 else ''
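
A hypothetical invocation, with invented data, showing the aligned output:

    pprint_broken_links([
        ('apps/about.html', 'apps/missing.html'),
        ('extensions/tabs.html', 'gone.html#x', 'target page does not exist'),
    ])

Note that the heading row is inserted into the caller's list, so the column widths account for the headings as well.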
