Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(850)

Unified Diff: chrome/common/extensions/docs/server2/link_error_detector.py

Issue 17816005: Doc server broken link detection (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: finalization Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/common/extensions/docs/server2/link_error_detector.py
diff --git a/chrome/common/extensions/docs/server2/link_error_detector.py b/chrome/common/extensions/docs/server2/link_error_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..d49e86b62c447d27b9ef79c8b3fb8ce9cb8f3128
--- /dev/null
+++ b/chrome/common/extensions/docs/server2/link_error_detector.py
@@ -0,0 +1,219 @@
+# Copyright 2013 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+from collections import defaultdict, deque, namedtuple
+from HTMLParser import HTMLParser, HTMLParseError
+import posixpath
+from urlparse import urlsplit
+
+from file_system_util import CreateURLsFromPaths
+import svn_constants
+
# Result of rendering one page: http-like status of the render plus the
# links/anchors inventory extracted from the page's html.
Page = namedtuple('Page', ['status', 'links', 'anchors', 'anchor_refs'])
+
+def _SplitAnchor(url):
+ components = urlsplit(url)
+ return components.path, components.fragment
+
def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    # Malformed html is treated as having no links or anchors rather than
    # aborting the whole crawl.
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  # Directory containing this page; a top-level path like 'index.html' has an
  # empty base. (Unconditional 'base, _ = path.rsplit("/", 1)' would raise
  # ValueError for such paths.)
  base = path.rsplit('/', 1)[0] if '/' in path else ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      # Replace every '.' except the one before 'html' with '_'.
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      # Same-page anchor reference.
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath(posixpath.join(base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)
+
def _CategorizeBrokenLinks(url, page, pages):
  '''Find all the broken links on a page and categorize them as either
  broken_links, which link to a page that 404s, or broken_anchors. |page| is
  the page to search at |url|, |pages| is a callable that takes a path and
  returns a Page. Returns two lists, the first of all the broken_links, the
  second of all the broken_anchors.
  '''
  # Anchor-free links are broken exactly when their target fails to render.
  broken_links = [(url, link) for link in page.links
                  if pages(link).status != 200]
  broken_anchors = []

  # Links with an anchor component must also resolve the anchor itself.
  for ref in page.anchor_refs:
    path, anchor = _SplitAnchor(ref)

    if not path:
      # Same-page reference; 'top' is always implicitly valid.
      if anchor not in page.anchors and anchor != 'top':
        broken_anchors.append((url, ref))
      continue

    target_page = pages(path)
    if target_page.status != 200:
      broken_links.append((url, ref))
    elif anchor not in target_page.anchors:
      broken_anchors.append((url, ref))

  return broken_links, broken_anchors
+
+class _ContentParser(HTMLParser):
+ '''Parse an html file pulling out all links and anchor_refs, where an
+ anchor_ref is a link that contains an anchor.
+ '''
+
+ def __init__(self):
+ HTMLParser.__init__(self)
+ self.links = []
+ self.anchors = set()
+
+ def handle_starttag(self, tag, raw_attrs):
+ attrs = dict(raw_attrs)
+
+ if tag == 'a':
+ # Handle special cases for href's that: start with a space, contain
+ # just a '.' (period), contain python templating code, are an absolute
+ # url, are a zip file, or execute javascript on the page.
+ href = attrs.get('href', '').strip()
+ if href and not href == '.' and not '{{' in href:
+ if not urlsplit(href).scheme in ('http', 'https'):
+ if not href.endswith('.zip') and not 'javascript:' in href:
+ self.links.append(href)
+
+ if attrs.get('id'):
+ self.anchors.add(attrs['id'])
+ if attrs.get('name'):
+ self.anchors.add(attrs['name'])
+
class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those
  with a target page that 404s or contain an anchor that doesn't exist, or
  pages that have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # Any page not explicitly rendered resolves to a 404 placeholder, so
    # lookups for broken targets never raise.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # The 404 pages are intentionally unlinked; never report them as orphans.
    self._always_detached = frozenset(('apps/404.html', 'extensions/404.html'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    top_level_directories = (
      (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
      (svn_constants.STATIC_PATH, 'static/'),
      (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, _ in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          # Print one formatted string; the original print(a, b, c) emitted a
          # tuple repr under Python 2.
          print('%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status))

  def GetBrokenLinks(self):
    '''Finds all broken links. A broken link is a link that leads to a page
    that does not exist (404s when rendered) or that contains an anchor that
    does not properly resolve.

    Returns a pair of lists, the first all of the links that lead to a
    non-existent page, the second all of the links that contain a broken
    anchor. Each item in the lists is a tuple of the page a broken link
    occurred on and the href of the broken link.
    '''
    broken_links = []
    broken_anchors = []
    # Look up target pages without mutating |_pages| while iterating it (a
    # defaultdict inserts on __getitem__); unknown targets fall back to a
    # 404 placeholder, matching the defaultdict's default.
    lookup = lambda target: self._pages.get(target, Page(404, (), (), ()))

    for url, page in list(self._pages.items()):
      if page.status != 200:
        continue
      links, anchors = _CategorizeBrokenLinks(url, page, lookup)
      broken_links.extend(links)
      broken_anchors.extend(anchors)

    return broken_links, broken_anchors

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages that are connected to the
    |root_pages|. Returns the links that are valid on the server but are not
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    # Breadth-first traversal of the link graph starting at the roots.
    pages_to_check = deque(self._root_pages)
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      for link in self._pages[item].links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        url for url, page in self._pages.items() if page.status == 200)

    return [url for url in all_urls - found if url.endswith('.html')]

Powered by Google App Engine
This is Rietveld 408576698