# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
import svn_constants

Page = namedtuple('Page', 'status, links, anchors, anchor_refs')

def _SplitAnchor(url):
  components = urlsplit(url)
  return components.path, components.fragment
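# For example, _SplitAnchor('extensions/tabs.html#method-get') returns
# ('extensions/tabs.html', 'method-get').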

def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the HTTP status code
  of the page render, the hrefs of all links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
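    # Unparseable html is not treated as a link error: the page is recorded
    # as rendered (status 200) with no links or anchors.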
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  base, _ = path.rsplit('/', 1)
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
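    # e.g. a link 'experimental.history.html#anchor' is normalized below to
    # 'experimental_history.html#anchor'.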
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)

def _CategorizeBrokenLinks(url, page, pages):
  '''Find all the broken links on a page and categorize them as either
  broken_links, which link to a page that 404s, or broken_anchors. |page| is
  the page at |url| to search, |pages| is a callable that takes a path and
  returns a Page. Returns two lists, the first of all the broken_links, the
  second of all the broken_anchors.
  '''
  broken_links = []
  broken_anchors = []

  # First test links without anchors.
  for link in page.links:
    if pages(link).status != 200:
      broken_links.append((url, link))

  # Then find broken links with an anchor component.
  for ref in page.anchor_refs:
    path, anchor = _SplitAnchor(ref)

    if path == '':
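      # 'top' is treated as a valid anchor on every page even when no element
      # declares it, since browsers scroll to the top of the page for '#top'.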
      if anchor not in page.anchors and anchor != 'top':
        broken_anchors.append((url, ref))
    else:
      target_page = pages(path)
      if target_page.status != 200:
        broken_links.append((url, ref))
      elif anchor not in target_page.anchors:
        broken_anchors.append((url, ref))

  return broken_links, broken_anchors

class _ContentParser(HTMLParser):
  '''Parse an html file, pulling out all links and anchor_refs, where an
  anchor_ref is a link that contains an anchor.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Strip whitespace, then skip hrefs that contain just a '.' (period),
      # contain python templating code, are an absolute url, are a zip file,
      # or execute javascript on the page.
      href = attrs.get('href', '').strip()
      if href and href != '.' and '{{' not in href:
        if urlsplit(href).scheme not in ('http', 'https'):
          if not href.endswith('.zip') and 'javascript:' not in href:
            self.links.append(href)

    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])
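
# Illustrative behaviour of _ContentParser (not part of the original file):
#   parser = _ContentParser()
#   parser.feed('<a href="tabs.html#create">x</a><h2 id="intro">y</h2>')
#   parser.links   -> ['tabs.html#create']
#   parser.anchors -> set(['intro'])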

class LinkErrorDetector(object):
  '''Finds link errors on the doc server: broken links, whose target page
  404s or whose anchor doesn't exist on the target page, and orphaned pages,
  which no other page links to.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    self._always_detached = frozenset(('apps/404.html', 'extensions/404.html'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory, rendering each URL and
    processing the resulting html to pull out all links and anchors.
    '''
    top_level_directories = (
      (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
      (svn_constants.STATIC_PATH, 'static/'),
      (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print '%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status)

  def GetBrokenLinks(self):
    '''Finds all broken links. A broken link is a link that leads to a page
    that does not exist (404s when rendered) or that contains an anchor that
    does not properly resolve.

    Returns a pair of lists: the first of all the links that lead to a
    non-existent page, the second of all the links that contain a broken
    anchor. Each item in the lists is a tuple of the page the broken link
    occurred on and the href of the broken link.
    '''
    broken_links = []
    broken_anchors = []

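    # Iterate over a snapshot of the keys: self._pages is a defaultdict, and
    # the lambda below can insert new 404 entries while links are checked.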
    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      links, anchors = _CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x])

      broken_links.extend(links)
      broken_anchors.extend(anchors)

    return broken_links, broken_anchors

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages reachable from the |root_pages|.
    Returns the URLs that are valid on the server but are not part of the
    connected component containing the |root_pages|. These pages are orphans
    and cannot be reached simply by clicking through the server.
    '''
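    # Breadth-first search over links, starting from the root pages.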
    pages_to_check = deque(self._root_pages)
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      for link in self._pages[item].links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        url for url, page in self._pages.iteritems() if page.status == 200)

    return [url for url in all_urls - found if url.endswith('.html')]
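
# A hypothetical usage sketch (the renderer and root page names below are
# illustrative, not taken from this file):
#   detector = LinkErrorDetector(
#       file_system, render_callable, svn_constants.PUBLIC_TEMPLATE_PATH,
#       root_pages=('extensions/index.html', 'apps/about_apps.html'))
#   broken_links, broken_anchors = detector.GetBrokenLinks()
#   orphaned_pages = detector.GetOrphanedPages()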