# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
import svn_constants

Page = namedtuple('Page', 'status, links, anchors, anchor_refs')

def _SplitAnchor(url):
  components = urlsplit(url)
  return components.path, components.fragment
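# For example, _SplitAnchor('extensions/tabs.html#method-get') returns
# ('extensions/tabs.html', 'method-get').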

def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the HTTP status code
  of the page render, the hrefs of all links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
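    # Unparseable html is not treated as a link error: the page is recorded
    # as rendered (status 200) with no links or anchors.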
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  base, _ = path.rsplit('/', 1)
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
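    # e.g. a link 'experimental.history.html#anchor' is normalized below to
    # 'experimental_history.html#anchor'.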
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)

def _CategorizeBrokenLinks(url, page, pages):
  '''Find all the broken links on a page and categorize them as either
  broken_links, which link to a page that 404s, or broken_anchors. |page| is
  the page at |url| to search, |pages| is a callable that takes a path and
  returns a Page. Returns two lists, the first of all the broken_links, the
  second of all the broken_anchors.
  '''
  broken_links = []
  broken_anchors = []

  # First test links without anchors.
  for link in page.links:
    if pages(link).status != 200:
      broken_links.append((url, link))

  # Then find broken links with an anchor component.
  for ref in page.anchor_refs:
    path, anchor = _SplitAnchor(ref)

    if path == '':
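      # 'top' is treated as a valid anchor on every page even when no element
      # declares it, since browsers scroll to the top of the page for '#top'.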
      if anchor not in page.anchors and anchor != 'top':
        broken_anchors.append((url, ref))
    else:
      target_page = pages(path)
      if target_page.status != 200:
        broken_links.append((url, ref))
      elif anchor not in target_page.anchors:
        broken_anchors.append((url, ref))

  return broken_links, broken_anchors

class _ContentParser(HTMLParser):
  '''Parse an html file, pulling out all links and anchor_refs, where an
  anchor_ref is a link that contains an anchor.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Strip whitespace, then skip hrefs that contain just a '.' (period),
      # contain python templating code, are an absolute url, are a zip file,
      # or execute javascript on the page.
      href = attrs.get('href', '').strip()
      if href and href != '.' and '{{' not in href:
        if urlsplit(href).scheme not in ('http', 'https'):
          if not href.endswith('.zip') and 'javascript:' not in href:
            self.links.append(href)

    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])
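
# Illustrative behaviour of _ContentParser (not part of the original file):
#   parser = _ContentParser()
#   parser.feed('<a href="tabs.html#create">x</a><h2 id="intro">y</h2>')
#   parser.links   -> ['tabs.html#create']
#   parser.anchors -> set(['intro'])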

class LinkErrorDetector(object):
  '''Finds link errors on the doc server: broken links, whose target page
  404s or whose anchor doesn't exist on the target page, and orphaned pages,
  which no other page links to.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    self._always_detached = frozenset(('apps/404.html', 'extensions/404.html'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory, rendering each URL and
    processing the resulting html to pull out all links and anchors.
    '''
    top_level_directories = (
      (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
      (svn_constants.STATIC_PATH, 'static/'),
      (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print '%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status)

  def GetBrokenLinks(self):
    '''Finds all broken links. A broken link is a link that leads to a page
    that does not exist (404s when rendered) or that contains an anchor that
    does not properly resolve.

    Returns a pair of lists: the first of all the links that lead to a
    non-existent page, the second of all the links that contain a broken
    anchor. Each item in the lists is a tuple of the page the broken link
    occurred on and the href of the broken link.
    '''
    broken_links = []
    broken_anchors = []

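    # Iterate over a snapshot of the keys: self._pages is a defaultdict, and
    # the lambda below can insert new 404 entries while links are checked.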
    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      links, anchors = _CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x])

      broken_links.extend(links)
      broken_anchors.extend(anchors)

    return broken_links, broken_anchors

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages reachable from the |root_pages|.
    Returns the URLs that are valid on the server but are not part of the
    connected component containing the |root_pages|. These pages are orphans
    and cannot be reached simply by clicking through the server.
    '''
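    # Breadth-first search over links, starting from the root pages.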
    pages_to_check = deque(self._root_pages)
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      for link in self._pages[item].links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        url for url, page in self._pages.iteritems() if page.status == 200)

    return [url for url in all_urls - found if url.endswith('.html')]
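
# A hypothetical usage sketch (the renderer and root page names below are
# illustrative, not taken from this file):
#   detector = LinkErrorDetector(
#       file_system, render_callable, svn_constants.PUBLIC_TEMPLATE_PATH,
#       root_pages=('extensions/index.html', 'apps/about_apps.html'))
#   broken_links, broken_anchors = detector.GetBrokenLinks()
#   orphaned_pages = detector.GetOrphanedPages()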