Chromium Code Reviews

Unified Diff: chrome/common/extensions/docs/server2/link_error_detector.py

Issue 17816005: Doc server broken link detection (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: patch rewrite Created 7 years, 5 months ago
Index: chrome/common/extensions/docs/server2/link_error_detector.py
diff --git a/chrome/common/extensions/docs/server2/link_error_detector.py b/chrome/common/extensions/docs/server2/link_error_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fe72535e9d0254dce2353b3d7721caa1f4322bb
--- /dev/null
+++ b/chrome/common/extensions/docs/server2/link_error_detector.py
@@ -0,0 +1,226 @@
+# Copyright 2013 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+from collections import defaultdict, deque, namedtuple
+from HTMLParser import HTMLParser, HTMLParseError
+import posixpath
+from urlparse import urlsplit
+
+from cron_servlet import CreateURLsFromPaths
+import svn_constants
+
+Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
+
+class _ContentParser(HTMLParser):
+ ''' Find all the links inside a section of the document with an id of
not at google - send to devlin 2013/07/09 00:17:32 s/''' Find/'''Finds/
jshumway 2013/07/17 00:49:55 Done.
+ |container_id|. Pull out all 'id' and 'name' attributes and add them to a set.
+ '''
+
+ def __init__(self, container_id, exclude):
not at google - send to devlin 2013/07/09 00:17:32 what is exclude?
jshumway 2013/07/17 00:49:55 It used to be a way to tell the parser to ignore certain links.
+ HTMLParser.__init__(self)
+ self._container_id = container_id
+ self._in_content = 0
+ self._exclude = exclude
+ self.links = []
+ self.anchors = set()
+
+ def handle_starttag(self, tag, raw_attrs):
+ attrs = dict(raw_attrs)
+
+ if tag == 'div':
+ if attrs.get('id', '') == self._container_id or self._in_content:
not at google - send to devlin 2013/07/09 00:17:32 the ", ''" part seems unnecessary.
jshumway 2013/07/17 00:49:55 indeed it is, removed.
+ self._in_content += 1
+
not at google - send to devlin 2013/07/09 00:17:32 no blank line here
jshumway 2013/07/17 00:49:55 Done.
+ elif self._in_content:
+ href = attrs.get('href')
+ if tag == 'a' and href and not urlsplit(href).scheme in ('http', 'https'):
not at google - send to devlin 2013/07/09 00:17:32 let's have: if tag == 'a': href = attrs.get('href') …
jshumway 2013/07/17 00:49:55 Done.
+ if href.endswith('.html') or '#' in href:
+ if all([e not in href for e in self._exclude]):
not at google - send to devlin 2013/07/09 00:17:32 []s not needed, it can just be all(e not in href for e in self._exclude)
jshumway 2013/07/17 00:49:55 I created such a function. The check for endswith …
+ self.links.append(href)
+
+ if attrs.get('id'):
+ self.anchors.add(attrs['id'])
+
not at google - send to devlin 2013/07/09 00:17:32 no blank line here or below (I know, it's just that …)
jshumway 2013/07/17 00:49:55 Done.
+ if attrs.get('name'):
+ self.anchors.add(attrs['name'])
+
+ def handle_endtag(self, tag):
+ if self._in_content and tag == 'div':
+ self._in_content -= 1
+
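
For reference, a minimal sketch of how this parser is driven (not part of the patch; the HTML snippet and container id are invented):

    parser = _ContentParser('gc-pagecontent', exclude=())
    parser.feed('<div id="gc-pagecontent">'
                '<a href="tabs.html#method-query">query</a>'
                '<h2 id="method-query">query</h2>'
                '</div>')
    print parser.links    # ['tabs.html#method-query']
    print parser.anchors  # set(['method-query'])
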
+class LinkErrorDetector(object):
not at google - send to devlin 2013/07/09 00:17:32 please write tests for this.
jshumway 2013/07/17 00:49:55 Done.
+ ''' Find link errors on the doc server. These include broken links: links
+ whose target page 404s or contains an anchor that doesn't exist; and pages
+ that have no links to them.
+ '''
+
+ def __init__(self,
+ file_system,
+ renderer,
+ public_path,
+ root_pages,
+ exclude=()):
not at google - send to devlin 2013/07/09 00:17:32 explain these arguments
jshumway 2013/07/17 00:49:55 Done.
+ self._file_system = file_system
+ self._renderer = renderer
+ self._public_path = public_path
+ self._exclude = exclude
+ self._pages = defaultdict(lambda: Page(404, (), (), ()))
+ self._root_pages = frozenset(root_pages)
+ self._ignore_detached = frozenset(('apps/404.html', 'extensions/404.html'))
not at google - send to devlin 2013/07/09 00:17:32 always_detached?
jshumway 2013/07/17 00:49:55 Done.
+
+ self._RenderAllPages()
+
+ def _RenderAllPages(self):
+ ''' Traverse the public templates directory rendering each URL and
+ processing the resultant html to pull out all links and anchors.
+ '''
+
+ def process(path, content_from):
+ parser = _ContentParser(content_from, self._exclude)
+ response = self._renderer(path)
+
+ if response.status != 200:
+ return Page(response.status, (), (), ())
+ if not path.endswith('.html'):
+ return Page(200, (), (), ())
+
+ try:
+ parser.feed(str(response.content))
+ except HTMLParseError:
+ return Page(200, (), (), ())
+
+ links, anchors = parser.links, parser.anchors
+ base, _ = path.rsplit('/', 1)
+ edges = []
+ anchor_refs = []
+
+ # Convert relative links to absolute links and categorize links as edges
+ # or anchor_refs.
+ for link in links:
+ # Files like experimental_history.html are referred to with the URL
+ # experimental.history.html.
+ head, last = link.rsplit('/', 1) if '/' in link else ('', link)
+ if '#' in last:
+ last, anchor = last.split('#')
+ else:
+ anchor = ''
+
+ if last.count('.') > 1:
+ last = last.replace('.', '_', last.count('.') - 1)
+ link = posixpath.join(head, last)
+ if anchor:
+ link = '%s#%s' % (link, anchor)
+
+ if link.startswith('#'):
+ anchor_refs.append(link)
+ else:
+ if link.startswith('/'):
+ link = link[1:]
+ else:
+ link = posixpath.normpath('%s/%s' % (base, link))
+
+ if '#' in link:
+ anchor_refs.append(link)
+ else:
+ edges.append(link)
+
+ return Page(200, edges, anchors, anchor_refs)
+
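
The dot-to-underscore fixup is easier to follow on a concrete input; this doctest-style trace uses an invented path:

    >>> import posixpath
    >>> base, link = 'extensions', '../apps/experimental.history.html#frag'
    >>> head, last = link.rsplit('/', 1)
    >>> last, anchor = last.split('#')
    >>> last = last.replace('.', '_', last.count('.') - 1)
    >>> posixpath.normpath('%s/%s' % (base, posixpath.join(head, last)))
    'apps/experimental_history.html'

The '#frag' suffix is then re-attached, so the link is categorized as an anchor_ref rather than an edge.
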
+ top_level_directories = (
+ (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
+ (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
+ )
+
+ for dirpath, urlprefix in top_level_directories:
+ files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
+ for url, path in files:
+ self._pages[url] = process(
+ url, 'gc-container' if url in self._root_pages else 'gc-pagecontent')
not at google - send to devlin 2013/07/09 00:17:32 please tell me we can implement this without needing …
jshumway 2013/07/17 00:49:55 As we talked about in IRC, we can implement this without …
+
+ if self._pages[url].status != 200:
+ print '%s, a url derived from the path %s, resulted in a %s' % (
+ url, dirpath, self._pages[url].status)
+
+ def GetBrokenLinks(self):
+ ''' Find all broken links. A broken link is a link that leads to a page that
+ does not exist (404s when rendered) or that contains an anchor that does not
+ properly resolve.
+
+ Returns a list of tuples, one tuple per broken link. The first item is the
+ page the broken link occurred on; the second is the value of the link; the
+ optional third item carries some debugging information.
not at google - send to devlin 2013/07/09 00:17:32 Can we either always or never have a third argument?
jshumway 2013/07/17 00:49:55 The method now returns two lists, the first list contains …
+ '''
+ broken_links = []
+
+ for url in self._pages.keys():
+ page = self._pages[url]
+ if page.status != 200:
+ continue
+
+ # First test links without anchors.
+ for link in page.links:
+ if self._pages[link].status != 200:
+ broken_links.append((url, link))
+
+ # Then find broken links with an anchor component.
not at google - send to devlin 2013/07/09 00:17:32 use urlparse here?
jshumway 2013/07/17 00:49:55 Done.
+ for ref in page.anchor_refs:
+ if ref.startswith('#'):
+ if not ref.lstrip('#') in page.anchors and ref != '#top':
+ broken_links.append((url, ref))
+ else:
+ path, lref = ref.split('#')
+ target_page = self._pages[path]
+
+ if target_page.status != 200:
+ broken_links.append((url, ref, 'target page does not exist'))
+ elif not lref in target_page.anchors:
+ broken_links.append((url, ref))
+
+ return broken_links
+
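
For illustration, the kind of list GetBrokenLinks produces, with invented URLs; the mix of 2-tuples and 3-tuples is what the review thread above asks about:

    # [('extensions/tabs.html', 'extensions/tabz.html'),
    #  ('apps/about.html', 'apps/manifest.html#no-such-anchor'),
    #  ('apps/about.html', 'gone.html#x', 'target page does not exist')]
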
+ def GetOrphanPages(self, seed_urls=None):
not at google - send to devlin 2013/07/09 00:17:32 Orphaned, not Orphan. Orphan sounds so sad :(
jshumway 2013/07/17 00:49:55 Agreed.
+ ''' Crawl the server to find all pages that are connected to the pages at
+ |seed_url|s. Return the links that are valid on the server but are not
+ part of the connected component containing the |seed_url|s. These pages
+ are orphans and cannot be reached simply by clicking through the server.
+
+ If seed_urls is not provided, root_pages will be used instead.
+ '''
+ seeds = seed_urls or self._root_pages
+ queue = deque(seeds)
not at google - send to devlin 2013/07/09 00:17:32 pages_to_check?
jshumway 2013/07/17 00:49:55 Done.
+ connected = set(seeds)
not at google - send to devlin 2013/07/09 00:17:32 found?
jshumway 2013/07/17 00:49:55 Done.
+
+ while queue:
+ item = queue.popleft()
+
not at google - send to devlin 2013/07/09 00:17:32 no blank line here
jshumway 2013/07/17 00:49:55 Done.
+ for link in self._pages[item].links:
+ if link not in connected:
+ connected.add(link)
+ queue.append(link)
not at google - send to devlin 2013/07/09 00:17:32 if links were a set this would be a lot simpler.
jshumway 2013/07/17 00:49:55 Perhaps. Is it better that it resembles the pseudocode …
+
+ connected |= self._ignore_detached
not at google - send to devlin 2013/07/09 00:17:32 instead of this, subtract ignore_detached from the result.
jshumway 2013/07/17 00:49:55 I moved a bunch of stuff around and |'d the set into connected.
+
+ return [p for p in set(self._pages.keys()) - connected if '.html' in p]
not at google - send to devlin 2013/07/09 00:17:32 when does "if '.html' in p" fail? and see comment …
jshumway 2013/07/17 00:49:55 In many places: png, zip, jpg, gif, py, js, css, t…
+
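
The set suggestion above amounts to computing reachability with set difference; a sketch of that alternative (same semantics, assuming Page.links held sets; the helper name is invented):

    def _FindConnected(self, seeds):
      # Breadth-first reachability: expand the frontier one hop at a time,
      # dropping pages that have already been found.
      found = set(seeds)
      frontier = set(seeds)
      while frontier:
        frontier = set().union(
            *(set(self._pages[p].links) for p in frontier)) - found
        found |= frontier
      return found
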
+def pprint_broken_links(links):
not at google - send to devlin 2013/07/09 00:17:32 move this to integration_test - and call it Pretty…
jshumway 2013/07/17 00:49:55 Done. Moved as a function because it is called twice.
+ ''' Print out broken links in a more readable format.
+ '''
+ headings = [
'page a broken link occurred on',
+ "broken link's href value",
+ 'notes\n'
+ ]
+
+ links.insert(0, headings)
+
+ colw0 = max([len(i[0]) for i in links])
+ colw1 = max([len(i[1]) for i in links])
+
+ for col in links:
+ # The first link, right aligned.
+ print '%*s ->' % (colw0, col[0]),
+ # The second link, left aligned.
+ print '%s%s' % (col[1], (colw1 - len(col[1])) * ' '),
+ # The note, if it exists.
+ print '%s' % col[2] if len(col) == 3 else ''
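
A hypothetical invocation, with invented data, showing the aligned output:

    pprint_broken_links([
        ('apps/about.html', 'apps/missing.html'),
        ('extensions/tabs.html', 'gone.html#x', 'target page does not exist'),
    ])

Note that the heading row is inserted into the caller's list, so the column widths account for the headings as well.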
