| Index: third_party/closure_linter/closure_linter/common/htmlutil.py
|
| diff --git a/third_party/closure_linter/closure_linter/common/htmlutil.py b/third_party/closure_linter/closure_linter/common/htmlutil.py
|
| deleted file mode 100755
|
| index 26d44c5908353d89d56a131d01846f1328b20485..0000000000000000000000000000000000000000
|
| --- a/third_party/closure_linter/closure_linter/common/htmlutil.py
|
| +++ /dev/null
|
| @@ -1,170 +0,0 @@
|
| -#!/usr/bin/env python
|
| -#
|
| -# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
|
| -#
|
| -# Licensed under the Apache License, Version 2.0 (the "License");
|
| -# you may not use this file except in compliance with the License.
|
| -# You may obtain a copy of the License at
|
| -#
|
| -# http://www.apache.org/licenses/LICENSE-2.0
|
| -#
|
| -# Unless required by applicable law or agreed to in writing, software
|
| -# distributed under the License is distributed on an "AS-IS" BASIS,
|
| -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| -# See the License for the specific language governing permissions and
|
| -# limitations under the License.
|
| -
|
| -"""Utilities for dealing with HTML."""
|
| -
|
| -__author__ = ('robbyw@google.com (Robert Walker)')
|
| -
|
| -import cStringIO
|
| -import formatter
|
| -import htmllib
|
| -import HTMLParser
|
| -import re
|
| -
|
| -
|
| -class ScriptExtractor(htmllib.HTMLParser):
|
| - """Subclass of HTMLParser that extracts script contents from an HTML file.
|
| -
|
| - Also inserts appropriate blank lines so that line numbers in the extracted
|
| - code match the line numbers in the original HTML.
|
| - """
|
| -
|
| - def __init__(self):
|
| - """Initialize a ScriptExtractor."""
|
| - htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
|
| - self._in_script = False
|
| - self._text = ''
|
| -
|
| - def start_script(self, attrs):
|
| - """Internal handler for the start of a script tag.
|
| -
|
| - Args:
|
| - attrs: The attributes of the script tag, as a list of tuples.
|
| - """
|
| - for attribute in attrs:
|
| - if attribute[0].lower() == 'src':
|
| - # Skip script tags with a src specified.
|
| - return
|
| - self._in_script = True
|
| -
|
| - def end_script(self):
|
| - """Internal handler for the end of a script tag."""
|
| - self._in_script = False
|
| -
|
| - def handle_data(self, data):
|
| - """Internal handler for character data.
|
| -
|
| - Args:
|
| - data: The character data from the HTML file.
|
| - """
|
| - if self._in_script:
|
| - # If the last line contains whitespace only, i.e. is just there to
|
| - # properly align a </script> tag, strip the whitespace.
|
| - if data.rstrip(' \t') != data.rstrip(' \t\n\r\f'):
|
| - data = data.rstrip(' \t')
|
| - self._text += data
|
| - else:
|
| - self._AppendNewlines(data)
|
| -
|
| - def handle_comment(self, data):
|
| - """Internal handler for HTML comments.
|
| -
|
| - Args:
|
| - data: The text of the comment.
|
| - """
|
| - self._AppendNewlines(data)
|
| -
|
| - def _AppendNewlines(self, data):
|
| - """Count the number of newlines in the given string and append them.
|
| -
|
| - This ensures line numbers are correct for reported errors.
|
| -
|
| - Args:
|
| - data: The data to count newlines in.
|
| - """
|
| - # We append 'x' to both sides of the string to ensure that splitlines
|
| - # gives us an accurate count.
|
| - for i in xrange(len(('x' + data + 'x').splitlines()) - 1):
|
| - self._text += '\n'
|
| -
|
| - def GetScriptLines(self):
|
| - """Return the extracted script lines.
|
| -
|
| - Returns:
|
| - The extracted script lines as a list of strings.
|
| - """
|
| - return self._text.splitlines()
|
| -
|
| -
|
| -def GetScriptLines(f):
|
| - """Extract script tag contents from the given HTML file.
|
| -
|
| - Args:
|
| - f: The HTML file.
|
| -
|
| - Returns:
|
| - Lines in the HTML file that are from script tags.
|
| - """
|
| - extractor = ScriptExtractor()
|
| -
|
| - # The HTML parser chokes on text like Array.<!string>, so we patch
|
| - # that bug by replacing the < with &lt; - escaping all text inside script
|
| - # tags would be better but it's a bit of a catch 22.
|
| - contents = f.read()
|
| - contents = re.sub(r'<([^\s\w/])',
|
| - lambda x: '&lt;%s' % x.group(1),
|
| - contents)
|
| -
|
| - extractor.feed(contents)
|
| - extractor.close()
|
| - return extractor.GetScriptLines()
|
| -
|
| -
|
| -def StripTags(str):
|
| - """Returns the string with HTML tags stripped.
|
| -
|
| - Args:
|
| - str: An html string.
|
| -
|
| - Returns:
|
| - The html string with all tags stripped. If there was a parse error, returns
|
| - the text successfully parsed so far.
|
| - """
|
| - # Brute force approach to stripping as much HTML as possible. If there is a
|
| - # parsing error, don't strip text before parse error position, and continue
|
| - # trying from there.
|
| - final_text = ''
|
| - finished = False
|
| - while not finished:
|
| - try:
|
| - strip = _HtmlStripper()
|
| - strip.feed(str)
|
| - strip.close()
|
| - str = strip.get_output()
|
| - final_text += str
|
| - finished = True
|
| - except HTMLParser.HTMLParseError, e:
|
| - final_text += str[:e.offset]
|
| - str = str[e.offset + 1:]
|
| -
|
| - return final_text
|
| -
|
| -
|
| -class _HtmlStripper(HTMLParser.HTMLParser):
|
| - """Simple class to strip tags from HTML.
|
| -
|
| - Does so by doing nothing when encountering tags, and appending character data
|
| - to a buffer when that is encountered.
|
| - """
|
| - def __init__(self):
|
| - self.reset()
|
| - self.__output = cStringIO.StringIO()
|
| -
|
| - def handle_data(self, d):
|
| - self.__output.write(d)
|
| -
|
| - def get_output(self):
|
| - return self.__output.getvalue()
|
|
|