| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/env python | |
| 2 # | |
| 3 # Copyright 2007 The Closure Linter Authors. All Rights Reserved. | |
| 4 # | |
| 5 # Licensed under the Apache License, Version 2.0 (the "License"); | |
| 6 # you may not use this file except in compliance with the License. | |
| 7 # You may obtain a copy of the License at | |
| 8 # | |
| 9 # http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 # | |
| 11 # Unless required by applicable law or agreed to in writing, software | |
| 12 # distributed under the License is distributed on an "AS-IS" BASIS, | |
| 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 # See the License for the specific language governing permissions and | |
| 15 # limitations under the License. | |
| 16 | |
| 17 """Utilities for dealing with HTML.""" | |
| 18 | |
| 19 __author__ = ('robbyw@google.com (Robert Walker)') | |
| 20 | |
| 21 import cStringIO | |
| 22 import formatter | |
| 23 import htmllib | |
| 24 import HTMLParser | |
| 25 import re | |
| 26 | |
| 27 | |
| 28 class ScriptExtractor(htmllib.HTMLParser): | |
| 29 """Subclass of HTMLParser that extracts script contents from an HTML file. | |
| 30 | |
| 31 Also inserts appropriate blank lines so that line numbers in the extracted | |
| 32 code match the line numbers in the original HTML. | |
| 33 """ | |
| 34 | |
| 35 def __init__(self): | |
| 36 """Initialize a ScriptExtractor.""" | |
| 37 htmllib.HTMLParser.__init__(self, formatter.NullFormatter()) | |
| 38 self._in_script = False | |
| 39 self._text = '' | |
| 40 | |
| 41 def start_script(self, attrs): | |
| 42 """Internal handler for the start of a script tag. | |
| 43 | |
| 44 Args: | |
| 45 attrs: The attributes of the script tag, as a list of tuples. | |
| 46 """ | |
| 47 for attribute in attrs: | |
| 48 if attribute[0].lower() == 'src': | |
| 49 # Skip script tags with a src specified. | |
| 50 return | |
| 51 self._in_script = True | |
| 52 | |
| 53 def end_script(self): | |
| 54 """Internal handler for the end of a script tag.""" | |
| 55 self._in_script = False | |
| 56 | |
| 57 def handle_data(self, data): | |
| 58 """Internal handler for character data. | |
| 59 | |
| 60 Args: | |
| 61 data: The character data from the HTML file. | |
| 62 """ | |
| 63 if self._in_script: | |
| 64 # If the last line contains whitespace only, i.e. is just there to | |
| 65 # properly align a </script> tag, strip the whitespace. | |
| 66 if data.rstrip(' \t') != data.rstrip(' \t\n\r\f'): | |
| 67 data = data.rstrip(' \t') | |
| 68 self._text += data | |
| 69 else: | |
| 70 self._AppendNewlines(data) | |
| 71 | |
| 72 def handle_comment(self, data): | |
| 73 """Internal handler for HTML comments. | |
| 74 | |
| 75 Args: | |
| 76 data: The text of the comment. | |
| 77 """ | |
| 78 self._AppendNewlines(data) | |
| 79 | |
| 80 def _AppendNewlines(self, data): | |
| 81 """Count the number of newlines in the given string and append them. | |
| 82 | |
| 83 This ensures line numbers are correct for reported errors. | |
| 84 | |
| 85 Args: | |
| 86 data: The data to count newlines in. | |
| 87 """ | |
| 88 # We append 'x' to both sides of the string to ensure that splitlines | |
| 89 # gives us an accurate count. | |
| 90 for i in xrange(len(('x' + data + 'x').splitlines()) - 1): | |
| 91 self._text += '\n' | |
| 92 | |
| 93 def GetScriptLines(self): | |
| 94 """Return the extracted script lines. | |
| 95 | |
| 96 Returns: | |
| 97 The extracted script lines as a list of strings. | |
| 98 """ | |
| 99 return self._text.splitlines() | |
| 100 | |
| 101 | |
def GetScriptLines(f):
    """Extract script tag contents from the given HTML file.

    Args:
        f: The HTML file, as an object with a read() method.

    Returns:
        Lines in the HTML file that are from script tags, as a list of
        strings.
    """
    extractor = ScriptExtractor()

    # The HTML parser chokes on text like Array.<!string>, so we patch that
    # bug by replacing the < with &lt; wherever it is followed by a character
    # that is not whitespace, a word character or a slash.  Escaping all text
    # inside script tags would be better but it's a bit of a catch 22.
    contents = re.sub(r'<([^\s\w/])', r'&lt;\1', f.read())

    extractor.feed(contents)
    extractor.close()
    return extractor.GetScriptLines()
| 124 | |
| 125 | |
def StripTags(str):
    """Returns the string with HTML tags stripped.

    Args:
        str: An html string.

    Returns:
        The html string with tags stripped.  Wherever the parser reports an
        error, the text before the error position is kept unstripped, the
        character at that position is dropped, and stripping resumes on the
        remainder.
    """
    # NOTE: the parameter shadows the builtin `str`; the name is kept so that
    # existing keyword callers keep working.
    #
    # Brute force approach to stripping as much HTML as possible.  If there is
    # a parsing error, don't strip text before the parse error position, and
    # continue trying from there.
    final_text = ''
    finished = False
    while not finished:
        strip = _HtmlStripper()
        try:
            strip.feed(str)
            strip.close()
        except HTMLParser.HTMLParseError as e:
            # The error position is a (line number, column) pair, with
            # e.offset counted from the start of line e.lineno.  Turn it into
            # an index into the whole string so that input with several
            # lines is cut in the right place.  The parser counts lines by
            # '\n' only, so the same separator is used here.
            lines_before = str.split('\n')[:(e.lineno or 1) - 1]
            error_index = (sum(len(line) + 1 for line in lines_before) +
                           (e.offset or 0))
            final_text += str[:error_index]
            # Skip the offending character; this also guarantees progress.
            str = str[error_index + 1:]
        else:
            final_text += strip.get_output()
            finished = True

    return final_text
| 154 | |
| 155 | |
| 156 class _HtmlStripper(HTMLParser.HTMLParser): | |
| 157 """Simple class to strip tags from HTML. | |
| 158 | |
| 159 Does so by doing nothing when encountering tags, and appending character data | |
| 160 to a buffer when that is encountered. | |
| 161 """ | |
| 162 def __init__(self): | |
| 163 self.reset() | |
| 164 self.__output = cStringIO.StringIO() | |
| 165 | |
| 166 def handle_data(self, d): | |
| 167 self.__output.write(d) | |
| 168 | |
| 169 def get_output(self): | |
| 170 return self.__output.getvalue() | |
| OLD | NEW |