OLD | NEW |
| (Empty) |
1 #!/usr/bin/env python | |
2 # | |
3 # Copyright 2007 The Closure Linter Authors. All Rights Reserved. | |
4 # | |
5 # Licensed under the Apache License, Version 2.0 (the "License"); | |
6 # you may not use this file except in compliance with the License. | |
7 # You may obtain a copy of the License at | |
8 # | |
9 # http://www.apache.org/licenses/LICENSE-2.0 | |
10 # | |
11 # Unless required by applicable law or agreed to in writing, software | |
12 # distributed under the License is distributed on an "AS-IS" BASIS, | |
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 # See the License for the specific language governing permissions and | |
15 # limitations under the License. | |
16 | |
17 """Utilities for dealing with HTML.""" | |
18 | |
19 __author__ = ('robbyw@google.com (Robert Walker)') | |
20 | |
21 import cStringIO | |
22 import formatter | |
23 import htmllib | |
24 import HTMLParser | |
25 import re | |
26 | |
27 | |
28 class ScriptExtractor(htmllib.HTMLParser): | |
29 """Subclass of HTMLParser that extracts script contents from an HTML file. | |
30 | |
31 Also inserts appropriate blank lines so that line numbers in the extracted | |
32 code match the line numbers in the original HTML. | |
33 """ | |
34 | |
35 def __init__(self): | |
36 """Initialize a ScriptExtractor.""" | |
37 htmllib.HTMLParser.__init__(self, formatter.NullFormatter()) | |
38 self._in_script = False | |
39 self._text = '' | |
40 | |
41 def start_script(self, attrs): | |
42 """Internal handler for the start of a script tag. | |
43 | |
44 Args: | |
45 attrs: The attributes of the script tag, as a list of tuples. | |
46 """ | |
47 for attribute in attrs: | |
48 if attribute[0].lower() == 'src': | |
49 # Skip script tags with a src specified. | |
50 return | |
51 self._in_script = True | |
52 | |
53 def end_script(self): | |
54 """Internal handler for the end of a script tag.""" | |
55 self._in_script = False | |
56 | |
57 def handle_data(self, data): | |
58 """Internal handler for character data. | |
59 | |
60 Args: | |
61 data: The character data from the HTML file. | |
62 """ | |
63 if self._in_script: | |
64 # If the last line contains whitespace only, i.e. is just there to | |
65 # properly align a </script> tag, strip the whitespace. | |
66 if data.rstrip(' \t') != data.rstrip(' \t\n\r\f'): | |
67 data = data.rstrip(' \t') | |
68 self._text += data | |
69 else: | |
70 self._AppendNewlines(data) | |
71 | |
72 def handle_comment(self, data): | |
73 """Internal handler for HTML comments. | |
74 | |
75 Args: | |
76 data: The text of the comment. | |
77 """ | |
78 self._AppendNewlines(data) | |
79 | |
80 def _AppendNewlines(self, data): | |
81 """Count the number of newlines in the given string and append them. | |
82 | |
83 This ensures line numbers are correct for reported errors. | |
84 | |
85 Args: | |
86 data: The data to count newlines in. | |
87 """ | |
88 # We append 'x' to both sides of the string to ensure that splitlines | |
89 # gives us an accurate count. | |
90 for i in xrange(len(('x' + data + 'x').splitlines()) - 1): | |
91 self._text += '\n' | |
92 | |
93 def GetScriptLines(self): | |
94 """Return the extracted script lines. | |
95 | |
96 Returns: | |
97 The extracted script lines as a list of strings. | |
98 """ | |
99 return self._text.splitlines() | |
100 | |
101 | |
def GetScriptLines(f):
  """Extract script tag contents from the given HTML file.

  Args:
    f: The HTML file, as an object whose read() method returns its contents.

  Returns:
    Lines in the HTML file that are from script tags, as a list of strings.
  """
  contents = f.read()

  # The HTML parser chokes on text like Array.<!string>, so we patch that bug
  # by replacing the < with &lt; whenever it is directly followed by something
  # other than whitespace, a word character or '/'. Escaping all text inside
  # script tags would be better but it's a bit of a catch 22.
  contents = re.sub(r'<([^\s\w/])', r'&lt;\1', contents)

  extractor = ScriptExtractor()
  extractor.feed(contents)
  extractor.close()
  return extractor.GetScriptLines()
124 | |
125 | |
def _ParseErrorIndex(text, lineno, offset):
  """Convert a parser (line, column) position into an index into text.

  Args:
    text: The string that was fed to the parser.
    lineno: The 1-based line number reported by the parser.
    offset: The 0-based column within that line reported by the parser.

  Returns:
    The absolute index into text, never larger than len(text).
  """
  index = 0
  # Only '\n' is treated as a line separator here, which is assumed to match
  # how the parser advances its own line counter.
  for _ in range(lineno - 1):
    newline = text.find('\n', index)
    if newline < 0:
      break
    index = newline + 1
  return min(index + offset, len(text))


def StripTags(str):
  """Returns the string with HTML tags stripped.

  Args:
    str: An html string.

  Returns:
    The html string with all tags stripped. Text that precedes a parse error
    is kept unstripped, the character at the error position is dropped, and
    stripping resumes right after it.
  """
  # Brute force approach to stripping as much HTML as possible. If there is a
  # parsing error, don't strip text before parse error position, and continue
  # trying from there.
  final_text = ''
  while True:
    stripper = _HtmlStripper()
    try:
      stripper.feed(str)
      stripper.close()
    except HTMLParser.HTMLParseError as e:
      # e.offset is a column within line e.lineno, not an index into the whole
      # string, so it has to be translated before slicing multi-line input.
      error_index = _ParseErrorIndex(str, e.lineno, e.offset)
      final_text += str[:error_index]
      # Skipping at least one character guarantees the loop terminates.
      str = str[error_index + 1:]
    else:
      return final_text + stripper.get_output()
154 | |
155 | |
156 class _HtmlStripper(HTMLParser.HTMLParser): | |
157 """Simple class to strip tags from HTML. | |
158 | |
159 Does so by doing nothing when encountering tags, and appending character data | |
160 to a buffer when that is encountered. | |
161 """ | |
162 def __init__(self): | |
163 self.reset() | |
164 self.__output = cStringIO.StringIO() | |
165 | |
166 def handle_data(self, d): | |
167 self.__output.write(d) | |
168 | |
169 def get_output(self): | |
170 return self.__output.getvalue() | |
OLD | NEW |