third_party/closure_linter/closure_linter/common/htmlutil.py - Issue 2592193002: Remove closure_linter from Chrome

Side by Side Diff: third_party/closure_linter/closure_linter/common/htmlutil.py

Issue 2592193002: Remove closure_linter from Chrome (Closed)

Patch Set: Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/closure_linter/closure_linter/common/filetestcase.py ('k') | third_party/closure_linter/closure_linter/common/lintrunner.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 #!/usr/bin/env python

2 #

3 # Copyright 2007 The Closure Linter Authors. All Rights Reserved.

4 #

5 # Licensed under the Apache License, Version 2.0 (the "License");

6 # you may not use this file except in compliance with the License.

7 # You may obtain a copy of the License at

8 #

9 # http://www.apache.org/licenses/LICENSE-2.0

10 #

11 # Unless required by applicable law or agreed to in writing, software

12 # distributed under the License is distributed on an "AS-IS" BASIS,

13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14 # See the License for the specific language governing permissions and

15 # limitations under the License.

16

17 """Utilities for dealing with HTML."""

18

19 __author__ = ('robbyw@google.com (Robert Walker)')

20

21 import cStringIO

22 import formatter

23 import htmllib

24 import HTMLParser

25 import re

26

27

class ScriptExtractor(htmllib.HTMLParser):
  """HTML parser that collects the contents of inline script tags.

  Character data outside script tags, and HTML comments, are replaced by the
  newlines they contain so that line numbers in the extracted code match the
  line numbers in the original HTML.
  """

  def __init__(self):
    """Initialize a ScriptExtractor."""
    htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
    # Accumulated output: script text plus placeholder newlines.
    self._text = ''
    # True while character data belongs to a script tag being collected.
    self._in_script = False

  def start_script(self, attrs):
    """Internal handler for the start of a script tag.

    Args:
      attrs: The attributes of the script tag, as a list of tuples.
    """
    # Script tags with a src specified are skipped; only tags without one
    # switch the extractor into collecting mode.
    has_src = any(attr[0].lower() == 'src' for attr in attrs)
    if not has_src:
      self._in_script = True

  def end_script(self):
    """Internal handler for the end of a script tag."""
    self._in_script = False

  def handle_data(self, data):
    """Internal handler for character data.

    Args:
      data: The character data from the HTML file.
    """
    if not self._in_script:
      # Outside a script only the line structure is kept.
      self._AppendNewlines(data)
      return

    # If the last line contains whitespace only, i.e. is just there to
    # properly align a </script> tag, strip that whitespace.
    without_trailing_blanks = data.rstrip(' \t')
    if without_trailing_blanks != data.rstrip(' \t\n\r\f'):
      data = without_trailing_blanks
    self._text += data

  def handle_comment(self, data):
    """Internal handler for HTML comments.

    Args:
      data: The text of the comment.
    """
    self._AppendNewlines(data)

  def _AppendNewlines(self, data):
    """Count the number of newlines in the given string and append them.

    This ensures line numbers are correct for reported errors.

    Args:
      data: The data to count newlines in.
    """
    # Wrapping the data in 'x' on both sides makes splitlines count a leading
    # or trailing line break as well.
    newline_count = len(('x' + data + 'x').splitlines()) - 1
    self._text += '\n' * newline_count

  def GetScriptLines(self):
    """Return the extracted script lines.

    Returns:
      The extracted script lines as a list of strings.
    """
    extracted = self._text
    return extracted.splitlines()

100

101

def GetScriptLines(f):
  """Extract script tag contents from the given HTML file.

  The whole file is read, problematic '<' characters are escaped (see the
  comment below) and the result is run through a ScriptExtractor.

  Args:
    f: The HTML file. Only its read() method is used.

  Returns:
    Lines in the HTML file that are from script tags.
  """
  extractor = ScriptExtractor()

  # The HTML parser chokes on text like Array.<!string>, so we patch
  # that bug by replacing the < with &lt; whenever it is followed by a
  # character that is not whitespace, a word character or a slash. Escaping
  # all text inside script tags would be better but it's a bit of a catch 22.
  contents = f.read()
  contents = re.sub(r'<([^\s\w/])', r'&lt;\1', contents)

  extractor.feed(contents)
  extractor.close()
  return extractor.GetScriptLines()

124

125

def StripTags(str):
  """Returns the string with HTML tags stripped.

  Args:
    str: An html string.

  Returns:
    The html string with all tags stripped. If the parser reports an error,
    the text before the error position is kept unstripped, the character at
    the error position is dropped, and stripping continues with the rest.
  """
  # Brute force approach to stripping as much HTML as possible. If there is a
  # parsing error, don't strip text before parse error position, and continue
  # trying from there.
  final_text = ''
  remaining = str
  while True:
    try:
      stripper = _HtmlStripper()
      stripper.feed(remaining)
      stripper.close()
    except HTMLParser.HTMLParseError as e:
      # e.offset counts characters into line e.lineno (1-based), not into the
      # whole string, so add the length of every preceding line (plus its
      # '\n') to get an absolute index. For an error on the first line this
      # is just e.offset.
      lines_before = remaining.split('\n')[:e.lineno - 1]
      error_pos = sum(len(line) + 1 for line in lines_before) + e.offset
      final_text += remaining[:error_pos]
      # Skip the offending character so every retry makes progress.
      remaining = remaining[error_pos + 1:]
    else:
      return final_text + stripper.get_output()

154

155

class _HtmlStripper(HTMLParser.HTMLParser):
  """Simple parser used to strip tags from HTML.

  The only handler defined here is handle_data, which appends character data
  to an in-memory buffer; get_output returns what has been collected.
  """

  def __init__(self):
    """Create an empty output buffer and reset the parser state."""
    self.__buffer = cStringIO.StringIO()
    self.reset()

  def handle_data(self, d):
    """Append a run of character data to the output buffer.

    Args:
      d: The character data to keep.
    """
    self.__buffer.write(d)

  def get_output(self):
    """Return all character data collected so far as a single string."""
    collected = self.__buffer.getvalue()
    return collected

OLD	NEW