1 """Better tokenizing for coverage.py.""" | |
2 | |
3 import codecs, keyword, re, sys, token, tokenize | |
4 from coverage.backward import StringIO # pylint: disable=W0622 | |
5 | |
6 def phys_tokens(toks): | |
7 """Return all physical tokens, even line continuations. | |
8 | |
9 tokenize.generate_tokens() doesn't return a token for the backslash that | |
10 continues lines. This wrapper provides those tokens so that we can | |
11 re-create a faithful representation of the original source. | |
12 | |
13 Returns the same values as generate_tokens() | |
14 | |
15 """ | |
16 last_line = None | |
17 last_lineno = -1 | |
18 last_ttype = None | |
19 for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks: | |
20 if last_lineno != elineno: | |
21 if last_line and last_line[-2:] == "\\\n": | |
22 # We are at the beginning of a new line, and the last line | |
23 # ended with a backslash. We probably have to inject a | |
24 # backslash token into the stream. Unfortunately, there's more | |
25 # to figure out. This code:: | |
26 # | |
27 # usage = """\ | |
28 # HEY THERE | |
29 # """ | |
30 # | |
31 # triggers this condition, but the token text is:: | |
32 # | |
33 # '"""\\\nHEY THERE\n"""' | |
34 # | |
35 # so we need to figure out if the backslash is already in the | |
36 # string token or not. | |
37 inject_backslash = True | |
38 if last_ttype == tokenize.COMMENT: | |
39 # Comments like this \ | |
40 # should never result in a new token. | |
41 inject_backslash = False | |
42 elif ttype == token.STRING: | |
43 if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\': | |
44 # It's a multiline string and the first line ends with | |
45 # a backslash, so we don't need to inject another. | |
46 inject_backslash = False | |
47 if inject_backslash: | |
48 # Figure out what column the backslash is in. | |
49 ccol = len(last_line.split("\n")[-2]) - 1 | |
50 # Yield the token, with a fake token type. | |
51 yield ( | |
52 99999, "\\\n", | |
53 (slineno, ccol), (slineno, ccol+2), | |
54 last_line | |
55 ) | |
56 last_line = ltext | |
57 last_ttype = ttype | |
58 yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext | |
59 last_lineno = elineno | |
60 | |
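# A minimal usage sketch for phys_tokens, illustrative only: the demo
# function and its sample source string are hypothetical, not part of the
# original module. The backslash continuation shows up in the output as an
# extra token with the fake type 99999 and text "\\\n".
def _phys_tokens_demo():                        # pragma: no cover
    """Print the physical tokens of a tiny two-line source."""
    src = "a = 1 + \\\n    2\n"
    toks = tokenize.generate_tokens(StringIO(src).readline)
    for ttype, ttext, start, end, _ in phys_tokens(toks):
        print((ttype, ttext, start, end))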

def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
    line = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line

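# A minimal usage sketch for source_token_lines, illustrative only: the demo
# function and its sample source are hypothetical, not part of the original
# module.
def _source_token_lines_demo():                 # pragma: no cover
    """Print the token-pair list for each line of a tiny function."""
    src = "def hello():\n    return 'hi'\n"
    for line in source_token_lines(src):
        # Each line prints as a list of (class, text) pairs, e.g.
        # [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ...]
        print(line)
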
def source_encoding(source):
    """Determine the encoding for `source` (a string), according to PEP 263.

    Returns a string, the name of the encoding.

    """
    # Note: this function should never be called on Python 3, since py3 has
    # built-in tools to do this.
    assert sys.version_info < (3, 0)

    # This is mostly code adapted from Py3.2's tokenize module.

    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

    # Do this so the detect_encoding code we copied will work.
    readline = iter(source.splitlines()).next

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a utf-8 bom or an encoding
    # cookie as specified in PEP 263. If both a bom and a cookie are present,
    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
    # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned. The
    # default varies with the Python version.

    if sys.version_info <= (2, 4):
        default = 'iso-8859-1'
    else:
        default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default
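
# A minimal usage sketch for source_encoding, illustrative only (and Python 2
# only, like source_encoding itself): the demo function and its sample
# strings are hypothetical, not part of the original module.
def _source_encoding_demo():                    # pragma: no cover
    """Show a declared encoding, and the default when none is declared."""
    print(source_encoding("# -*- coding: utf-8 -*-\nx = 1\n"))  # 'utf-8'
    print(source_encoding("x = 1\n"))  # 'ascii' on Python 2.5+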