Chromium Code Reviews

Side by Side Diff: third_party/coverage/phystokens.py

Issue 63813002: Add python coverage 3.7 to depot tools. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/depot_tools
Patch Set: Created 7 years, 1 month ago
1 """Better tokenizing for coverage.py."""
2
3 import codecs, keyword, re, sys, token, tokenize
4 from coverage.backward import StringIO # pylint: disable=W0622
5
6 def phys_tokens(toks):
7 """Return all physical tokens, even line continuations.
8
9 tokenize.generate_tokens() doesn't return a token for the backslash that
10 continues lines. This wrapper provides those tokens so that we can
11 re-create a faithful representation of the original source.
12
13 Returns the same values as generate_tokens()
14
15 """
16 last_line = None
17 last_lineno = -1
18 last_ttype = None
19 for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
20 if last_lineno != elineno:
21 if last_line and last_line[-2:] == "\\\n":
22 # We are at the beginning of a new line, and the last line
23 # ended with a backslash. We probably have to inject a
24 # backslash token into the stream. Unfortunately, there's more
25 # to figure out. This code::
26 #
27 # usage = """\
28 # HEY THERE
29 # """
30 #
31 # triggers this condition, but the token text is::
32 #
33 # '"""\\\nHEY THERE\n"""'
34 #
35 # so we need to figure out if the backslash is already in the
36 # string token or not.
37 inject_backslash = True
38 if last_ttype == tokenize.COMMENT:
39 # Comments like this \
40 # should never result in a new token.
41 inject_backslash = False
42 elif ttype == token.STRING:
43 if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
44 # It's a multiline string and the first line ends with
45 # a backslash, so we don't need to inject another.
46 inject_backslash = False
47 if inject_backslash:
48 # Figure out what column the backslash is in.
49 ccol = len(last_line.split("\n")[-2]) - 1
50 # Yield the token, with a fake token type.
51 yield (
52 99999, "\\\n",
53 (slineno, ccol), (slineno, ccol+2),
54 last_line
55 )
56 last_line = ltext
57 last_ttype = ttype
58 yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
59 last_lineno = elineno
60
61
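# A minimal usage sketch (not part of the reviewed file), assuming only the
# standard tokenize module and the phys_tokens() wrapper above.  For a source
# line continued with a backslash, tokenize.generate_tokens() emits nothing
# for the backslash itself, while phys_tokens() injects an extra token with
# the synthetic type 99999:
#
#     import tokenize
#     from coverage.backward import StringIO
#
#     src = "x = 1 + \\\n    2\n"
#     toks = tokenize.generate_tokens(StringIO(src).readline)
#     for ttype, ttext, start, end, _ in phys_tokens(toks):
#         print(ttype, repr(ttext), start, end)
#     # ...the output includes a (99999, '\\\n', ...) entry between the
#     # tokens of the first and second physical lines.
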
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
    line = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line

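# A minimal usage sketch (not part of the reviewed file), showing the
# per-line token-class/text pairs described in the docstring above:
#
#     for line in source_token_lines("def hello():\n    return 'hi'\n"):
#         print(line)
#     # [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ('op', ')'), ('op', ':')]
#     # [('ws', '    '), ('key', 'return'), ('ws', ' '), ('str', "'hi'")]
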
def source_encoding(source):
    """Determine the encoding for `source` (a string), according to PEP 263.

    Returns a string, the name of the encoding.

    """
    # Note: this function should never be called on Python 3, since py3 has
    # built-in tools to do this.
    assert sys.version_info < (3, 0)

    # This is mostly code adapted from Py3.2's tokenize module.

    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

    # Do this so the detect_encode code we copied will work.
    readline = iter(source.splitlines(True)).next

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encode():
    # It detects the encoding from the presence of a utf-8 bom or an encoding
    # cookie as specified in pep-0263.  If both a bom and a cookie are present,
    # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    # invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.  The
    # default varied with version.

    if sys.version_info <= (2, 4):
        default = 'iso-8859-1'
    else:
        default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default
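
# A minimal usage sketch (not part of the reviewed file): on the Python 2
# path this module targets, source_encoding() reports a PEP 263 coding
# cookie found in the first two lines, and otherwise falls back to the
# version-dependent default:
#
#     print(source_encoding("# -*- coding: utf-8 -*-\nx = 1\n"))  # 'utf-8'
#     print(source_encoding("x = 1\n"))                           # 'ascii' on 2.5+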