1 """Better tokenizing for coverage.py.""" | |
2 | |
3 import keyword, re, token, tokenize | |
4 from coverage.backward import StringIO # pylint: disable=W0622 | |
5 | |
def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens().

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line[-2:] == "\\\n":
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream.  Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multiline string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
        last_line = ltext
        last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno


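# The sketch below is an illustration (not part of the original module): it
# shows the synthetic continuation token that phys_tokens() injects.  The
# returned pairs include a (99999, "\\\n") entry that
# tokenize.generate_tokens() alone never produces.
def _phys_tokens_demo():
    """Illustrative sketch only: tokenize a backslash line continuation."""
    src = "a = 1 + \\\n    2\n"
    toks = tokenize.generate_tokens(StringIO(src).readline)
    # The injected backslash token appears between the '+' on line 1 and
    # the '2' on line 2, carrying the fake token type 99999.
    return [(ttype, ttext) for ttype, ttext, _, _, _ in phys_tokens(toks)]

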
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs; each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair is the token class, and the token text.

    If you concatenate the texts of the tokens on each line, and join the
    lines with newlines, you should have your original `source` back, with
    two differences: trailing whitespace is not preserved, and a final line
    with no newline is indistinguishable from a final line with a newline.

    """
    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
    line = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        # Split the token text on newlines, keeping the newlines themselves
        # as parts, so multi-line tokens are spread over output lines.
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            # Parts after the first begin on a new physical line, so they
            # start at column 0.
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
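

# Another illustrative sketch, not part of the original module: run
# source_token_lines() over a tiny program and check the round-trip
# property promised in its docstring.
if __name__ == '__main__':
    _SAMPLE = "def hello():\n    print('hi')\n"
    _token_lines = list(source_token_lines(_SAMPLE))
    for _line in _token_lines:
        # Each line looks like [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ...]
        print(_line)
    # Joining the token texts line by line reproduces the source, modulo
    # the final newline, as the docstring notes.
    _rebuilt = "\n".join("".join(text for _, text in tl) for tl in _token_lines)
    assert _rebuilt + "\n" == _SAMPLE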