"""Better tokenizing for coverage.py."""

import keyword, re, token, tokenize
from coverage.backward import StringIO # pylint: disable=W0622
def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    `toks` is an iterable of 5-tuples as produced by generate_tokens():
    (type, text, (start_line, start_col), (end_line, end_col), line_text).

    Yields the same 5-tuples as generate_tokens(), plus synthetic tokens of
    type 99999 with text "\\\n" for each continuation backslash.

    """
    last_line = None        # physical line text of the previous token.
    last_lineno = -1        # end line number of the previous token.
    last_ttype = None       # token type of the previous token.
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line[-2:] == "\\\n":
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream.  Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #               HEY THERE
                #               """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    # Use .endswith rather than indexing [-1] so an empty
                    # first segment can't raise IndexError.
                    if "\n" in ttext and ttext.split('\n', 1)[0].endswith('\\'):
                        # It's a multiline string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
            last_line = ltext
            last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno


def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    # Token types that produce layout rather than visible text.
    whitespace_types = (token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL)
    current = []    # pairs accumulated for the line being built.
    col = 0         # column reached so far on the current line.
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, start_col), (_, end_col), _ in phys_tokens(tokgen):
        starting = True
        for piece in re.split('(\n)', ttext):
            remember_end = False
            if piece == '\n':
                # A physical line boundary inside (or after) the token.
                yield current
                current = []
                col = 0
            elif piece == '' or ttype in whitespace_types:
                # Nothing visible to emit.
                pass
            else:
                if starting and start_col > col:
                    # Pad the gap up to this token's start column.
                    current.append(("ws", " " * (start_col - col)))
                    starting = False
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    piece_class = "key"
                else:
                    piece_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                current.append((piece_class, piece))
                remember_end = True
            # Continuation pieces of a multi-line token begin at column 0.
            start_col = 0
        if remember_end:
            col = end_col

    if current:
        yield current