| OLD | NEW |
| 1 """Better tokenizing for coverage.py.""" | 1 """Better tokenizing for coverage.py.""" |
| 2 | 2 |
| 3 import codecs, keyword, re, sys, token, tokenize | 3 import codecs, keyword, re, sys, token, tokenize |
| 4 from coverage.backward import StringIO # pylint: disable=W0622 | 4 from coverage.backward import set # pylint: disable=W0622 |
| 5 from coverage.parser import generate_tokens |
| 6 |
| 5 | 7 |
| 6 def phys_tokens(toks): | 8 def phys_tokens(toks): |
| 7 """Return all physical tokens, even line continuations. | 9 """Return all physical tokens, even line continuations. |
| 8 | 10 |
| 9 tokenize.generate_tokens() doesn't return a token for the backslash that | 11 tokenize.generate_tokens() doesn't return a token for the backslash that |
| 10 continues lines. This wrapper provides those tokens so that we can | 12 continues lines. This wrapper provides those tokens so that we can |
| 11 re-create a faithful representation of the original source. | 13 re-create a faithful representation of the original source. |
| 12 | 14 |
| 13 Returns the same values as generate_tokens() | 15 Returns the same values as generate_tokens() |
| 14 | 16 |
| 15 """ | 17 """ |
| 16 last_line = None | 18 last_line = None |
| 17 last_lineno = -1 | 19 last_lineno = -1 |
| 18 last_ttype = None | 20 last_ttype = None |
| 19 for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks: | 21 for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks: |
| 20 if last_lineno != elineno: | 22 if last_lineno != elineno: |
| 21 if last_line and last_line[-2:] == "\\\n": | 23 if last_line and last_line.endswith("\\\n"): |
| 22 # We are at the beginning of a new line, and the last line | 24 # We are at the beginning of a new line, and the last line |
| 23 # ended with a backslash. We probably have to inject a | 25 # ended with a backslash. We probably have to inject a |
| 24 # backslash token into the stream. Unfortunately, there's more | 26 # backslash token into the stream. Unfortunately, there's more |
| 25 # to figure out. This code:: | 27 # to figure out. This code:: |
| 26 # | 28 # |
| 27 # usage = """\ | 29 # usage = """\ |
| 28 # HEY THERE | 30 # HEY THERE |
| 29 # """ | 31 # """ |
| 30 # | 32 # |
| 31 # triggers this condition, but the token text is:: | 33 # triggers this condition, but the token text is:: |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 67 [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ] | 69 [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ] |
| 68 | 70 |
| 69 Each pair has a token class, and the token text. | 71 Each pair has a token class, and the token text. |
| 70 | 72 |
| 71 If you concatenate all the token texts, and then join them with newlines, | 73 If you concatenate all the token texts, and then join them with newlines, |
| 72 you should have your original `source` back, with two differences: | 74 you should have your original `source` back, with two differences: |
| 73 trailing whitespace is not preserved, and a final line with no newline | 75 trailing whitespace is not preserved, and a final line with no newline |
| 74 is indistinguishable from a final line with a newline. | 76 is indistinguishable from a final line with a newline. |
| 75 | 77 |
| 76 """ | 78 """ |
| 77 ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL] | 79 ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]) |
| 78 line = [] | 80 line = [] |
| 79 col = 0 | 81 col = 0 |
| 80 source = source.expandtabs(8).replace('\r\n', '\n') | 82 source = source.expandtabs(8).replace('\r\n', '\n') |
| 81 tokgen = tokenize.generate_tokens(StringIO(source).readline) | 83 tokgen = generate_tokens(source) |
| 82 for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen): | 84 for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen): |
| 83 mark_start = True | 85 mark_start = True |
| 84 for part in re.split('(\n)', ttext): | 86 for part in re.split('(\n)', ttext): |
| 85 if part == '\n': | 87 if part == '\n': |
| 86 yield line | 88 yield line |
| 87 line = [] | 89 line = [] |
| 88 col = 0 | 90 col = 0 |
| 89 mark_end = False | 91 mark_end = False |
| 90 elif part == '': | 92 elif part == '': |
| 91 mark_end = False | 93 mark_end = False |
| (...skipping 23 matching lines...) Expand all Loading... |
| 115 """ | 117 """ |
| 116 # Note: this function should never be called on Python 3, since py3 has | 118 # Note: this function should never be called on Python 3, since py3 has |
| 117 # built-in tools to do this. | 119 # built-in tools to do this. |
| 118 assert sys.version_info < (3, 0) | 120 assert sys.version_info < (3, 0) |
| 119 | 121 |
| 120 # This is mostly code adapted from Py3.2's tokenize module. | 122 # This is mostly code adapted from Py3.2's tokenize module. |
| 121 | 123 |
| 122 cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)") | 124 cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)") |
| 123 | 125 |
| 124 # Do this so the detect_encode code we copied will work. | 126 # Do this so the detect_encode code we copied will work. |
| 125 readline = iter(source.splitlines()).next | 127 readline = iter(source.splitlines(True)).next |
| 126 | 128 |
| 127 def _get_normal_name(orig_enc): | 129 def _get_normal_name(orig_enc): |
| 128 """Imitates get_normal_name in tokenizer.c.""" | 130 """Imitates get_normal_name in tokenizer.c.""" |
| 129 # Only care about the first 12 characters. | 131 # Only care about the first 12 characters. |
| 130 enc = orig_enc[:12].lower().replace("_", "-") | 132 enc = orig_enc[:12].lower().replace("_", "-") |
| 131 if re.match(r"^utf-8($|-)", enc): | 133 if re.match(r"^utf-8($|-)", enc): |
| 132 return "utf-8" | 134 return "utf-8" |
| 133 if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc): | 135 if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc): |
| 134 return "iso-8859-1" | 136 return "iso-8859-1" |
| 135 return orig_enc | 137 return orig_enc |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 170 if not matches: | 172 if not matches: |
| 171 return None | 173 return None |
| 172 encoding = _get_normal_name(matches[0]) | 174 encoding = _get_normal_name(matches[0]) |
| 173 try: | 175 try: |
| 174 codec = codecs.lookup(encoding) | 176 codec = codecs.lookup(encoding) |
| 175 except LookupError: | 177 except LookupError: |
| 176 # This behaviour mimics the Python interpreter | 178 # This behaviour mimics the Python interpreter |
| 177 raise SyntaxError("unknown encoding: " + encoding) | 179 raise SyntaxError("unknown encoding: " + encoding) |
| 178 | 180 |
| 179 if bom_found: | 181 if bom_found: |
| 180 if codec.name != 'utf-8': | 182 # codecs in 2.3 were raw tuples of functions, assume the best. |
| 183 codec_name = getattr(codec, 'name', encoding) |
| 184 if codec_name != 'utf-8': |
| 181 # This behaviour mimics the Python interpreter | 185 # This behaviour mimics the Python interpreter |
| 182 raise SyntaxError('encoding problem: utf-8') | 186 raise SyntaxError('encoding problem: utf-8') |
| 183 encoding += '-sig' | 187 encoding += '-sig' |
| 184 return encoding | 188 return encoding |
| 185 | 189 |
| 186 first = read_or_stop() | 190 first = read_or_stop() |
| 187 if first.startswith(codecs.BOM_UTF8): | 191 if first.startswith(codecs.BOM_UTF8): |
| 188 bom_found = True | 192 bom_found = True |
| 189 first = first[3:] | 193 first = first[3:] |
| 190 default = 'utf-8-sig' | 194 default = 'utf-8-sig' |
| 191 if not first: | 195 if not first: |
| 192 return default | 196 return default |
| 193 | 197 |
| 194 encoding = find_cookie(first) | 198 encoding = find_cookie(first) |
| 195 if encoding: | 199 if encoding: |
| 196 return encoding | 200 return encoding |
| 197 | 201 |
| 198 second = read_or_stop() | 202 second = read_or_stop() |
| 199 if not second: | 203 if not second: |
| 200 return default | 204 return default |
| 201 | 205 |
| 202 encoding = find_cookie(second) | 206 encoding = find_cookie(second) |
| 203 if encoding: | 207 if encoding: |
| 204 return encoding | 208 return encoding |
| 205 | 209 |
| 206 return default | 210 return default |
| OLD | NEW |