| 1 """Better tokenizing for coverage.py.""" | |
| 2 | |
| 3 import codecs, keyword, re, sys, token, tokenize | |
| 4 from coverage.backward import StringIO # pylint: disable=W0622 | |
| 5 | |
| 6 def phys_tokens(toks): | |
| 7 """Return all physical tokens, even line continuations. | |
| 8 | |
| 9 tokenize.generate_tokens() doesn't return a token for the backslash that | |
| 10 continues lines. This wrapper provides those tokens so that we can | |
| 11 re-create a faithful representation of the original source. | |
| 12 | |
| 13 Returns the same values as generate_tokens() | |
| 14 | |
| 15 """ | |
| 16 last_line = None | |
| 17 last_lineno = -1 | |
| 18 last_ttype = None | |
| 19 for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks: | |
| 20 if last_lineno != elineno: | |
| 21 if last_line and last_line[-2:] == "\\\n": | |
| 22 # We are at the beginning of a new line, and the last line | |
| 23 # ended with a backslash. We probably have to inject a | |
| 24 # backslash token into the stream. Unfortunately, there's more | |
| 25 # to figure out. This code:: | |
| 26 # | |
| 27 # usage = """\ | |
| 28 # HEY THERE | |
| 29 # """ | |
| 30 # | |
| 31 # triggers this condition, but the token text is:: | |
| 32 # | |
| 33 # '"""\\\nHEY THERE\n"""' | |
| 34 # | |
| 35 # so we need to figure out if the backslash is already in the | |
| 36 # string token or not. | |
| 37 inject_backslash = True | |
| 38 if last_ttype == tokenize.COMMENT: | |
| 39 # Comments like this \ | |
| 40 # should never result in a new token. | |
| 41 inject_backslash = False | |
| 42 elif ttype == token.STRING: | |
| 43 if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\': | |
| 44 # It's a multiline string and the first line ends with | |
| 45 # a backslash, so we don't need to inject another. | |
| 46 inject_backslash = False | |
| 47 if inject_backslash: | |
| 48 # Figure out what column the backslash is in. | |
| 49 ccol = len(last_line.split("\n")[-2]) - 1 | |
| 50 # Yield the token, with a fake token type. | |
| 51 yield ( | |
| 52 99999, "\\\n", | |
| 53 (slineno, ccol), (slineno, ccol+2), | |
| 54 last_line | |
| 55 ) | |
| 56 last_line = ltext | |
| 57 last_ttype = ttype | |
| 58 yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext | |
| 59 last_lineno = elineno | |
| 60 | |
| 61 | |
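# A minimal illustrative sketch, not part of the original module: it shows how
# the fake 99999 continuation tokens injected by phys_tokens() surface in the
# stream. The helper name `_count_continuations` is hypothetical and exists
# only as an example.
def _count_continuations(source):
    """Count the backslash-continuation tokens phys_tokens() injects."""
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    return len([tok for tok in phys_tokens(tokgen) if tok[0] == 99999])

# For example, _count_continuations("a = 1 + \\\n    2\n") should be 1,
# since generate_tokens() alone never reports the trailing backslash.
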
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
    line = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line

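# A minimal illustrative sketch, not part of the original module: it rebuilds
# text from the token lines, exercising the round-trip property described in
# the docstring above (trailing whitespace and the final newline are lost;
# tabs are expanded and CRLF is normalized by source_token_lines() itself).
# The helper name `_rebuild_source` is hypothetical.
def _rebuild_source(source):
    """Re-join the token texts produced by source_token_lines()."""
    text_lines = []
    for token_line in source_token_lines(source):
        text_lines.append("".join([text for _, text in token_line]))
    return "\n".join(text_lines)
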
def source_encoding(source):
    """Determine the encoding for `source` (a string), according to PEP 263.

    Returns a string, the name of the encoding.

    """
    # Note: this function should never be called on Python 3, since py3 has
    # built-in tools to do this.
    assert sys.version_info < (3, 0)

    # This is mostly code adapted from Py3.2's tokenize module.

    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

    # Do this so the detect_encode code we copied will work.
    readline = iter(source.splitlines()).next

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encode():
    # It detects the encoding from the presence of a utf-8 bom or an encoding
    # cookie as specified in pep-0263. If both a bom and a cookie are present,
    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
    # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned. The
    # default varied with version.

    if sys.version_info <= (2, 4):
        default = 'iso-8859-1'
    else:
        default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default
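
# Illustrative examples, not part of the original module. They assume
# Python 2.5 or later, so the no-cookie default is 'ascii':
#
#     source_encoding("# coding: latin-1\nx = 1\n")
#     # -> 'iso-8859-1' (the cookie name is normalized by _get_normal_name)
#
#     source_encoding("x = 1\ny = 2\n# coding: utf-8\n")
#     # -> 'ascii' (only the first two lines are searched for a cookie)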