```python
"""Better tokenizing for coverage.py."""

import codecs, keyword, re, sys, token, tokenize
from coverage.backward import StringIO              # pylint: disable=W0622

def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line[-2:] == "\\\n":
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multiline string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
            last_line = ltext
            last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
```
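To make the wrapper's effect concrete, here is an illustrative sketch (not part of the file) that tokenizes a line ending in a backslash continuation both with and without `phys_tokens()`. It assumes the function above is in scope and uses `io.StringIO` for brevity, whereas the module itself goes through `coverage.backward.StringIO`:

```python
# Illustrative sketch only -- not part of the module above.
import tokenize
from io import StringIO   # the module itself uses coverage.backward.StringIO

src = "a = 1 + \\\n    2\n"

plain = list(tokenize.generate_tokens(StringIO(src).readline))
full = list(phys_tokens(tokenize.generate_tokens(StringIO(src).readline)))

# phys_tokens() injects one fake token (type 99999, text "\\\n") for the
# backslash continuation, so the wrapped stream is one token longer.
assert len(full) == len(plain) + 1
```

`source_token_lines()` below is the consumer of this wrapper.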
```python
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
    line = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
```
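As an illustration of the output format (again, not part of the file), the following sketch runs a hypothetical two-line snippet through `source_token_lines()` and checks the round-trip property described in the docstring:

```python
# Illustrative sketch only -- assumes source_token_lines() is in scope.
src = "def hello():\n    return 'hi'\n"

for line in source_token_lines(src):
    print(line)
# Expected shape (class names are the first three letters of the lowercased
# tokenize token names, with keyword NAME tokens reported as 'key'):
#   [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ('op', ')'), ('op', ':')]
#   [('ws', '    '), ('key', 'return'), ('ws', ' '), ('str', "'hi'")]

# The docstring's round-trip property: re-joining the token texts recovers
# the source, modulo the trailing-newline caveat.
rebuilt = "\n".join("".join(text for _, text in line)
                    for line in source_token_lines(src))
assert rebuilt == src.rstrip("\n")
```

The last function, `source_encoding()`, handles a separate concern: finding the declared encoding of a source file on Python 2.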
```python
def source_encoding(source):
    """Determine the encoding for `source` (a string), according to PEP 263.

    Returns a string, the name of the encoding.

    """
    # Note: this function should never be called on Python 3, since py3 has
    # built-in tools to do this.
    assert sys.version_info < (3, 0)

    # This is mostly code adapted from Py3.2's tokenize module.

    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

    # Do this so the detect_encoding code we copied will work.
    readline = iter(source.splitlines(True)).next

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a utf-8 bom or an encoding
    # cookie as specified in pep-0263.  If both a bom and a cookie are present,
    # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    # invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.  The
    # default varied with version.

    if sys.version_info <= (2, 4):
        default = 'iso-8859-1'
    else:
        default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default
```
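Finally, an illustrative sketch of `source_encoding()` (not part of the file). Because of the `assert` at the top of the function, this applies only to Python 2, where source strings are byte strings; the sample inputs are hypothetical:

```python
# Illustrative sketch only -- Python 2, matching the assert in the function.
import codecs

print(source_encoding("x = 1\n"))                             # 'ascii' on 2.5+
print(source_encoding("# -*- coding: latin_1 -*-\nx = 1\n"))  # 'iso-8859-1'
print(source_encoding(codecs.BOM_UTF8 + "x = 1\n"))           # 'utf-8-sig'
```

The `latin_1` cookie spelling is normalized to `iso-8859-1` by `_get_normal_name()`, and a UTF-8 BOM with no cookie yields `'utf-8-sig'`.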