Index: tools/telemetry/third_party/coverage/coverage/phystokens.py
diff --git a/third_party/pycoverage/coverage/phystokens.py b/tools/telemetry/third_party/coverage/coverage/phystokens.py
similarity index 61%
copy from third_party/pycoverage/coverage/phystokens.py
copy to tools/telemetry/third_party/coverage/coverage/phystokens.py
index 99b1d5ba0c79771e43338cc8a37ce09e7085d7e2..7092d39e2b055b1bb875041c590c59508e8c9f33 100644
--- a/third_party/pycoverage/coverage/phystokens.py
+++ b/tools/telemetry/third_party/coverage/coverage/phystokens.py
@@ -1,8 +1,17 @@
+# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
+# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt
+
 """Better tokenizing for coverage.py."""

-import codecs, keyword, re, sys, token, tokenize
-from coverage.backward import set # pylint: disable=W0622
-from coverage.parser import generate_tokens
+import codecs
+import keyword
+import re
+import token
+import tokenize
+
+from coverage import env
+from coverage.backward import iternext
+from coverage.misc import contract


 def phys_tokens(toks):
@@ -43,7 +52,7 @@ def phys_tokens(toks):
                     inject_backslash = False
                 elif ttype == token.STRING:
                     if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
-                        # It's a multiline string and the first line ends with
+                        # It's a multi-line string and the first line ends with
                         # a backslash, so we don't need to inject another.
                         inject_backslash = False
                 if inject_backslash:
@@ -61,6 +70,7 @@ def phys_tokens(toks):
         last_lineno = elineno


+@contract(source='unicode')
 def source_token_lines(source):
     """Generate a series of lines, one for each line in `source`.

@@ -76,11 +86,15 @@ def source_token_lines(source):
     is indistinguishable from a final line with a newline.

     """
+
     ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
     line = []
     col = 0
-    source = source.expandtabs(8).replace('\r\n', '\n')
+
+    # The \f is because of http://bugs.python.org/issue19035
+    source = source.expandtabs(8).replace('\r\n', '\n').replace('\f', ' ')
     tokgen = generate_tokens(source)
+
     for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
         mark_start = True
         for part in re.split('(\n)', ttext):
@@ -95,7 +109,7 @@ def source_token_lines(source):
                 mark_end = False
             else:
                 if mark_start and scol > col:
-                    line.append(("ws", " " * (scol - col)))
+                    line.append(("ws", u" " * (scol - col)))
                     mark_start = False
                 tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                 if ttype == token.NAME and keyword.iskeyword(ttext):
@@ -109,22 +123,52 @@ def source_token_lines(source):
     if line:
         yield line

-def source_encoding(source):
-    """Determine the encoding for `source` (a string), according to PEP 263.

-    Returns a string, the name of the encoding.
+class CachedTokenizer(object):
+    """A one-element cache around tokenize.generate_tokens.
+
+    When reporting, coverage.py tokenizes files twice, once to find the
+    structure of the file, and once to syntax-color it. Tokenizing is
+    expensive, and easily cached.
+
+    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
+    actually tokenize twice.

     """
-    # Note: this function should never be called on Python 3, since py3 has
-    # built-in tools to do this.
-    assert sys.version_info < (3, 0)
+    def __init__(self):
+        self.last_text = None
+        self.last_tokens = None
+
+    @contract(text='unicode')
+    def generate_tokens(self, text):
+        """A stand-in for `tokenize.generate_tokens`."""
+        if text != self.last_text:
+            self.last_text = text
+            readline = iternext(text.splitlines(True))
+            self.last_tokens = list(tokenize.generate_tokens(readline))
+        return self.last_tokens
+
+# Create our generate_tokens cache as a callable replacement function.
+generate_tokens = CachedTokenizer().generate_tokens

-    # This is mostly code adapted from Py3.2's tokenize module.

-    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
+COOKIE_RE = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.MULTILINE)
+
+@contract(source='bytes')
+def _source_encoding_py2(source):
+    """Determine the encoding for `source`, according to PEP 263.
+
+    `source` is a byte string, the text of the program.
+
+    Returns a string, the name of the encoding.
+
+    """
+    assert isinstance(source, bytes)

     # Do this so the detect_encode code we copied will work.
-    readline = iter(source.splitlines(True)).next
+    readline = iternext(source.splitlines(True))
+
+    # This is mostly code adapted from Py3.2's tokenize module.

     def _get_normal_name(orig_enc):
         """Imitates get_normal_name in tokenizer.c."""
@@ -137,19 +181,14 @@ def source_encoding(source):
         return orig_enc

     # From detect_encode():
-    # It detects the encoding from the presence of a utf-8 bom or an encoding
-    # cookie as specified in pep-0263. If both a bom and a cookie are present,
+    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
+    # cookie as specified in PEP-0263. If both a BOM and a cookie are present,
     # but disagree, a SyntaxError will be raised. If the encoding cookie is an
-    # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+    # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
     # 'utf-8-sig' is returned.

-    # If no encoding is specified, then the default will be returned. The
-    # default varied with version.
-
-    if sys.version_info <= (2, 4):
-        default = 'iso-8859-1'
-    else:
-        default = 'ascii'
+    # If no encoding is specified, then the default will be returned.
+    default = 'ascii'

     bom_found = False
     encoding = None
@@ -168,21 +207,21 @@ def source_encoding(source):
         except UnicodeDecodeError:
             return None

-        matches = cookie_re.findall(line_string)
+        matches = COOKIE_RE.findall(line_string)
         if not matches:
             return None
         encoding = _get_normal_name(matches[0])
         try:
             codec = codecs.lookup(encoding)
         except LookupError:
-            # This behaviour mimics the Python interpreter
+            # This behavior mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

         if bom_found:
             # codecs in 2.3 were raw tuples of functions, assume the best.
             codec_name = getattr(codec, 'name', encoding)
             if codec_name != 'utf-8':
-                # This behaviour mimics the Python interpreter
+                # This behavior mimics the Python interpreter
                 raise SyntaxError('encoding problem: utf-8')
             encoding += '-sig'
         return encoding
@@ -208,3 +247,57 @@ def source_encoding(source):
         return encoding

     return default
+
+
+@contract(source='bytes')
+def _source_encoding_py3(source):
+    """Determine the encoding for `source`, according to PEP 263.
+
+    `source` is a byte string, the text of the program.
+
+    Returns a string, the name of the encoding.
+
+    """
+    readline = iternext(source.splitlines(True))
+    return tokenize.detect_encoding(readline)[0]
+
+
+if env.PY3:
+    source_encoding = _source_encoding_py3
+else:
+    source_encoding = _source_encoding_py2
+
+
+@contract(source='unicode')
+def compile_unicode(source, filename, mode):
+    """Just like the `compile` builtin, but works on any Unicode string.
+
+    Python 2's compile() builtin has a stupid restriction: if the source string
+    is Unicode, then it may not have an encoding declaration in it. Why not?
+    Who knows!
+
+    This function catches that exception, neuters the coding declaration, and
+    compiles it anyway.
+
+    """
+    try:
+        code = compile(source, filename, mode)
+    except SyntaxError as synerr:
+        if "coding declaration in unicode string" not in synerr.args[0].lower():
+            raise
+        source = neuter_encoding_declaration(source)
+        code = compile(source, filename, mode)
+
+    return code
+
+
+@contract(source='unicode', returns='unicode')
+def neuter_encoding_declaration(source):
+    """Return `source`, with any encoding declaration neutered.
+
+    This function will only ever be called on `source` that has an encoding
+    declaration, so some edge cases can be ignored.
+
+    """
+    source = COOKIE_RE.sub("# (deleted declaration)", source)
+    return source
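
Reviewer note, not part of the patch: the CachedTokenizer above exists because
reporting tokenizes every file twice in a row. A minimal sketch of the intended
behavior, assuming the patched coverage package is importable and using a
made-up source string::

    from coverage.phystokens import generate_tokens

    src = u"def hello():\n    return 'hi'\n"
    first = generate_tokens(src)    # tokenizes `src` and caches the result
    second = generate_tokens(src)   # one-element cache hit; nothing re-tokenized
    assert first is second          # the very same list object comes back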
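
Similarly, source_encoding() now dispatches by interpreter: _source_encoding_py3
defers to tokenize.detect_encoding(), while _source_encoding_py2 keeps the logic
copied from Py3.2's tokenize module. A sketch of the expected results, again
assuming the patched module and made-up byte strings::

    from coverage.phystokens import source_encoding

    # An explicit PEP 263 cookie wins on either version.
    assert source_encoding(b"# coding: iso-8859-1\n") == 'iso-8859-1'

    # A UTF-8 byte-order mark yields 'utf-8-sig'.
    assert source_encoding(b'\xef\xbb\xbfx = 1\n') == 'utf-8-sig'

    # With no cookie and no BOM the defaults differ: 'ascii' on the
    # Python 2 path, 'utf-8' from tokenize.detect_encoding() on Python 3.
    print(source_encoding(b"x = 1\n"))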
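
And compile_unicode() works around Python 2's refusal to compile() a Unicode
string that still contains a coding cookie. A sketch, assuming the patched
module and a made-up sample::

    from coverage.phystokens import compile_unicode, neuter_encoding_declaration

    src = u"# coding: utf-8\nx = 1\n"
    code = compile_unicode(src, "<sample>", "exec")  # succeeds on both 2 and 3
    exec(code)

    # This is the neutering that makes Python 2's compile() accept it:
    assert neuter_encoding_declaration(src) == u"# (deleted declaration)\nx = 1\n"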