Index: tools/telemetry/third_party/coverage/coverage/phystokens.py
diff --git a/third_party/pycoverage/coverage/phystokens.py b/tools/telemetry/third_party/coverage/coverage/phystokens.py
similarity index 61%
copy from third_party/pycoverage/coverage/phystokens.py
copy to tools/telemetry/third_party/coverage/coverage/phystokens.py
index 99b1d5ba0c79771e43338cc8a37ce09e7085d7e2..7092d39e2b055b1bb875041c590c59508e8c9f33 100644
--- a/third_party/pycoverage/coverage/phystokens.py
+++ b/tools/telemetry/third_party/coverage/coverage/phystokens.py
@@ -1,8 +1,17 @@
+# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
+# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt
+
"""Better tokenizing for coverage.py."""
-import codecs, keyword, re, sys, token, tokenize
-from coverage.backward import set # pylint: disable=W0622
-from coverage.parser import generate_tokens
+import codecs
+import keyword
+import re
+import token
+import tokenize
+
+from coverage import env
+from coverage.backward import iternext
+from coverage.misc import contract
def phys_tokens(toks):
@@ -43,7 +52,7 @@ def phys_tokens(toks):
inject_backslash = False
elif ttype == token.STRING:
if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
- # It's a multiline string and the first line ends with
+ # It's a multi-line string and the first line ends with
# a backslash, so we don't need to inject another.
inject_backslash = False
if inject_backslash:
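
As a hedged illustration of the backslash handling in this hunk (assuming the patched file is importable as coverage.phystokens), phys_tokens() re-yields a synthetic token for the backslash-newline that tokenize itself drops:

    from coverage.phystokens import generate_tokens, phys_tokens

    source = u"total = 1 + \\\n    2\n"
    for ttype, ttext, _, _, _ in phys_tokens(generate_tokens(source)):
        print(ttype, repr(ttext))
    # Alongside the ordinary tokens, one extra token carrying '\\\n' is
    # emitted for the continuation, so every physical line is represented.
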
@@ -61,6 +70,7 @@ def phys_tokens(toks):
last_lineno = elineno
+@contract(source='unicode')
def source_token_lines(source):
"""Generate a series of lines, one for each line in `source`.
@@ -76,11 +86,15 @@ def source_token_lines(source):
is indistinguishable from a final line with a newline.
"""
+
ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
line = []
col = 0
- source = source.expandtabs(8).replace('\r\n', '\n')
+
+ # The \f is because of http://bugs.python.org/issue19035
+ source = source.expandtabs(8).replace('\r\n', '\n').replace('\f', ' ')
tokgen = generate_tokens(source)
+
for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
mark_start = True
for part in re.split('(\n)', ttext):
@@ -95,7 +109,7 @@ def source_token_lines(source):
mark_end = False
else:
if mark_start and scol > col:
- line.append(("ws", " " * (scol - col)))
+ line.append(("ws", u" " * (scol - col)))
mark_start = False
tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
if ttype == token.NAME and keyword.iskeyword(ttext):
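
A small sketch of what source_token_lines() yields (import path assumed as above): one list per source line, each a sequence of (token_class, text) pairs used for syntax coloring.

    from coverage.phystokens import source_token_lines

    for line in source_token_lines(u"if x:\n    return 1\n"):
        print(line)
    # Roughly:
    #   [('key', 'if'), ('ws', ' '), ('nam', 'x'), ('op', ':')]
    #   [('ws', '    '), ('key', 'return'), ('ws', ' '), ('num', '1')]
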
@@ -109,22 +123,52 @@ def source_token_lines(source):
if line:
yield line
-def source_encoding(source):
- """Determine the encoding for `source` (a string), according to PEP 263.
- Returns a string, the name of the encoding.
+class CachedTokenizer(object):
+ """A one-element cache around tokenize.generate_tokens.
+
+ When reporting, coverage.py tokenizes files twice, once to find the
+ structure of the file, and once to syntax-color it. Tokenizing is
+ expensive, and easily cached.
+
+ This is a one-element cache so that our twice-in-a-row tokenizing doesn't
+ actually tokenize twice.
"""
- # Note: this function should never be called on Python 3, since py3 has
- # built-in tools to do this.
- assert sys.version_info < (3, 0)
+ def __init__(self):
+ self.last_text = None
+ self.last_tokens = None
+
+ @contract(text='unicode')
+ def generate_tokens(self, text):
+ """A stand-in for `tokenize.generate_tokens`."""
+ if text != self.last_text:
+ self.last_text = text
+ readline = iternext(text.splitlines(True))
+ self.last_tokens = list(tokenize.generate_tokens(readline))
+ return self.last_tokens
+
+# Create our generate_tokens cache as a callable replacement function.
+generate_tokens = CachedTokenizer().generate_tokens
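
A minimal sketch of the one-element cache this hunk introduces (same assumed import path): asking for the tokens of the same text twice in a row hands back the identical list without re-tokenizing.

    from coverage.phystokens import generate_tokens

    text = u"a = 1\nb = 2\n"
    first = generate_tokens(text)
    second = generate_tokens(text)
    assert first is second  # cache hit: the very same token list is reused
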
- # This is mostly code adapted from Py3.2's tokenize module.
- cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
+COOKIE_RE = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.MULTILINE)
+
+@contract(source='bytes')
+def _source_encoding_py2(source):
+ """Determine the encoding for `source`, according to PEP 263.
+
+ `source` is a byte string, the text of the program.
+
+ Returns a string, the name of the encoding.
+
+ """
+ assert isinstance(source, bytes)
# Do this so the detect_encode code we copied will work.
- readline = iter(source.splitlines(True)).next
+ readline = iternext(source.splitlines(True))
+
+ # This is mostly code adapted from Py3.2's tokenize module.
def _get_normal_name(orig_enc):
"""Imitates get_normal_name in tokenizer.c."""
@@ -137,19 +181,14 @@ def source_encoding(source):
return orig_enc
# From detect_encode():
- # It detects the encoding from the presence of a utf-8 bom or an encoding
- # cookie as specified in pep-0263. If both a bom and a cookie are present,
+ # It detects the encoding from the presence of a UTF-8 BOM or an encoding
+ # cookie as specified in PEP-0263. If both a BOM and a cookie are present,
# but disagree, a SyntaxError will be raised. If the encoding cookie is an
- # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+ # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
# 'utf-8-sig' is returned.
- # If no encoding is specified, then the default will be returned. The
- # default varied with version.
-
- if sys.version_info <= (2, 4):
- default = 'iso-8859-1'
- else:
- default = 'ascii'
+ # If no encoding is specified, then the default will be returned.
+ default = 'ascii'
bom_found = False
encoding = None
@@ -168,21 +207,21 @@ def source_encoding(source):
except UnicodeDecodeError:
return None
- matches = cookie_re.findall(line_string)
+ matches = COOKIE_RE.findall(line_string)
if not matches:
return None
encoding = _get_normal_name(matches[0])
try:
codec = codecs.lookup(encoding)
except LookupError:
- # This behaviour mimics the Python interpreter
+ # This behavior mimics the Python interpreter
raise SyntaxError("unknown encoding: " + encoding)
if bom_found:
# codecs in 2.3 were raw tuples of functions, assume the best.
codec_name = getattr(codec, 'name', encoding)
if codec_name != 'utf-8':
- # This behaviour mimics the Python interpreter
+ # This behavior mimics the Python interpreter
raise SyntaxError('encoding problem: utf-8')
encoding += '-sig'
return encoding
@@ -208,3 +247,57 @@ def source_encoding(source):
return encoding
return default
+
+
+@contract(source='bytes')
+def _source_encoding_py3(source):
+ """Determine the encoding for `source`, according to PEP 263.
+
+ `source` is a byte string: the text of the program.
+
+ Returns a string, the name of the encoding.
+
+ """
+ readline = iternext(source.splitlines(True))
+ return tokenize.detect_encoding(readline)[0]
+
+
+if env.PY3:
+ source_encoding = _source_encoding_py3
+else:
+ source_encoding = _source_encoding_py2
+
+
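With the dispatch above, source_encoding() takes the program text as bytes on either major Python version and reports the PEP 263 encoding. A hedged sketch (import path assumed):

    from coverage.phystokens import source_encoding

    print(source_encoding(b"# -*- coding: utf-8 -*-\nx = 1\n"))  # 'utf-8'
    # Without a cookie or BOM, the default is returned: 'ascii' from the
    # Python 2 path above, 'utf-8' from tokenize.detect_encoding() on Python 3.
    print(source_encoding(b"x = 1\n"))
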
+@contract(source='unicode')
+def compile_unicode(source, filename, mode):
+ """Just like the `compile` builtin, but works on any Unicode string.
+
+ Python 2's compile() builtin has a stupid restriction: if the source string
+ is Unicode, then it may not have an encoding declaration in it. Why not?
+ Who knows!
+
+ This function catches that exception, neuters the coding declaration, and
+ compiles it anyway.
+
+ """
+ try:
+ code = compile(source, filename, mode)
+ except SyntaxError as synerr:
+ if "coding declaration in unicode string" not in synerr.args[0].lower():
+ raise
+ source = neuter_encoding_declaration(source)
+ code = compile(source, filename, mode)
+
+ return code
+
+
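A hedged usage sketch of compile_unicode() (import path assumed): a Unicode source that carries a coding declaration compiles on both Python versions, because the Python 2 SyntaxError is caught and the declaration neutered.

    from coverage.phystokens import compile_unicode

    src = u"# coding: utf-8\nx = 1\n"
    code = compile_unicode(src, "<string>", "exec")
    namespace = {}
    exec(code, namespace)
    assert namespace["x"] == 1
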
+@contract(source='unicode', returns='unicode')
+def neuter_encoding_declaration(source):
+ """Return `source`, with any encoding declaration neutered.
+
+ This function will only ever be called on `source` that has an encoding
+ declaration, so some edge cases can be ignored.
+
+ """
+ source = COOKIE_RE.sub("# (deleted declaration)", source)
+ return source
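
And a short sketch of the neutering itself (import path assumed): COOKIE_RE rewrites the coding cookie so compile() no longer objects.

    from coverage.phystokens import neuter_encoding_declaration

    print(neuter_encoding_declaration(u"# coding: utf-8\nx = 1\n"))
    # -> "# (deleted declaration)\nx = 1\n"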