Chromium Code Reviews

Unified Diff: third_party/pycoverage/coverage/phystokens.py

Issue 727003004: Add python coverage module to third_party (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 6 years, 1 month ago
Index: third_party/pycoverage/coverage/phystokens.py
diff --git a/third_party/pycoverage/coverage/phystokens.py b/third_party/pycoverage/coverage/phystokens.py
new file mode 100644
index 0000000000000000000000000000000000000000..99b1d5ba0c79771e43338cc8a37ce09e7085d7e2
--- /dev/null
+++ b/third_party/pycoverage/coverage/phystokens.py
@@ -0,0 +1,210 @@
+"""Better tokenizing for coverage.py."""
+
+import codecs, keyword, re, sys, token, tokenize
+from coverage.backward import set # pylint: disable=W0622
+from coverage.parser import generate_tokens
+
+
+def phys_tokens(toks):
+    """Return all physical tokens, even line continuations.
+
+    tokenize.generate_tokens() doesn't return a token for the backslash that
+    continues lines. This wrapper provides those tokens so that we can
+    re-create a faithful representation of the original source.
+
+    Returns the same values as generate_tokens()
+
+    """
+    last_line = None
+    last_lineno = -1
+    last_ttype = None
+    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
+        if last_lineno != elineno:
+            if last_line and last_line.endswith("\\\n"):
+                # We are at the beginning of a new line, and the last line
+                # ended with a backslash. We probably have to inject a
+                # backslash token into the stream. Unfortunately, there's more
+                # to figure out. This code::
+                #
+                #   usage = """\
+                #   HEY THERE
+                #   """
+                #
+                # triggers this condition, but the token text is::
+                #
+                #   '"""\\\nHEY THERE\n"""'
+                #
+                # so we need to figure out if the backslash is already in the
+                # string token or not.
+                inject_backslash = True
+                if last_ttype == tokenize.COMMENT:
+                    # Comments like this \
+                    # should never result in a new token.
+                    inject_backslash = False
+                elif ttype == token.STRING:
+                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
+                        # It's a multiline string and the first line ends with
+                        # a backslash, so we don't need to inject another.
+                        inject_backslash = False
+                if inject_backslash:
+                    # Figure out what column the backslash is in.
+                    ccol = len(last_line.split("\n")[-2]) - 1
+                    # Yield the token, with a fake token type.
+                    yield (
+                        99999, "\\\n",
+                        (slineno, ccol), (slineno, ccol+2),
+                        last_line
+                        )
+            last_line = ltext
+            last_ttype = ttype
+        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
+        last_lineno = elineno
+
+
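A minimal sketch, not part of the patch, of what phys_tokens() adds. It assumes coverage.parser.generate_tokens accepts source text directly, as this file's own tokgen = generate_tokens(source) call does::

    from coverage.parser import generate_tokens
    from coverage.phystokens import phys_tokens

    src = "a = 1 + \\\n    2\n"
    toks = list(phys_tokens(generate_tokens(src)))
    # tokenize never yields the continuation backslash itself; phys_tokens
    # injects it with the fake token type 99999.
    fake = [tok for tok in toks if tok[0] == 99999]
    assert fake[0][1] == "\\\n"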
+def source_token_lines(source):
+    """Generate a series of lines, one for each line in `source`.
+
+    Each line is a list of pairs, each pair is a token::
+
+        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]
+
+    Each pair has a token class, and the token text.
+
+    If you concatenate all the token texts, and then join them with newlines,
+    you should have your original `source` back, with two differences:
+    trailing whitespace is not preserved, and a final line with no newline
+    is indistinguishable from a final line with a newline.
+
+    """
+    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
+    line = []
+    col = 0
+    source = source.expandtabs(8).replace('\r\n', '\n')
+    tokgen = generate_tokens(source)
+    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
+        mark_start = True
+        for part in re.split('(\n)', ttext):
+            if part == '\n':
+                yield line
+                line = []
+                col = 0
+                mark_end = False
+            elif part == '':
+                mark_end = False
+            elif ttype in ws_tokens:
+                mark_end = False
+            else:
+                if mark_start and scol > col:
+                    line.append(("ws", " " * (scol - col)))
+                    mark_start = False
+                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
+                if ttype == token.NAME and keyword.iskeyword(ttext):
+                    tok_class = "key"
+                line.append((tok_class, part))
+                mark_end = True
+            scol = 0
+        if mark_end:
+            col = ecol
+
+    if line:
+        yield line
+
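A round-trip check, not part of the patch, of the ('class', 'text') pairs that source_token_lines() promises in its docstring::

    from coverage.phystokens import source_token_lines

    src = "def hello():\n    return 'hi'\n"
    lines = list(source_token_lines(src))
    # lines[0] == [('key', 'def'), ('ws', ' '), ('nam', 'hello'),
    #              ('op', '('), ('op', ')'), ('op', ':')]
    rebuilt = "\n".join("".join(text for _, text in line) for line in lines)
    assert rebuilt == src.rstrip("\n")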
+def source_encoding(source):
+    """Determine the encoding for `source` (a string), according to PEP 263.
+
+    Returns a string, the name of the encoding.
+
+    """
+    # Note: this function should never be called on Python 3, since py3 has
+    # built-in tools to do this.
+    assert sys.version_info < (3, 0)
+
+    # This is mostly code adapted from Py3.2's tokenize module.
+
+    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
+
+    # Do this so the detect_encode code we copied will work.
+    readline = iter(source.splitlines(True)).next
+
+    def _get_normal_name(orig_enc):
+        """Imitates get_normal_name in tokenizer.c."""
+        # Only care about the first 12 characters.
+        enc = orig_enc[:12].lower().replace("_", "-")
+        if re.match(r"^utf-8($|-)", enc):
+            return "utf-8"
+        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
+            return "iso-8859-1"
+        return orig_enc
+
+    # From detect_encode():
+    # It detects the encoding from the presence of a utf-8 bom or an encoding
+    # cookie as specified in pep-0263. If both a bom and a cookie are present,
+    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+    # 'utf-8-sig' is returned.
+
+    # If no encoding is specified, then the default will be returned. The
+    # default varied with version.
+
+    if sys.version_info <= (2, 4):
+        default = 'iso-8859-1'
+    else:
+        default = 'ascii'
+
+    bom_found = False
+    encoding = None
+
+    def read_or_stop():
+        """Get the next source line, or ''."""
+        try:
+            return readline()
+        except StopIteration:
+            return ''
+
+    def find_cookie(line):
+        """Find an encoding cookie in `line`."""
+        try:
+            line_string = line.decode('ascii')
+        except UnicodeDecodeError:
+            return None
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = _get_normal_name(matches[0])
+        try:
+            codec = codecs.lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found:
+            # codecs in 2.3 were raw tuples of functions, assume the best.
+            codec_name = getattr(codec, 'name', encoding)
+            if codec_name != 'utf-8':
+                # This behaviour mimics the Python interpreter
+                raise SyntaxError('encoding problem: utf-8')
+            encoding += '-sig'
+        return encoding
+
+    first = read_or_stop()
+    if first.startswith(codecs.BOM_UTF8):
+        bom_found = True
+        first = first[3:]
+        default = 'utf-8-sig'
+    if not first:
+        return default
+
+    encoding = find_cookie(first)
+    if encoding:
+        return encoding
+
+    second = read_or_stop()
+    if not second:
+        return default
+
+    encoding = find_cookie(second)
+    if encoding:
+        return encoding
+
+    return default
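A sketch, not part of the patch and Python 2 only (per the assert at the top of source_encoding()), of its three detection paths: an encoding cookie, no marker at all, and a UTF-8 BOM::

    import codecs
    from coverage.phystokens import source_encoding

    assert source_encoding("# -*- coding: iso-8859-1 -*-\nx = 1\n") == "iso-8859-1"
    assert source_encoding("x = 1\n") == "ascii"   # no cookie, no BOM: default
    assert source_encoding(codecs.BOM_UTF8 + "x = 1\n") == "utf-8-sig"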