Chromium Code Reviews

Unified Diff: third_party/pycoverage/coverage/phystokens.py

Issue 727003004: Add python coverage module to third_party (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 6 years, 1 month ago
Index: third_party/pycoverage/coverage/phystokens.py
diff --git a/third_party/pycoverage/coverage/phystokens.py b/third_party/pycoverage/coverage/phystokens.py
new file mode 100644
index 0000000000000000000000000000000000000000..99b1d5ba0c79771e43338cc8a37ce09e7085d7e2
--- /dev/null
+++ b/third_party/pycoverage/coverage/phystokens.py
@@ -0,0 +1,210 @@
+"""Better tokenizing for coverage.py."""
+
+import codecs, keyword, re, sys, token, tokenize
+from coverage.backward import set # pylint: disable=W0622
+from coverage.parser import generate_tokens
+
+
+def phys_tokens(toks):
+    """Return all physical tokens, even line continuations.
+
+    tokenize.generate_tokens() doesn't return a token for the backslash that
+    continues lines. This wrapper provides those tokens so that we can
+    re-create a faithful representation of the original source.
+
+    Returns the same values as generate_tokens()
+
+    """
+    last_line = None
+    last_lineno = -1
+    last_ttype = None
+    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
+        if last_lineno != elineno:
+            if last_line and last_line.endswith("\\\n"):
+                # We are at the beginning of a new line, and the last line
+                # ended with a backslash. We probably have to inject a
+                # backslash token into the stream. Unfortunately, there's more
+                # to figure out. This code::
+                #
+                #   usage = """\
+                #   HEY THERE
+                #   """
+                #
+                # triggers this condition, but the token text is::
+                #
+                #   '"""\\\nHEY THERE\n"""'
+                #
+                # so we need to figure out if the backslash is already in the
+                # string token or not.
+                inject_backslash = True
+                if last_ttype == tokenize.COMMENT:
+                    # Comments like this \
+                    # should never result in a new token.
+                    inject_backslash = False
+                elif ttype == token.STRING:
+                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
+                        # It's a multiline string and the first line ends with
+                        # a backslash, so we don't need to inject another.
+                        inject_backslash = False
+                if inject_backslash:
+                    # Figure out what column the backslash is in.
+                    ccol = len(last_line.split("\n")[-2]) - 1
+                    # Yield the token, with a fake token type.
+                    yield (
+                        99999, "\\\n",
+                        (slineno, ccol), (slineno, ccol+2),
+                        last_line
+                        )
+            last_line = ltext
+            last_ttype = ttype
+        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
+        last_lineno = elineno
+
+
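A minimal sketch, not part of the patch, of what phys_tokens() adds. It assumes coverage.parser.generate_tokens accepts source text directly, as this file's own tokgen = generate_tokens(source) call does::

    from coverage.parser import generate_tokens
    from coverage.phystokens import phys_tokens

    src = "a = 1 + \\\n    2\n"
    toks = list(phys_tokens(generate_tokens(src)))
    # tokenize never yields the continuation backslash itself; phys_tokens
    # injects it with the fake token type 99999.
    fake = [tok for tok in toks if tok[0] == 99999]
    assert fake[0][1] == "\\\n"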
+def source_token_lines(source):
+    """Generate a series of lines, one for each line in `source`.
+
+    Each line is a list of pairs, each pair is a token::
+
+        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]
+
+    Each pair has a token class, and the token text.
+
+    If you concatenate all the token texts, and then join them with newlines,
+    you should have your original `source` back, with two differences:
+    trailing whitespace is not preserved, and a final line with no newline
+    is indistinguishable from a final line with a newline.
+
+    """
+    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
+    line = []
+    col = 0
+    source = source.expandtabs(8).replace('\r\n', '\n')
+    tokgen = generate_tokens(source)
+    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
+        mark_start = True
+        for part in re.split('(\n)', ttext):
+            if part == '\n':
+                yield line
+                line = []
+                col = 0
+                mark_end = False
+            elif part == '':
+                mark_end = False
+            elif ttype in ws_tokens:
+                mark_end = False
+            else:
+                if mark_start and scol > col:
+                    line.append(("ws", " " * (scol - col)))
+                    mark_start = False
+                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
+                if ttype == token.NAME and keyword.iskeyword(ttext):
+                    tok_class = "key"
+                line.append((tok_class, part))
+                mark_end = True
+            scol = 0
+        if mark_end:
+            col = ecol
+
+    if line:
+        yield line
+
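A round-trip check, not part of the patch, of the ('class', 'text') pairs that source_token_lines() promises in its docstring::

    from coverage.phystokens import source_token_lines

    src = "def hello():\n    return 'hi'\n"
    lines = list(source_token_lines(src))
    # lines[0] == [('key', 'def'), ('ws', ' '), ('nam', 'hello'),
    #              ('op', '('), ('op', ')'), ('op', ':')]
    rebuilt = "\n".join("".join(text for _, text in line) for line in lines)
    assert rebuilt == src.rstrip("\n")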
+def source_encoding(source):
+    """Determine the encoding for `source` (a string), according to PEP 263.
+
+    Returns a string, the name of the encoding.
+
+    """
+    # Note: this function should never be called on Python 3, since py3 has
+    # built-in tools to do this.
+    assert sys.version_info < (3, 0)
+
+    # This is mostly code adapted from Py3.2's tokenize module.
+
+    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
+
+    # Do this so the detect_encode code we copied will work.
+    readline = iter(source.splitlines(True)).next
+
+    def _get_normal_name(orig_enc):
+        """Imitates get_normal_name in tokenizer.c."""
+        # Only care about the first 12 characters.
+        enc = orig_enc[:12].lower().replace("_", "-")
+        if re.match(r"^utf-8($|-)", enc):
+            return "utf-8"
+        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
+            return "iso-8859-1"
+        return orig_enc
+
+    # From detect_encode():
+    # It detects the encoding from the presence of a utf-8 bom or an encoding
+    # cookie as specified in pep-0263. If both a bom and a cookie are present,
+    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+    # 'utf-8-sig' is returned.
+
+    # If no encoding is specified, then the default will be returned. The
+    # default varied with version.
+
+    if sys.version_info <= (2, 4):
+        default = 'iso-8859-1'
+    else:
+        default = 'ascii'
+
+    bom_found = False
+    encoding = None
+
+    def read_or_stop():
+        """Get the next source line, or ''."""
+        try:
+            return readline()
+        except StopIteration:
+            return ''
+
+    def find_cookie(line):
+        """Find an encoding cookie in `line`."""
+        try:
+            line_string = line.decode('ascii')
+        except UnicodeDecodeError:
+            return None
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = _get_normal_name(matches[0])
+        try:
+            codec = codecs.lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found:
+            # codecs in 2.3 were raw tuples of functions, assume the best.
+            codec_name = getattr(codec, 'name', encoding)
+            if codec_name != 'utf-8':
+                # This behaviour mimics the Python interpreter
+                raise SyntaxError('encoding problem: utf-8')
+            encoding += '-sig'
+        return encoding
+
+    first = read_or_stop()
+    if first.startswith(codecs.BOM_UTF8):
+        bom_found = True
+        first = first[3:]
+        default = 'utf-8-sig'
+    if not first:
+        return default
+
+    encoding = find_cookie(first)
+    if encoding:
+        return encoding
+
+    second = read_or_stop()
+    if not second:
+        return default
+
+    encoding = find_cookie(second)
+    if encoding:
+        return encoding
+
+    return default
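A sketch, not part of the patch and Python 2 only (per the assert at the top of source_encoding()), of its three detection paths: an encoding cookie, no marker at all, and a UTF-8 BOM::

    import codecs
    from coverage.phystokens import source_encoding

    assert source_encoding("# -*- coding: iso-8859-1 -*-\nx = 1\n") == "iso-8859-1"
    assert source_encoding("x = 1\n") == "ascii"   # no cookie, no BOM: default
    assert source_encoding(codecs.BOM_UTF8 + "x = 1\n") == "utf-8-sig"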