Chromium Code Reviews

Side by Side Diff: third_party/pycoverage/coverage/parser.py

Issue 727003004: Add python coverage module to third_party (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 6 years ago
1 """Code parsing for Coverage."""
2
3 import dis, re, sys, token, tokenize
4
5 from coverage.backward import set, sorted, StringIO # pylint: disable=W0622
6 from coverage.backward import open_source, range # pylint: disable=W0622
7 from coverage.backward import reversed # pylint: disable=W0622
8 from coverage.backward import bytes_to_ints
9 from coverage.bytecode import ByteCodes, CodeObjects
10 from coverage.misc import nice_pair, expensive, join_regex
11 from coverage.misc import CoverageException, NoSource, NotPython
12
13
14 class CodeParser(object):
15 """Parse code to find executable lines, excluded lines, etc."""
16
17 def __init__(self, text=None, filename=None, exclude=None):
18 """
19 Source can be provided as `text`, the text itself, or `filename`, from
20 which the text will be read. Excluded lines are those that match
21 `exclude`, a regex.
22
23 """
24 assert text or filename, "CodeParser needs either text or filename"
25 self.filename = filename or "<code>"
26 self.text = text
27 if not self.text:
28 try:
29 sourcef = open_source(self.filename)
30 try:
31 self.text = sourcef.read()
32 finally:
33 sourcef.close()
34 except IOError:
35 _, err, _ = sys.exc_info()
36 raise NoSource(
37 "No source for code: '%s': %s" % (self.filename, err)
38 )
39
40 # Scrap the BOM if it exists.
41 if self.text and ord(self.text[0]) == 0xfeff:
42 self.text = self.text[1:]
43
44 self.exclude = exclude
45
46 self.show_tokens = False
47
48 # The text lines of the parsed code.
49 self.lines = self.text.split('\n')
50
51 # The line numbers of excluded lines of code.
52 self.excluded = set()
53
54 # The line numbers of docstring lines.
55 self.docstrings = set()
56
57 # The line numbers of class definitions.
58 self.classdefs = set()
59
60 # A dict mapping line numbers to (lo,hi) for multi-line statements.
61 self.multiline = {}
62
63 # The line numbers that start statements.
64 self.statement_starts = set()
65
66 # Lazily-created ByteParser
67 self._byte_parser = None
68
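A minimal construction sketch (not part of the reviewed file; "<demo>" and the sample text are invented, and it assumes this module is importable as coverage.parser): CodeParser takes either the source text itself or a filename to read it from.

    from coverage.parser import CodeParser

    source = "a = 1\nb = 2\n"
    parser = CodeParser(text=source, filename="<demo>")
    print(parser.lines)    # ['a = 1', 'b = 2', '']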
69 def _get_byte_parser(self):
70 """Create a ByteParser on demand."""
71 if not self._byte_parser:
72 self._byte_parser = \
73 ByteParser(text=self.text, filename=self.filename)
74 return self._byte_parser
75 byte_parser = property(_get_byte_parser)
76
77 def lines_matching(self, *regexes):
78 """Find the lines matching one of a list of regexes.
79
80 Returns a set of line numbers, the lines that contain a match for one
81 of the regexes in `regexes`. The entire line needn't match, just a
82 part of it.
83
84 """
85 regex_c = re.compile(join_regex(regexes))
86 matches = set()
87 for i, ltext in enumerate(self.lines):
88 if regex_c.search(ltext):
89 matches.add(i+1)
90 return matches
91
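A hedged sketch of lines_matching(), with CodeParser imported as above; only part of a line needs to match, and results are 1-based:

    src = "x = 1\ny = 2  # pragma: no cover\nz = 3\n"
    p = CodeParser(text=src, filename="<demo>")
    print(sorted(p.lines_matching(r"# pragma: no cover")))    # [2]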
92 def _raw_parse(self):
93 """Parse the source to find the interesting facts about its lines.
94
95 A handful of member fields are updated.
96
97 """
98 # Find lines which match an exclusion pattern.
99 if self.exclude:
100 self.excluded = self.lines_matching(self.exclude)
101
102 # Tokenize, to find excluded suites, to find docstrings, and to find
103 # multi-line statements.
104 indent = 0
105 exclude_indent = 0
106 excluding = False
107 prev_toktype = token.INDENT
108 first_line = None
109 empty = True
110
111 tokgen = generate_tokens(self.text)
112 for toktype, ttext, (slineno, _), (elineno, _), ltext in tokgen:
113 if self.show_tokens: # pragma: not covered
114 print("%10s %5s %-20r %r" % (
115 tokenize.tok_name.get(toktype, toktype),
116 nice_pair((slineno, elineno)), ttext, ltext
117 ))
118 if toktype == token.INDENT:
119 indent += 1
120 elif toktype == token.DEDENT:
121 indent -= 1
122 elif toktype == token.NAME and ttext == 'class':
123 # Class definitions look like branches in the byte code, so
124 # we need to exclude them. The simplest way is to note the
125 # lines with the 'class' keyword.
126 self.classdefs.add(slineno)
127 elif toktype == token.OP and ttext == ':':
128 if not excluding and elineno in self.excluded:
129 # Start excluding a suite. We trigger off of the colon
130 # token so that the #pragma comment will be recognized on
131 # the same line as the colon.
132 exclude_indent = indent
133 excluding = True
134 elif toktype == token.STRING and prev_toktype == token.INDENT:
135 # Strings that are first on an indented line are docstrings.
136 # (a trick from trace.py in the stdlib.) This works for
137 # 99.9999% of cases. For the rest (!) see:
138 # http://stackoverflow.com/questions/1769332/x/1769794#1769794
139 self.docstrings.update(range(slineno, elineno+1))
140 elif toktype == token.NEWLINE:
141 if first_line is not None and elineno != first_line:
142 # We're at the end of a line, and we've ended on a
143 # different line than the first line of the statement,
144 # so record a multi-line range.
145 rng = (first_line, elineno)
146 for l in range(first_line, elineno+1):
147 self.multiline[l] = rng
148 first_line = None
149
150 if ttext.strip() and toktype != tokenize.COMMENT:
151 # A non-whitespace token.
152 empty = False
153 if first_line is None:
154 # The token is not whitespace, and is the first in a
155 # statement.
156 first_line = slineno
157 # Check whether to end an excluded suite.
158 if excluding and indent <= exclude_indent:
159 excluding = False
160 if excluding:
161 self.excluded.add(elineno)
162
163 prev_toktype = toktype
164
165 # Find the starts of the executable statements.
166 if not empty:
167 self.statement_starts.update(self.byte_parser._find_statements())
168
169 def first_line(self, line):
170 """Return the first line number of the statement including `line`."""
171 rng = self.multiline.get(line)
172 if rng:
173 first_line = rng[0]
174 else:
175 first_line = line
176 return first_line
177
178 def first_lines(self, lines, *ignores):
179 """Map the line numbers in `lines` to the correct first line of the
180 statement.
181
182 Skip any line mentioned in any of the sequences in `ignores`.
183
184 Returns a set of the first lines.
185
186 """
187 ignore = set()
188 for ign in ignores:
189 ignore.update(ign)
190 lset = set()
191 for l in lines:
192 if l in ignore:
193 continue
194 new_l = self.first_line(l)
195 if new_l not in ignore:
196 lset.add(new_l)
197 return lset
198
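A hedged sketch of the multi-line mapping ("<demo>" invented as before); _raw_parse() must run first so that self.multiline is populated:

    src = "x = (1 +\n     2)\n"
    p = CodeParser(text=src, filename="<demo>")
    p._raw_parse()
    print(p.first_line(2))    # 1 -- line 2 continues the statement on line 1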
199 def parse_source(self):
200 """Parse source text to find executable lines, excluded lines, etc.
201
202 Return values are 1) a set of executable line numbers, and 2) a set of
203 excluded line numbers.
204
205 Reported line numbers are normalized to the first line of multi-line
206 statements.
207
208 """
209 try:
210 self._raw_parse()
211 except (tokenize.TokenError, IndentationError):
212 _, tokerr, _ = sys.exc_info()
213 msg, lineno = tokerr.args
214 raise NotPython(
215 "Couldn't parse '%s' as Python source: '%s' at %s" %
216 (self.filename, msg, lineno)
217 )
218
219 excluded_lines = self.first_lines(self.excluded)
220 lines = self.first_lines(
221 self.statement_starts,
222 excluded_lines,
223 self.docstrings
224 )
225
226 return lines, excluded_lines
227
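A hedged end-to-end sketch of parse_source(); the pragma text and "<demo>" are invented. Note that the whole excluded suite, not just the colon line, lands in the second set:

    src = (
        "a = 1\n"
        "if a:  # pragma: no cover\n"
        "    b = 2\n"
    )
    p = CodeParser(text=src, filename="<demo>",
                   exclude=r"# pragma: no cover")
    lines, excluded = p.parse_source()
    print(sorted(lines))       # [1]
    print(sorted(excluded))    # [2, 3]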
228 def arcs(self):
229 """Get information about the arcs available in the code.
230
231 Returns a sorted list of line number pairs. Line numbers have been
232 normalized to the first line of multiline statements.
233
234 """
235 all_arcs = []
236 for l1, l2 in self.byte_parser._all_arcs():
237 fl1 = self.first_line(l1)
238 fl2 = self.first_line(l2)
239 if fl1 != fl2:
240 all_arcs.append((fl1, fl2))
241 return sorted(all_arcs)
242 arcs = expensive(arcs)
243
244 def exit_counts(self):
245 """Get a mapping from line numbers to count of exits from that line.
246
247 Excluded lines are excluded.
248
249 """
250 excluded_lines = self.first_lines(self.excluded)
251 exit_counts = {}
252 for l1, l2 in self.arcs():
253 if l1 < 0:
254 # Don't ever report -1 as a line number
255 continue
256 if l1 in excluded_lines:
257 # Don't report excluded lines as line numbers.
258 continue
259 if l2 in excluded_lines:
260 # Arcs to excluded lines shouldn't count.
261 continue
262 if l1 not in exit_counts:
263 exit_counts[l1] = 0
264 exit_counts[l1] += 1
265
266 # Class definitions have one extra exit, so remove one for each:
267 for l in self.classdefs:
268 # Ensure key is there: classdefs can include excluded lines.
269 if l in exit_counts:
270 exit_counts[l] -= 1
271
272 return exit_counts
273 exit_counts = expensive(exit_counts)
274
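A hedged sketch of exit_counts() on a small branch. The source is only compiled, never run, so the undefined name x is harmless; exact bytecode differs across Python versions, so the counts shown are indicative:

    src = (
        "if x:\n"
        "    y = 1\n"
        "else:\n"
        "    y = 2\n"
    )
    p = CodeParser(text=src, filename="<demo>")
    print(p.exit_counts())    # e.g. {1: 2, 2: 1, 4: 1} -- line 1 branches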
275
276 ## Opcodes that guide the ByteParser.
277
278 def _opcode(name):
279 """Return the opcode by name from the dis module."""
280 return dis.opmap[name]
281
282 def _opcode_set(*names):
283 """Return a set of opcodes by the names in `names`."""
284 s = set()
285 for name in names:
286 try:
287 s.add(_opcode(name))
288 except KeyError:
289 pass
290 return s
291
292 # Opcodes that leave the code object.
293 OPS_CODE_END = _opcode_set('RETURN_VALUE')
294
295 # Opcodes that unconditionally end the code chunk.
296 OPS_CHUNK_END = _opcode_set(
297 'JUMP_ABSOLUTE', 'JUMP_FORWARD', 'RETURN_VALUE', 'RAISE_VARARGS',
298 'BREAK_LOOP', 'CONTINUE_LOOP',
299 )
300
301 # Opcodes that unconditionally begin a new code chunk. By starting new chunks
302 # with unconditional jump instructions, we neatly deal with jumps to jumps
303 # properly.
304 OPS_CHUNK_BEGIN = _opcode_set('JUMP_ABSOLUTE', 'JUMP_FORWARD')
305
306 # Opcodes that push a block on the block stack.
307 OPS_PUSH_BLOCK = _opcode_set(
308 'SETUP_LOOP', 'SETUP_EXCEPT', 'SETUP_FINALLY', 'SETUP_WITH'
309 )
310
311 # Block types for exception handling.
312 OPS_EXCEPT_BLOCKS = _opcode_set('SETUP_EXCEPT', 'SETUP_FINALLY')
313
314 # Opcodes that pop a block from the block stack.
315 OPS_POP_BLOCK = _opcode_set('POP_BLOCK')
316
317 # Opcodes that have a jump destination, but aren't really a jump.
318 OPS_NO_JUMP = OPS_PUSH_BLOCK
319
320 # Individual opcodes we need below.
321 OP_BREAK_LOOP = _opcode('BREAK_LOOP')
322 OP_END_FINALLY = _opcode('END_FINALLY')
323 OP_COMPARE_OP = _opcode('COMPARE_OP')
324 COMPARE_EXCEPTION = 10 # just have to get this const from the code.
325 OP_LOAD_CONST = _opcode('LOAD_CONST')
326 OP_RETURN_VALUE = _opcode('RETURN_VALUE')
327
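A hedged sketch of why _opcode_set() swallows KeyError: opcode names vary by Python version, so unknown names are simply skipped ('NO_SUCH_OPCODE' is deliberately fake):

    print(_opcode_set('JUMP_FORWARD', 'NO_SUCH_OPCODE') ==
          _opcode_set('JUMP_FORWARD'))    # True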
328
329 class ByteParser(object):
330 """Parse byte codes to understand the structure of code."""
331
332 def __init__(self, code=None, text=None, filename=None):
333 if code:
334 self.code = code
335 self.text = text
336 else:
337 if not text:
338 assert filename, "If no code or text, need a filename"
339 sourcef = open_source(filename)
340 try:
341 text = sourcef.read()
342 finally:
343 sourcef.close()
344 self.text = text
345
346 try:
347 # Python 2.3 and 2.4 don't like partial last lines, so be sure
348 # the text ends nicely for them.
349 self.code = compile(text + '\n', filename, "exec")
350 except SyntaxError:
351 _, synerr, _ = sys.exc_info()
352 raise NotPython(
353 "Couldn't parse '%s' as Python source: '%s' at line %d" %
354 (filename, synerr.msg, synerr.lineno)
355 )
356
357 # Alternative Python implementations don't always provide all the
358 # attributes on code objects that we need to do the analysis.
359 for attr in ['co_lnotab', 'co_firstlineno', 'co_consts', 'co_code']:
360 if not hasattr(self.code, attr):
361 raise CoverageException(
362 "This implementation of Python doesn't support code "
363 "analysis.\n"
364 "Run coverage.py under CPython for this command."
365 )
366
367 def child_parsers(self):
368 """Iterate over all the code objects nested within this one.
369
370 The iteration includes `self` as its first value.
371
372 """
373 children = CodeObjects(self.code)
374 return [ByteParser(code=c, text=self.text) for c in children]
375
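A hedged sketch of child_parsers(), with invented source; the module's own code object is included, so a single def yields two parsers:

    bp = ByteParser(text="def f():\n    return 1\n", filename="<demo>")
    print(len(bp.child_parsers()))    # 2 -- the module and f()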
376 def _bytes_lines(self):
377 """Map byte offsets to line numbers in `code`.
378
379 Uses co_lnotab described in Python/compile.c to map byte offsets to
380 line numbers. Produces a sequence: (b0, l0), (b1, l1), ...
381
382 Only byte offsets that correspond to line numbers are included in the
383 results.
384
385 """
386 # Adapted from dis.py in the standard library.
387 byte_increments = bytes_to_ints(self.code.co_lnotab[0::2])
388 line_increments = bytes_to_ints(self.code.co_lnotab[1::2])
389
390 last_line_num = None
391 line_num = self.code.co_firstlineno
392 byte_num = 0
393 for byte_incr, line_incr in zip(byte_increments, line_increments):
394 if byte_incr:
395 if line_num != last_line_num:
396 yield (byte_num, line_num)
397 last_line_num = line_num
398 byte_num += byte_incr
399 line_num += line_incr
400 if line_num != last_line_num:
401 yield (byte_num, line_num)
402
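For comparison, a hedged sketch using the stdlib's dis.findlinestarts(), which performs the same (byte offset, line number) decoding that this method adapts; "sample" is invented and the printed values vary with Python version and file position:

    import dis

    def sample():
        a = 1
        return a

    print(list(dis.findlinestarts(sample.__code__)))
    # e.g. [(0, 2), (6, 3)]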
403 def _find_statements(self):
404 """Find the statements in `self.code`.
405
406 Produce a sequence of line numbers that start statements. Recurses
407 into all code objects reachable from `self.code`.
408
409 """
410 for bp in self.child_parsers():
411 # Get all of the lineno information from this code.
412 for _, l in bp._bytes_lines():
413 yield l
414
415 def _block_stack_repr(self, block_stack):
416 """Get a string version of `block_stack`, for debugging."""
417 blocks = ", ".join(
418 ["(%s, %r)" % (dis.opname[b[0]], b[1]) for b in block_stack]
419 )
420 return "[" + blocks + "]"
421
422 def _split_into_chunks(self):
423 """Split the code object into a list of `Chunk` objects.
424
425 Each chunk is only entered at its first instruction, though there can
426 be many exits from a chunk.
427
428 Returns a list of `Chunk` objects.
429
430 """
431 # The list of chunks so far, and the one we're working on.
432 chunks = []
433 chunk = None
434
435 # A dict mapping byte offsets of line starts to the line numbers.
436 bytes_lines_map = dict(self._bytes_lines())
437
438 # The block stack: loops and try blocks get pushed here for the
439 # implicit jumps that can occur.
440 # Each entry is a tuple: (block type, destination)
441 block_stack = []
442
443 # Some op codes are followed by branches that should be ignored. This
444 # is a count of how many ignores are left.
445 ignore_branch = 0
446
447 # We have to handle the last two bytecodes specially.
448 ult = penult = None
449
450 # Get a set of all of the jump-to points.
451 jump_to = set()
452 bytecodes = list(ByteCodes(self.code.co_code))
453 for bc in bytecodes:
454 if bc.jump_to >= 0:
455 jump_to.add(bc.jump_to)
456
457 chunk_lineno = 0
458
459 # Walk the byte codes building chunks.
460 for bc in bytecodes:
461 # Maybe have to start a new chunk
462 start_new_chunk = False
463 first_chunk = False
464 if bc.offset in bytes_lines_map:
465 # Start a new chunk for each source line number.
466 start_new_chunk = True
467 chunk_lineno = bytes_lines_map[bc.offset]
468 first_chunk = True
469 elif bc.offset in jump_to:
470 # To make chunks have a single entrance, we have to make a new
471 # chunk when we get to a place some bytecode jumps to.
472 start_new_chunk = True
473 elif bc.op in OPS_CHUNK_BEGIN:
474 # Jumps deserve their own unnumbered chunk. This fixes
475 # problems with jumps to jumps getting confused.
476 start_new_chunk = True
477
478 if not chunk or start_new_chunk:
479 if chunk:
480 chunk.exits.add(bc.offset)
481 chunk = Chunk(bc.offset, chunk_lineno, first_chunk)
482 chunks.append(chunk)
483
484 # Look at the opcode
485 if bc.jump_to >= 0 and bc.op not in OPS_NO_JUMP:
486 if ignore_branch:
487 # Someone earlier wanted us to ignore this branch.
488 ignore_branch -= 1
489 else:
490 # The opcode has a jump, it's an exit for this chunk.
491 chunk.exits.add(bc.jump_to)
492
493 if bc.op in OPS_CODE_END:
494 # The opcode can exit the code object.
495 chunk.exits.add(-self.code.co_firstlineno)
496 if bc.op in OPS_PUSH_BLOCK:
497 # The opcode adds a block to the block_stack.
498 block_stack.append((bc.op, bc.jump_to))
499 if bc.op in OPS_POP_BLOCK:
500 # The opcode pops a block from the block stack.
501 block_stack.pop()
502 if bc.op in OPS_CHUNK_END:
503 # This opcode forces the end of the chunk.
504 if bc.op == OP_BREAK_LOOP:
505 # A break is implicit: jump where the top of the
506 # block_stack points.
507 chunk.exits.add(block_stack[-1][1])
508 chunk = None
509 if bc.op == OP_END_FINALLY:
510 # For the finally clause we need to find the closest exception
511 # block, and use its jump target as an exit.
512 for block in reversed(block_stack):
513 if block[0] in OPS_EXCEPT_BLOCKS:
514 chunk.exits.add(block[1])
515 break
516 if bc.op == OP_COMPARE_OP and bc.arg == COMPARE_EXCEPTION:
517 # This is an except clause. We want to overlook the next
518 # branch, so that except's don't count as branches.
519 ignore_branch += 1
520
521 penult = ult
522 ult = bc
523
524 if chunks:
525 # The last two bytecodes could be a dummy "return None" that
526 # shouldn't be counted as real code. Every Python code object seems
527 # to end with a return, and a "return None" is inserted if there
528 # isn't an explicit return in the source.
529 if ult and penult:
530 if penult.op == OP_LOAD_CONST and ult.op == OP_RETURN_VALUE:
531 if self.code.co_consts[penult.arg] is None:
532 # This is "return None", but is it dummy? A real line
533 # would be a last chunk all by itself.
534 if chunks[-1].byte != penult.offset:
535 ex = -self.code.co_firstlineno
536 # Split the last chunk
537 last_chunk = chunks[-1]
538 last_chunk.exits.remove(ex)
539 last_chunk.exits.add(penult.offset)
540 chunk = Chunk(
541 penult.offset, last_chunk.line, False
542 )
543 chunk.exits.add(ex)
544 chunks.append(chunk)
545
546 # Give all the chunks a length.
547 chunks[-1].length = bc.next_offset - chunks[-1].byte # pylint: disable=W0631,C0301
548 for i in range(len(chunks)-1):
549 chunks[i].length = chunks[i+1].byte - chunks[i].byte
550
551 #self.validate_chunks(chunks)
552 return chunks
553
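A hedged sketch of the chunk output; Chunk.__repr__ (defined below) prints <byte+length @line exits>, with "!" marking a line's first chunk, and the exact chunks depend on the interpreter's bytecode:

    bp = ByteParser(text="x = 1\nif x:\n    x = 2\n", filename="<demo>")
    for ch in bp._split_into_chunks():
        print(ch)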
554 def validate_chunks(self, chunks):
555 """Validate the rule that chunks have a single entrance."""
556 # starts is the entrances to the chunks
557 starts = set([ch.byte for ch in chunks])
558 for ch in chunks:
559 assert all([(ex in starts or ex < 0) for ex in ch.exits])
560
561 def _arcs(self):
562 """Find the executable arcs in the code.
563
564 Yields pairs: (from,to). From and to are integer line numbers. If
565 from is < 0, then the arc is an entrance into the code object. If to
566 is < 0, the arc is an exit from the code object.
567
568 """
569 chunks = self._split_into_chunks()
570
571 # A map from byte offsets to chunks jumped into.
572 byte_chunks = dict([(c.byte, c) for c in chunks])
573
574 # There's always an entrance at the first chunk.
575 yield (-1, byte_chunks[0].line)
576
577 # Traverse from the first chunk in each line, and yield arcs where
578 # the trace function will be invoked.
579 for chunk in chunks:
580 if not chunk.first:
581 continue
582
583 chunks_considered = set()
584 chunks_to_consider = [chunk]
585 while chunks_to_consider:
586 # Get the chunk we're considering, and make sure we don't
587 # consider it again
588 this_chunk = chunks_to_consider.pop()
589 chunks_considered.add(this_chunk)
590
591 # For each exit, add the line number if the trace function
592 # would be triggered, or add the chunk to those being
593 # considered if not.
594 for ex in this_chunk.exits:
595 if ex < 0:
596 yield (chunk.line, ex)
597 else:
598 next_chunk = byte_chunks[ex]
599 if next_chunk in chunks_considered:
600 continue
601
602 # The trace function is invoked if visiting the first
603 # bytecode in a line, or if the transition is a
604 # backward jump.
605 backward_jump = next_chunk.byte < this_chunk.byte
606 if next_chunk.first or backward_jump:
607 if next_chunk.line != chunk.line:
608 yield (chunk.line, next_chunk.line)
609 else:
610 chunks_to_consider.append(next_chunk)
611
612 def _all_chunks(self):
613 """Returns a list of `Chunk` objects for this code and its children.
614
615 See `_split_into_chunks` for details.
616
617 """
618 chunks = []
619 for bp in self.child_parsers():
620 chunks.extend(bp._split_into_chunks())
621
622 return chunks
623
624 def _all_arcs(self):
625 """Get the set of all arcs in this code object and its children.
626
627 See `_arcs` for details.
628
629 """
630 arcs = set()
631 for bp in self.child_parsers():
632 arcs.update(bp._arcs())
633
634 return arcs
635
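A hedged sketch of the arc convention: negative numbers mark entries into and exits from the code object, so a bare one-line module yields just an entrance arc and an exit arc:

    bp = ByteParser(text="x = 1\n", filename="<demo>")
    print(sorted(bp._all_arcs()))    # [(-1, 1), (1, -1)]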
636
637 class Chunk(object):
638 """A sequence of byte codes with a single entrance.
639
640 To analyze byte code, we have to divide it into chunks, sequences of byte
641 codes such that each chunk has only one entrance, the first instruction in
642 the block.
643
644 This is almost the CS concept of `basic block`_, except that we're willing
645 to have many exits from a chunk, and "basic block" is a more cumbersome
646 term.
647
648 .. _basic block: http://en.wikipedia.org/wiki/Basic_block
649
650 `line` is the source line number containing this chunk.
651
652 `first` is true if this is the first chunk in the source line.
653
654 An exit < 0 means the chunk can leave the code (return). The exit is
655 the negative of the starting line number of the code block.
656
657 """
658 def __init__(self, byte, line, first):
659 self.byte = byte
660 self.line = line
661 self.first = first
662 self.length = 0
663 self.exits = set()
664
665 def __repr__(self):
666 if self.first:
667 bang = "!"
668 else:
669 bang = ""
670 return "<%d+%d @%d%s %r>" % (
671 self.byte, self.length, self.line, bang, list(self.exits)
672 )
673
674
675 class CachedTokenizer(object):
676 """A one-element cache around tokenize.generate_tokens.
677
678 When reporting, coverage.py tokenizes files twice, once to find the
679 structure of the file, and once to syntax-color it. Tokenizing is
680 expensive, and easily cached.
681
682 This is a one-element cache so that our twice-in-a-row tokenizing doesn't
683 actually tokenize twice.
684
685 """
686 def __init__(self):
687 self.last_text = None
688 self.last_tokens = None
689
690 def generate_tokens(self, text):
691 """A stand-in for `tokenize.generate_tokens`."""
692 if text != self.last_text:
693 self.last_text = text
694 self.last_tokens = list(
695 tokenize.generate_tokens(StringIO(text).readline)
696 )
697 return self.last_tokens
698
699 # Create our generate_tokens cache as a callable replacement function.
700 generate_tokens = CachedTokenizer().generate_tokens
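A hedged sketch of the cache's effect: back-to-back calls with identical text return the very same token list object, not a copy:

    src = "x = 1\n"
    print(generate_tokens(src) is generate_tokens(src))    # True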