| Index: tools/ll_prof.py
|
| diff --git a/tools/ll_prof.py b/tools/ll_prof.py
|
| index 3afe179d2f8adbc2935135c4efa9cb2f899c92a4..216929d1e20171452e73082976778f8db4c603c6 100755
|
| --- a/tools/ll_prof.py
|
| +++ b/tools/ll_prof.py
|
| @@ -45,7 +45,7 @@ USAGE="""usage: %prog [OPTION]...
|
| Analyses V8 and perf logs to produce profiles.
|
|
|
| Perf logs can be collected using a command like:
|
| - $ perf record -R -e cycles -c 10000 -f -i ./shell bench.js --ll-prof
|
| + $ perf record -R -e cycles -c 10000 -f -i ./d8 bench.js --ll-prof
|
| # -R: collect all data
|
| # -e cycles: use cpu-cycles event (run "perf list" for details)
|
| # -c 10000: write a sample after each 10000 events
|
| @@ -54,6 +54,16 @@ Perf logs can be collected using a command like:
|
| # --ll-prof shell flag enables the right V8 logs
|
| This will produce a binary trace file (perf.data) that %prog can analyse.
|
|
|
| +IMPORTANT:
|
| + The kernel has an internal maximum for events per second, it is 100K by
|
| + default. That's not enough for "-c 10000". Set it to some higher value:
|
| + $ echo 10000000 | sudo tee /proc/sys/kernel/perf_event_max_sample_rate
|
| + You can also make the warning about kernel address maps go away:
|
| + $ echo 0 | sudo tee /proc/sys/kernel/kptr_restrict
|
| +
|
| +We have a convenience script that handles all of the above for you:
|
| + $ tools/run-llprof.sh ./d8 bench.js
|
| +
|
| Examples:
|
| # Print flat profile with annotated disassembly for the 10 top
|
| # symbols. Use default log names and include the snapshot log.
|
| @@ -75,6 +85,10 @@ class Code(object):
|
| """Code object."""
|
|
|
| _id = 0
|
| + UNKNOWN = 0
|
| + V8INTERNAL = 1
|
| + FULL_CODEGEN = 2
|
| + OPTIMIZED = 3
|
|
|
| def __init__(self, name, start_address, end_address, origin, origin_offset):
|
| self.id = Code._id
|
| @@ -88,6 +102,14 @@ class Code(object):
|
| self.self_ticks = 0
|
| self.self_ticks_map = None
|
| self.callee_ticks = None
|
| + if name.startswith("LazyCompile:*"):
|
| + self.codetype = Code.OPTIMIZED
|
| + elif name.startswith("LazyCompile:"):
|
| + self.codetype = Code.FULL_CODEGEN
|
| + elif name.startswith("v8::internal::"):
|
| + self.codetype = Code.V8INTERNAL
|
| + else:
|
| + self.codetype = Code.UNKNOWN
|
|
|
| def AddName(self, name):
|
| assert self.name != name
|
| @@ -185,7 +207,7 @@ class Code(object):
|
| class CodePage(object):
|
| """Group of adjacent code objects."""
|
|
|
| - SHIFT = 12 # 4K pages
|
| + SHIFT = 20 # 1M pages
|
| SIZE = (1 << SHIFT)
|
| MASK = ~(SIZE - 1)
|
|
|
| @@ -507,6 +529,7 @@ class Descriptor(object):
|
| # for the gory details.
|
|
|
|
|
| +# Reference: struct perf_file_header in kernel/tools/perf/util/header.h
|
| TRACE_HEADER_DESC = Descriptor([
|
| ("magic", "u64"),
|
| ("size", "u64"),
|
| @@ -520,6 +543,7 @@ TRACE_HEADER_DESC = Descriptor([
|
| ])
|
|
|
|
|
| +# Reference: /usr/include/linux/perf_event.h
|
| PERF_EVENT_ATTR_DESC = Descriptor([
|
| ("type", "u32"),
|
| ("size", "u32"),
|
| @@ -529,12 +553,13 @@ PERF_EVENT_ATTR_DESC = Descriptor([
|
| ("read_format", "u64"),
|
| ("flags", "u64"),
|
| ("wakeup_events_or_watermark", "u32"),
|
| - ("bt_type", "u32"),
|
| + ("bp_type", "u32"),
|
| ("bp_addr", "u64"),
|
| - ("bp_len", "u64"),
|
| + ("bp_len", "u64")
|
| ])
|
|
|
|
|
| +# Reference: /usr/include/linux/perf_event.h
|
| PERF_EVENT_HEADER_DESC = Descriptor([
|
| ("type", "u32"),
|
| ("misc", "u16"),
|
| @@ -542,6 +567,7 @@ PERF_EVENT_HEADER_DESC = Descriptor([
|
| ])
|
|
|
|
|
| +# Reference: kernel/events/core.c
|
| PERF_MMAP_EVENT_BODY_DESC = Descriptor([
|
| ("pid", "u32"),
|
| ("tid", "u32"),
|
| @@ -566,6 +592,7 @@ PERF_SAMPLE_STREAM_ID = 1 << 9
|
| PERF_SAMPLE_RAW = 1 << 10
|
|
|
|
|
| +# Reference: /usr/include/perf_event.h, the comment for PERF_RECORD_SAMPLE.
|
| PERF_SAMPLE_EVENT_BODY_FIELDS = [
|
| ("ip", "u64", PERF_SAMPLE_IP),
|
| ("pid", "u32", PERF_SAMPLE_TID),
|
| @@ -702,8 +729,12 @@ class LibraryRepo(object):
|
| # Unfortunately, section headers span two lines, so we have to
|
| # keep the just seen section name (from the first line in each
|
| # section header) in the after_section variable.
|
| + if mmap_info.filename.endswith(".ko"):
|
| + dynamic_symbols = ""
|
| + else:
|
| + dynamic_symbols = "-T"
|
| process = subprocess.Popen(
|
| - "%s -h -t -T -C %s" % (OBJDUMP_BIN, mmap_info.filename),
|
| + "%s -h -t %s -C %s" % (OBJDUMP_BIN, dynamic_symbols, mmap_info.filename),
|
| shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
| pipe = process.stdout
|
| after_section = None
|
| @@ -795,7 +826,7 @@ def PrintReport(code_map, library_repo, arch, ticks, options):
|
| code.PrintAnnotated(arch, options)
|
| print
|
| print "Ticks per library:"
|
| - mmap_infos = [m for m in library_repo.infos]
|
| + mmap_infos = [m for m in library_repo.infos if m.ticks > 0]
|
| mmap_infos.sort(key=lambda m: m.ticks, reverse=True)
|
| for mmap_info in mmap_infos:
|
| mmap_ticks = mmap_info.ticks
|
| @@ -885,6 +916,9 @@ if __name__ == "__main__":
|
| ticks = 0
|
| missed_ticks = 0
|
| really_missed_ticks = 0
|
| + optimized_ticks = 0
|
| + generated_ticks = 0
|
| + v8_internal_ticks = 0
|
| mmap_time = 0
|
| sample_time = 0
|
|
|
| @@ -928,6 +962,12 @@ if __name__ == "__main__":
|
| code = code_map.Find(sample.ip)
|
| if code:
|
| code.Tick(sample.ip)
|
| + if code.codetype == Code.OPTIMIZED:
|
| + optimized_ticks += 1
|
| + elif code.codetype == Code.FULL_CODEGEN:
|
| + generated_ticks += 1
|
| + elif code.codetype == Code.V8INTERNAL:
|
| + v8_internal_ticks += 1
|
| else:
|
| missed_ticks += 1
|
| if not library_repo.Tick(sample.ip) and not code:
|
| @@ -947,12 +987,21 @@ if __name__ == "__main__":
|
| PrintReport(code_map, library_repo, log_reader.arch, ticks, options)
|
|
|
| if not options.quiet:
|
| + def PrintTicks(number, total, description):
|
| + print("%10d %5.1f%% ticks in %s" %
|
| + (number, 100.0*number/total, description))
|
| print
|
| print "Stats:"
|
| print "%10d total trace events" % events
|
| print "%10d total ticks" % ticks
|
| print "%10d ticks not in symbols" % missed_ticks
|
| - print "%10d unaccounted ticks" % really_missed_ticks
|
| + unaccounted = "unaccounted ticks"
|
| + if really_missed_ticks > 0:
|
| + unaccounted += " (probably in the kernel, try --kernel)"
|
| + PrintTicks(really_missed_ticks, ticks, unaccounted)
|
| + PrintTicks(optimized_ticks, ticks, "ticks in optimized code")
|
| + PrintTicks(generated_ticks, ticks, "ticks in other lazily compiled code")
|
| + PrintTicks(v8_internal_ticks, ticks, "ticks in v8::internal::*")
|
| print "%10d total symbols" % len([c for c in code_map.AllCode()])
|
| print "%10d used symbols" % len([c for c in code_map.UsedCode()])
|
| print "%9.2fs library processing time" % mmap_time
|
|
|