Chromium Code Reviews| Index: base/profiler/native_stack_sampler_mac.cc |
| diff --git a/base/profiler/native_stack_sampler_mac.cc b/base/profiler/native_stack_sampler_mac.cc |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..9bc20c28764e45ba87fffd223b56674e094ddd2d |
| --- /dev/null |
| +++ b/base/profiler/native_stack_sampler_mac.cc |
| @@ -0,0 +1,501 @@ |
| +// Copyright 2017 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "base/profiler/native_stack_sampler.h" |
| + |
| +#include <dlfcn.h> |
| +#include <libkern/OSByteOrder.h> |
| +#include <libunwind.h> |
| +#include <mach-o/swap.h> |
| +#include <mach/kern_return.h> |
| +#include <mach/mach.h> |
| +#include <mach/thread_act.h> |
| +#include <pthread.h> |
| +#include <sys/syslimits.h> |
| + |
| +#include <map> |
| +#include <memory> |
| + |
| +#include "base/logging.h" |
| +#include "base/macros.h" |
| +#include "base/memory/ptr_util.h" |
| +#include "base/strings/string_number_conversions.h" |
| + |
| +namespace base { |
| + |
| +namespace { |
| + |
| +// Stack walking -------------------------------------------------------------- |
| + |
// Copy of x86_64 thread context structure from x86_thread_state64_t type.
// Copied struct since fields can have different names on different versions of
// Darwin.
//
// NOTE: WalkStack() memcpy()s the first 17 fields (rax through rip) directly
// into a unw_context_t, so the order and width of these fields must not
// change.
struct ThreadContext {
  uint64_t rax;
  uint64_t rbx;
  uint64_t rcx;
  uint64_t rdx;
  uint64_t rdi;
  uint64_t rsi;
  uint64_t rbp;  // Frame pointer; rewritten to point into the stack copy.
  uint64_t rsp;  // Stack pointer; rewritten to point into the stack copy.
  uint64_t r8;
  uint64_t r9;
  uint64_t r10;
  uint64_t r11;
  uint64_t r12;
  uint64_t r13;
  uint64_t r14;
  uint64_t r15;
  uint64_t rip;  // Instruction pointer; last field consumed by the memcpy.
  uint64_t rflags;
  uint64_t cs;
  uint64_t fs;
  uint64_t gs;
};
| + |
| +// Fills |state| with |target_thread|'s context. |
|
Mike Wittman
2017/02/16 21:51:34
Should we have deadlock warnings analogous to the ones in the Windows implementation?
Avi (use Gerrit)
2017/02/17 03:41:09
Done.
|
| +bool GetThreadContext(thread_act_t target_thread, ThreadContext* state) { |
| + mach_msg_type_number_t count = |
| + static_cast<mach_msg_type_number_t>(MACHINE_THREAD_STATE_COUNT); |
| + return thread_get_state(target_thread, x86_THREAD_STATE64, |
| + reinterpret_cast<thread_state_t>(state), |
| + &count) == KERN_SUCCESS; |
| +} |
| + |
// If |pointer| falls within [original_stack_bottom, original_stack_top),
// returns the corresponding address in the copied stack; otherwise returns
// |pointer| unchanged.
uint64_t RewritePointerIfInOriginalStack(uint64_t* original_stack_bottom,
                                         uint64_t* original_stack_top,
                                         uint64_t* stack_copy_bottom,
                                         uint64_t pointer) {
  // Work on the bounds as integers so the range test and offset arithmetic
  // below are plain unsigned comparisons.
  const uint64_t bottom = reinterpret_cast<uint64_t>(original_stack_bottom);
  const uint64_t top = reinterpret_cast<uint64_t>(original_stack_top);
  const uint64_t copy_bottom = reinterpret_cast<uint64_t>(stack_copy_bottom);

  // Preserve the same displacement from the bottom of the stack in the copy.
  if (pointer >= bottom && pointer < top)
    return copy_bottom + (pointer - bottom);

  // Not a pointer into the original stack; leave it untouched.
  return pointer;
}
| + |
| +void CopyStackAndRewritePointers(void* dest, |
| + void* from, |
| + void* to, |
| + ThreadContext* thread_context) |
| + NO_SANITIZE("address") { |
| + uint64_t* original_stack_bottom = static_cast<uint64_t*>(from); |
| + uint64_t* original_stack_top = static_cast<uint64_t*>(to); |
| + uint64_t* stack_copy_bottom = static_cast<uint64_t*>(dest); |
| + DCHECK_EQ( |
| + 0u, reinterpret_cast<uint64_t>(original_stack_bottom) % sizeof(uint64_t)); |
| + DCHECK_EQ(0u, |
| + reinterpret_cast<uint64_t>(original_stack_top) % sizeof(uint64_t)); |
| + DCHECK_EQ(0u, |
| + reinterpret_cast<uint64_t>(stack_copy_bottom) % sizeof(uint64_t)); |
| + |
| + size_t count = original_stack_top - original_stack_bottom; |
| + for (size_t pos = 0; pos < count; ++pos) { |
| + stack_copy_bottom[pos] = RewritePointerIfInOriginalStack( |
| + original_stack_bottom, original_stack_top, stack_copy_bottom, |
| + original_stack_bottom[pos]); |
| + } |
| + |
| + thread_context->rbp = |
| + RewritePointerIfInOriginalStack(original_stack_bottom, original_stack_top, |
| + stack_copy_bottom, thread_context->rbp); |
| + thread_context->rsp = |
| + RewritePointerIfInOriginalStack(original_stack_bottom, original_stack_top, |
| + stack_copy_bottom, thread_context->rsp); |
| +} |
| + |
| +const char* LibSystemKernelName() { |
| + static char path[PATH_MAX]; |
| + static char* name = nullptr; |
| + if (name) |
| + return name; |
| + |
| + Dl_info info; |
| + dladdr(reinterpret_cast<void*>(_exit), &info); |
| + strncpy(path, info.dli_fname, PATH_MAX); |
| + name = path; |
| + DCHECK_EQ(std::string(name), |
| + std::string("/usr/lib/system/libsystem_kernel.dylib")); |
| + return name; |
| +} |
| + |
// Outcome of a single libunwind walk attempt (see WalkStackFromContext).
enum StackWalkResult : int {
  // unw_step() reported an error before reaching the outermost frame.
  ERROR = -1,
  // The walk reached the outermost frame normally.
  SUCCESS,
  // Exactly one frame was produced and its IP lies in the syscall library,
  // suggesting the walk stopped for lack of unwind info in a syscall stub.
  SYSCALL,
};
| + |
// Walks the stack represented by |unwind_context|, calling back to the
// provided lambda for each frame, starting from the innermost frame. Returns
// ERROR if unw_step() failed, SYSCALL if the walk stopped after a single
// frame inside the syscall library (see below), and SUCCESS otherwise.
template <typename StackFrameCallback>
StackWalkResult WalkStackFromContext(unw_context_t* unwind_context,
                                     const StackFrameCallback& callback) {
  unw_cursor_t unwind_cursor;
  unw_init_local(&unwind_cursor, unwind_context);

  int step_result;
  unw_word_t ip;
  size_t frames = 0;
  do {
    // Report the current frame's instruction pointer before stepping, so the
    // initial (innermost) frame is always delivered.
    ++frames;
    unw_get_reg(&unwind_cursor, UNW_REG_IP, &ip);

    callback(static_cast<uintptr_t>(ip));

    step_result = unw_step(&unwind_cursor);
  } while (step_result > 0);

  // unw_step() returns 0 at the outermost frame and a negative value on
  // error.
  if (step_result != 0)
    return StackWalkResult::ERROR;

  // A one-frame walk ending in the syscall library usually means libunwind
  // stopped for lack of unwind info in a kernel-bridging stub, not that the
  // stack was exhausted; signal the caller so it can retry after manually
  // popping a frame.
  Dl_info info;
  if (frames == 1 && dladdr(reinterpret_cast<void*>(ip), &info) != 0 &&
      strcmp(info.dli_fname, LibSystemKernelName()) == 0) {
    return StackWalkResult::SYSCALL;
  }

  return StackWalkResult::SUCCESS;
}
| + |
// Walks the stack represented by |thread_context|, calling back to the
// provided lambda for each frame.
template <typename StackFrameCallback>
void WalkStack(const ThreadContext& thread_context,
               const StackFrameCallback& callback) {
  // This uses libunwind to walk the stack. libunwind is designed to be used
  // for a thread to walk its own stack. This creates two problems.

  // Problem 1: There is no official way to create a unw_context other than to
  // create it from the current state of the current thread's stack. To get
  // around this, forge a context. A unw_context is just a copy of the register
  // file followed by the instruction pointer. Coincidentally, the first 17
  // items of the ThreadContext type are exactly that!
  unw_context_t unwind_context;
  memcpy(&unwind_context, &thread_context, sizeof(uint64_t) * 17);
  StackWalkResult result = WalkStackFromContext(&unwind_context, callback);

  if (result == StackWalkResult::SYSCALL) {
    // Problem 2: Because libunwind is designed to be triggered by user code on
    // their own thread, if it hits a library that has no unwind info for the
    // function that is being executed, it just stops. This isn't a problem in
    // the normal case, but in this case, it's quite possible that the stack
    // being walked is stopped in a function that bridges to the kernel and
    // thus is missing the unwind info.
    //
    // If so, cheat by manually unwinding one stack frame and trying again.
    // NOTE(review): data[7] and data[16] correspond to the rsp and rip slots
    // of the register dump forged above (fields 8 and 17 of ThreadContext);
    // this relies on unw_context_t remaining a plain register-file copy.
    unwind_context.data[7] = thread_context.rsp + 8;  // rsp++
    unwind_context.data[16] =
        *reinterpret_cast<uint64_t*>(thread_context.rsp);  // rip = *rsp
    WalkStackFromContext(&unwind_context, callback);
  }
}
| + |
| +// Module identifiers --------------------------------------------------------- |
| + |
| +// Helper that swaps byte order in |x| if |swap| flag is set. |
| +uint32_t SwapIfBig32(uint32_t x, bool swap) { |
| + if (swap) |
| + return OSSwapBigToHostInt32(x); |
| + return x; |
| +} |
| + |
| +// Returns the offset in bytes where the x86_64 header is located in a binary |
| +// loaded at |module_addr|. Returns 0 if |module_addr| is not a valid FAT |
| +// Mach-O binary or has not been built for x86_64. |
| +off_t GetMach64HeaderOffset(const void* module_addr) { |
| + const fat_header* header = reinterpret_cast<const fat_header*>(module_addr); |
| + if (header->magic != FAT_MAGIC && header->magic != FAT_CIGAM) |
| + return 0; |
| + |
| + // Search all FAT architectures for x86_64. |
| + const fat_arch* fat_arches = reinterpret_cast<const fat_arch*>( |
| + reinterpret_cast<const uint8_t*>(module_addr) + sizeof(header)); |
| + uint32_t n_arches = OSSwapBigToHostInt32(header->nfat_arch); |
| + for (uint32_t i = 0; i < n_arches; ++i) { |
| + const fat_arch& arch = fat_arches[i]; |
| + if (OSSwapBigToHostInt32(arch.cputype) == CPU_TYPE_X86_64) |
| + return OSSwapBigToHostInt32(arch.offset); |
| + } |
| + return 0; |
| +} |
| + |
| +// Returns true if the Mach-O binary at |module_addr| was built specifically for |
| +// the x86_64 CPU architecture. |
| +bool IsX64Header(const void* module_addr) { |
| + const mach_header_64* header = |
| + reinterpret_cast<const mach_header_64*>(module_addr); |
| + if (header->magic != MH_MAGIC_64 && header->magic != MH_CIGAM_64) |
| + return false; |
| + bool swap = header->magic == MH_CIGAM_64; |
| + return SwapIfBig32(header->cputype, swap) == CPU_TYPE_X86_64; |
| +} |
| + |
| +// Fills |id| with the UUID of the x86_64 Mach-O binary loaded at |module_addr|. |
| +// |offset| is the offset in bytes into |module_addr| where the x86_64 header is |
| +// located. |offset| is only relevant if the binary is FAT and contains multiple |
| +// architecture headers. Returns false if the header is malformed or the header |
| +// does not specify the UUID load command. |
| +bool GetX64UUIDAt(const void* module_addr, unsigned char* id, off_t offset) { |
| + const mach_header_64* header = reinterpret_cast<const mach_header_64*>( |
| + reinterpret_cast<const uint8_t*>(module_addr) + offset); |
| + if (header->magic != MH_MAGIC_64 && header->magic != MH_CIGAM_64) |
| + return false; |
| + |
| + bool swap = header->magic == MH_CIGAM_64; |
| + // Search all load commands for UUID command. |
| + offset += sizeof(mach_header_64); |
| + for (uint32_t i = 0; i < SwapIfBig32(header->ncmds, swap); ++i) { |
| + const load_command* current_cmd = reinterpret_cast<const load_command*>( |
| + reinterpret_cast<const uint8_t*>(module_addr) + offset); |
| + |
| + if (SwapIfBig32(current_cmd->cmd, swap) == LC_UUID) { |
| + const uuid_command* uuid_cmd = |
| + reinterpret_cast<const uuid_command*>(current_cmd); |
| + static_assert(sizeof(uuid_cmd->uuid) == sizeof(uuid_t), |
| + "UUID field of UUID command should be 16 bytes."); |
| + memcpy(id, &uuid_cmd->uuid, sizeof(uuid_t)); |
| + return true; |
| + } |
| + offset += SwapIfBig32(current_cmd->cmdsize, swap); |
| + } |
| + return false; |
| +} |
| + |
| +// Fills |id| with the Mach-O UUID retrieved from Mach-O binary loaded at |
| +// |module_addr|. This function returns false if the binary was not built for |
| +// X86_64 or if the UUID cannot be found. |
| +bool GetUUID(const void* module_addr, unsigned char* id) { |
| + off_t offset = 0; |
| + // If the module is not x86_64 exclusive, it could be a module that supports |
| + // multiple architectures. In that case, the appropriate header will be at |
| + // some non-zero offset. |
| + if (!IsX64Header(module_addr) && |
| + !(offset = GetMach64HeaderOffset(module_addr))) { |
| + return false; |
| + } |
| + return GetX64UUIDAt(module_addr, id, offset); |
| +} |
| + |
| +// Returns the hex encoding of a 16-byte ID for the binary loaded at |
| +// |module_addr|. Returns an empty string if the UUID cannot be found at |
| +// |module_addr|. |
| +std::string GetUniqueId(const void* module_addr) { |
| + unsigned char id[sizeof(uuid_t)]; |
| + if (!GetUUID(module_addr, id)) |
| + return ""; |
| + return HexEncode(id, sizeof(uuid_t)); |
| +} |
| + |
| +// Gets the index for the Module containing |instruction_pointer| in |
| +// |modules|, adding it if it's not already present. Returns |
| +// StackSamplingProfiler::Frame::kUnknownModuleIndex if no Module can be |
| +// determined for |module|. |
| +size_t GetModuleIndex(const uintptr_t instruction_pointer, |
| + std::vector<StackSamplingProfiler::Module>* modules, |
| + std::map<const void*, size_t>* profile_module_index) { |
| + Dl_info inf; |
| + if (!dladdr(reinterpret_cast<const void*>(instruction_pointer), &inf)) |
| + return StackSamplingProfiler::Frame::kUnknownModuleIndex; |
| + |
| + auto module_index = profile_module_index->find(inf.dli_fbase); |
| + if (module_index == profile_module_index->end()) { |
| + StackSamplingProfiler::Module module( |
| + reinterpret_cast<uintptr_t>(inf.dli_fbase), GetUniqueId(inf.dli_fbase), |
| + base::FilePath(inf.dli_fname)); |
| + modules->push_back(module); |
| + module_index = |
| + profile_module_index |
| + ->insert(std::make_pair(inf.dli_fbase, modules->size() - 1)) |
| + .first; |
| + } |
| + return module_index->second; |
| +} |
| + |
| +// ScopedSuspendThread -------------------------------------------------------- |
| + |
| +// Suspends a thread for the lifetime of the object. |
| +class ScopedSuspendThread { |
| + public: |
| + explicit ScopedSuspendThread(mach_port_t thread_port); |
| + ~ScopedSuspendThread(); |
| + |
| + bool was_successful() const { return was_successful_; } |
| + |
| + private: |
| + mach_port_t thread_port_; |
| + bool was_successful_; |
| + |
| + DISALLOW_COPY_AND_ASSIGN(ScopedSuspendThread); |
| +}; |
| + |
| +ScopedSuspendThread::ScopedSuspendThread(mach_port_t thread_port) |
| + : thread_port_(thread_port), |
| + was_successful_(thread_suspend(thread_port) == KERN_SUCCESS) {} |
| + |
| +ScopedSuspendThread::~ScopedSuspendThread() { |
| + if (!was_successful_) |
| + return; |
| + |
| + kern_return_t resume_result = thread_resume(thread_port_); |
| + CHECK_EQ(KERN_SUCCESS, resume_result) << "thread_resume failed"; |
| +} |
| + |
| +// NativeStackSamplerMac ------------------------------------------------------ |
| + |
// Performs stack sampling of a single target thread on macOS by suspending
// it, copying its stack, and walking the copy with libunwind.
class NativeStackSamplerMac : public NativeStackSampler {
 public:
  NativeStackSamplerMac(mach_port_t thread_port,
                        AnnotateCallback annotator,
                        NativeStackSamplerTestDelegate* test_delegate);
  ~NativeStackSamplerMac() override;

  // StackSamplingProfiler::NativeStackSampler:
  void ProfileRecordingStarting(
      std::vector<StackSamplingProfiler::Module>* modules) override;
  void RecordStackSample(StackSamplingProfiler::Sample* sample) override;
  void ProfileRecordingStopped() override;

 private:
  enum {
    // Intended to hold the largest stack used by Chrome. The default macOS
    // main thread stack size is 8 MB, and this allows for expansion if it
    // occurs.
    kStackCopyBufferSize = 12 * 1024 * 1024
  };

  // Suspends the thread with |thread_port_|, copies its stack and resumes the
  // thread, then records the stack frames and associated modules into
  // |sample|.
  void SuspendThreadAndRecordStack(StackSamplingProfiler::Sample* sample);

  // Weak reference: Mach port for thread being profiled.
  mach_port_t thread_port_;

  // Called on |sample| while the target thread is suspended; must not be
  // null (DCHECKed in the constructor).
  const AnnotateCallback annotator_;

  // May be null. Notified via OnPreStackWalk() after the stack copy.
  NativeStackSamplerTestDelegate* const test_delegate_;

  // The stack base address corresponding to |thread_port_|.
  const void* const thread_stack_base_address_;

  // Buffer to use for copies of the stack. We use the same buffer for all the
  // samples to avoid the overhead of multiple allocations and frees.
  const std::unique_ptr<unsigned char[]> stack_copy_buffer_;

  // Weak. Points to the modules associated with the profile being recorded
  // between ProfileRecordingStarting() and ProfileRecordingStopped().
  std::vector<StackSamplingProfiler::Module>* current_modules_ = nullptr;

  // Maps a module's base address to the corresponding Module's index within
  // current_modules_.
  std::map<const void*, size_t> profile_module_index_;

  DISALLOW_COPY_AND_ASSIGN(NativeStackSamplerMac);
};
| + |
NativeStackSamplerMac::NativeStackSamplerMac(
    mach_port_t thread_port,
    AnnotateCallback annotator,
    NativeStackSamplerTestDelegate* test_delegate)
    : thread_port_(thread_port),
      annotator_(annotator),
      test_delegate_(test_delegate),
      // Cache the stack base once; it is fixed for the thread's lifetime.
      thread_stack_base_address_(
          pthread_get_stackaddr_np(pthread_from_mach_thread_np(thread_port))),
      stack_copy_buffer_(new unsigned char[kStackCopyBufferSize]) {
  DCHECK(annotator_);

  // This class suspends threads, and those threads might be suspended in dyld.
  // Therefore, for all the system functions that might be linked in
  // dynamically that are used while threads are suspended, make calls to them
  // to make sure that they are linked up. The result of this warm-up call is
  // deliberately ignored.
  ThreadContext thread_context;
  GetThreadContext(thread_port_, &thread_context);
}
| + |
| +NativeStackSamplerMac::~NativeStackSamplerMac() {} |
| + |
void NativeStackSamplerMac::ProfileRecordingStarting(
    std::vector<StackSamplingProfiler::Module>* modules) {
  // Borrow the caller-owned modules list for the duration of the profile.
  current_modules_ = modules;
  // Module indices are per-profile; start the mapping fresh.
  profile_module_index_.clear();
}
| + |
void NativeStackSamplerMac::RecordStackSample(
    StackSamplingProfiler::Sample* sample) {
  // Sampling is only valid between ProfileRecordingStarting() and
  // ProfileRecordingStopped().
  DCHECK(current_modules_);

  // Defensive guard; stack_copy_buffer_ is allocated with new[], which
  // throws on failure, so in practice this is always non-null.
  if (!stack_copy_buffer_)
    return;

  SuspendThreadAndRecordStack(sample);
}
| + |
void NativeStackSamplerMac::ProfileRecordingStopped() {
  // Drop the borrowed modules pointer; it is only valid during recording.
  current_modules_ = nullptr;
}
| + |
| +void NativeStackSamplerMac::SuspendThreadAndRecordStack( |
| + StackSamplingProfiler::Sample* sample) { |
| + ThreadContext thread_context; |
| + |
| + // Copy the stack. |
| + |
| + { |
| + ScopedSuspendThread suspend_thread(thread_port_); |
| + if (!suspend_thread.was_successful()) |
| + return; |
| + |
| + if (!GetThreadContext(thread_port_, &thread_context)) |
| + return; |
| + uint64_t stack_top = reinterpret_cast<uint64_t>(thread_stack_base_address_); |
| + uint64_t stack_bottom = thread_context.rsp; |
| + |
| + if ((stack_top - stack_bottom) > kStackCopyBufferSize) |
| + return; |
| + |
| + (*annotator_)(sample); |
| + |
| + CopyStackAndRewritePointers( |
| + stack_copy_buffer_.get(), reinterpret_cast<void*>(stack_bottom), |
| + reinterpret_cast<void*>(stack_top), &thread_context); |
| + } // ScopedSuspendThread |
| + |
| + if (test_delegate_) |
| + test_delegate_->OnPreStackWalk(); |
| + |
| + // Walk the stack and record it. |
| + |
| + auto current_modules = current_modules_; |
| + auto profile_module_index = &profile_module_index_; |
|
Mike Wittman
2017/02/16 21:51:34
Can we add a sample->frames.reserve() call here to
Avi (use Gerrit)
2017/02/17 03:41:09
Ah! Done.
|
| + WalkStack(thread_context, [sample, current_modules, |
|
Mike Wittman
2017/02/16 21:51:34
Nice, using lambdas makes for a much cleaner solut
Avi (use Gerrit)
2017/02/17 03:41:09
Acknowledged.
|
| + profile_module_index](uintptr_t frame_ip) { |
| + sample->frames.push_back(StackSamplingProfiler::Frame( |
| + frame_ip, |
| + GetModuleIndex(frame_ip, current_modules, profile_module_index))); |
| + }); |
| +} |
| + |
| +} // namespace |
| + |
std::unique_ptr<NativeStackSampler> NativeStackSampler::Create(
    PlatformThreadId thread_id,
    AnnotateCallback annotator,
    NativeStackSamplerTestDelegate* test_delegate) {
#if defined(__i386__)
  // The sampler's stack walking only supports x86_64 (see ThreadContext);
  // 32-bit builds get no sampler.
  return nullptr;
#endif
  return base::MakeUnique<NativeStackSamplerMac>(thread_id, annotator,
                                                 test_delegate);
}
| + |
| +} // namespace base |