OLD | NEW |
(Empty) | |
| 1 // Copyright 2017 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "base/profiler/native_stack_sampler.h" |
| 6 |
| 7 #include <dlfcn.h> |
| 8 #include <libkern/OSByteOrder.h> |
| 9 #include <libunwind.h> |
| 10 #include <mach-o/swap.h> |
| 11 #include <mach/kern_return.h> |
| 12 #include <mach/mach.h> |
| 13 #include <mach/thread_act.h> |
| 14 #include <pthread.h> |
| 15 #include <sys/syslimits.h> |
| 16 |
| 17 #include <algorithm> |
| 18 #include <map> |
| 19 #include <memory> |
| 20 |
| 21 #include "base/logging.h" |
| 22 #include "base/mac/mach_logging.h" |
| 23 #include "base/macros.h" |
| 24 #include "base/memory/ptr_util.h" |
| 25 #include "base/strings/string_number_conversions.h" |
| 26 |
| 27 namespace base { |
| 28 |
| 29 namespace { |
| 30 |
| 31 // Stack walking -------------------------------------------------------------- |
| 32 |
| 33 // Fills |state| with |target_thread|'s context. |
| 34 // |
| 35 // Note that this is called while a thread is suspended. Make very very sure |
| 36 // that no shared resources (e.g. memory allocators) are used for the duration |
| 37 // of this function. |
| 38 bool GetThreadState(thread_act_t target_thread, x86_thread_state64_t* state) { |
| 39 mach_msg_type_number_t count = |
| 40 static_cast<mach_msg_type_number_t>(x86_THREAD_STATE64_COUNT); |
| 41 return thread_get_state(target_thread, x86_THREAD_STATE64, |
| 42 reinterpret_cast<thread_state_t>(state), |
| 43 &count) == KERN_SUCCESS; |
| 44 } |
| 45 |
// If |pointer| falls within [original_stack_bottom, original_stack_top),
// returns the corresponding address inside the stack copy that starts at
// |stack_copy_bottom|; otherwise returns |pointer| unchanged.
//
// Note that this is called while a thread is suspended. Make very very sure
// that no shared resources (e.g. memory allocators) are used for the duration
// of this function.
uintptr_t RewritePointerIfInOriginalStack(uintptr_t* original_stack_bottom,
                                          uintptr_t* original_stack_top,
                                          uintptr_t* stack_copy_bottom,
                                          uintptr_t pointer) {
  const uintptr_t bottom = reinterpret_cast<uintptr_t>(original_stack_bottom);
  const uintptr_t top = reinterpret_cast<uintptr_t>(original_stack_top);

  const bool in_original_stack = pointer >= bottom && pointer < top;
  if (!in_original_stack)
    return pointer;

  // Preserve the pointer's offset from the stack bottom, but relative to the
  // copy.
  return reinterpret_cast<uintptr_t>(stack_copy_bottom) + (pointer - bottom);
}
| 70 |
| 71 // Copies the stack to a buffer while rewriting possible pointers to locations |
| 72 // within the stack to point to the corresponding locations in the copy. This is |
| 73 // necessary to handle stack frames with dynamic stack allocation, where a |
| 74 // pointer to the beginning of the dynamic allocation area is stored on the |
| 75 // stack and/or in a non-volatile register. |
| 76 // |
| 77 // Eager rewriting of anything that looks like a pointer to the stack, as done |
| 78 // in this function, does not adversely affect the stack unwinding. The only |
| 79 // other values on the stack the unwinding depends on are return addresses, |
| 80 // which should not point within the stack memory. The rewriting is guaranteed |
| 81 // to catch all pointers because the stacks are guaranteed by the ABI to be |
| 82 // sizeof(void*) aligned. |
| 83 // |
| 84 // Note that this is called while a thread is suspended. Make very very sure |
| 85 // that no shared resources (e.g. memory allocators) are used for the duration |
| 86 // of this function. |
| 87 void CopyStackAndRewritePointers(uintptr_t* stack_copy_bottom, |
| 88 uintptr_t* original_stack_bottom, |
| 89 uintptr_t* original_stack_top, |
| 90 x86_thread_state64_t* thread_state) |
| 91 NO_SANITIZE("address") { |
| 92 size_t count = original_stack_top - original_stack_bottom; |
| 93 for (size_t pos = 0; pos < count; ++pos) { |
| 94 stack_copy_bottom[pos] = RewritePointerIfInOriginalStack( |
| 95 original_stack_bottom, original_stack_top, stack_copy_bottom, |
| 96 original_stack_bottom[pos]); |
| 97 } |
| 98 |
| 99 uint64_t* rewrite_registers[] = {&thread_state->__rbx, &thread_state->__rbp, |
| 100 &thread_state->__rsp, &thread_state->__r12, |
| 101 &thread_state->__r13, &thread_state->__r14, |
| 102 &thread_state->__r15}; |
| 103 for (auto* reg : rewrite_registers) { |
| 104 *reg = RewritePointerIfInOriginalStack( |
| 105 original_stack_bottom, original_stack_top, stack_copy_bottom, *reg); |
| 106 } |
| 107 } |
| 108 |
| 109 // Walks the stack represented by |unwind_context|, calling back to the provided |
| 110 // lambda for each frame. Returns false if an error occurred, otherwise returns |
| 111 // true. |
| 112 template <typename StackFrameCallback> |
| 113 bool WalkStackFromContext(unw_context_t* unwind_context, |
| 114 size_t* frame_count, |
| 115 const StackFrameCallback& callback) { |
| 116 unw_cursor_t unwind_cursor; |
| 117 unw_init_local(&unwind_cursor, unwind_context); |
| 118 |
| 119 int step_result; |
| 120 unw_word_t ip; |
| 121 do { |
| 122 ++(*frame_count); |
| 123 unw_get_reg(&unwind_cursor, UNW_REG_IP, &ip); |
| 124 |
| 125 callback(static_cast<uintptr_t>(ip)); |
| 126 |
| 127 step_result = unw_step(&unwind_cursor); |
| 128 } while (step_result > 0); |
| 129 |
| 130 if (step_result != 0) |
| 131 return false; |
| 132 |
| 133 return true; |
| 134 } |
| 135 |
| 136 bool IsIPInValidImage(unw_context_t* unwind_context) { |
| 137 unw_cursor_t unwind_cursor; |
| 138 unw_init_local(&unwind_cursor, unwind_context); |
| 139 unw_proc_info_t proc_info; |
| 140 unw_get_proc_info(&unwind_cursor, &proc_info); |
| 141 return proc_info.extra != 0; |
| 142 } |
| 143 |
// Walks the stack represented by |thread_state|, calling back to the provided
// lambda for each frame. |stack_top| bounds the scan performed in the
// fallback path below.
template <typename StackFrameCallback>
void WalkStack(const x86_thread_state64_t& thread_state,
               uintptr_t stack_top,
               const StackFrameCallback& callback) {
  size_t frame_count = 0;
  // This uses libunwind to walk the stack. libunwind is designed to be used for
  // a thread to walk its own stack. This creates two problems.

  // Problem 1: There is no official way to create a unw_context other than to
  // create it from the current state of the current thread's stack. To get
  // around this, forge a context. A unw_context is just a copy of the 16 main
  // registers followed by the instruction pointer, nothing more.
  // Coincidentally, the first 17 items of the x86_thread_state64_t type are
  // exactly those registers in exactly the same order, so just bulk copy them
  // over.
  unw_context_t unwind_context;
  memcpy(&unwind_context, &thread_state, sizeof(uintptr_t) * 17);
  bool result = WalkStackFromContext(&unwind_context, &frame_count, callback);

  if (!result)
    return;

  if (frame_count == 1) {
    // Problem 2: Because libunwind is designed to be triggered by user code on
    // their own thread, if it hits a library that has no unwind info for the
    // function that is being executed, it just stops. This isn't a problem in
    // the normal case, but in this case, it's quite possible that the stack
    // being walked is stopped in a function that bridges to the kernel and thus
    // is missing the unwind info.
    //
    // If so, cheat by scanning the stack and trying again. Only do this if the
    // first time using libunwind fails after one frame.
    bool ip_in_valid_image = false;
    // Per the 17-register bulk copy above, data[7] aliases rsp and data[16]
    // aliases rip in the forged context.
    auto& rsp = unwind_context.data[7];
    auto& rip = unwind_context.data[16];
    // NOTE(review): kMaxScanDepth is added to rsp in *bytes* (~6 words on
    // x86-64), not stack slots — confirm this shallow scan depth is intended.
    const uintptr_t kMaxScanDepth = 50;
    uintptr_t scan_limit = std::min<uintptr_t>(stack_top, rsp + kMaxScanDepth);
    do {
      // Pop one word off the (copied) stack and treat it as a candidate
      // return address.
      rip = *reinterpret_cast<uintptr_t*>(rsp);  // rip = *rsp
      rsp += sizeof(uintptr_t);                  // rsp++
      // NOTE(review): alignment is verified only after the dereference above;
      // an initially misaligned rsp is read once before this guard trips —
      // confirm that's acceptable (the copy buffer itself is aligned).
      if (rsp % sizeof(uintptr_t)) {
        // The "stack pointer" isn't aligned. Just give up.
        return;
      }

      ip_in_valid_image = IsIPInValidImage(&unwind_context);
    } while (!ip_in_valid_image && rsp < scan_limit);

    // Retry the unwind only if the scan found an ip inside a loaded image.
    if (ip_in_valid_image)
      WalkStackFromContext(&unwind_context, &frame_count, callback);
  }
}
| 198 |
| 199 // Module identifiers --------------------------------------------------------- |
| 200 |
| 201 // Returns the hex encoding of a 16-byte ID for the binary loaded at |
| 202 // |module_addr|. Returns an empty string if the UUID cannot be found at |
| 203 // |module_addr|. |
| 204 std::string GetUniqueId(const void* module_addr) { |
| 205 const mach_header_64* mach_header = |
| 206 reinterpret_cast<const mach_header_64*>(module_addr); |
| 207 DCHECK_EQ(MH_MAGIC_64, mach_header->magic); |
| 208 |
| 209 size_t offset = sizeof(mach_header_64); |
| 210 size_t offset_limit = sizeof(mach_header_64) + mach_header->sizeofcmds; |
| 211 for (uint32_t i = 0; (i < mach_header->ncmds) && |
| 212 (offset + sizeof(load_command) < offset_limit); |
| 213 ++i) { |
| 214 const load_command* current_cmd = reinterpret_cast<const load_command*>( |
| 215 reinterpret_cast<const uint8_t*>(mach_header) + offset); |
| 216 |
| 217 if (offset + current_cmd->cmdsize > offset_limit) { |
| 218 // This command runs off the end of the command list. This is malformed. |
| 219 return std::string(); |
| 220 } |
| 221 |
| 222 if (current_cmd->cmd == LC_UUID) { |
| 223 if (current_cmd->cmdsize < sizeof(uuid_command)) { |
| 224 // This "UUID command" is too small. This is malformed. |
| 225 return std::string(); |
| 226 } |
| 227 |
| 228 const uuid_command* uuid_cmd = |
| 229 reinterpret_cast<const uuid_command*>(current_cmd); |
| 230 static_assert(sizeof(uuid_cmd->uuid) == sizeof(uuid_t), |
| 231 "UUID field of UUID command should be 16 bytes."); |
| 232 return HexEncode(&uuid_cmd->uuid, sizeof(uuid_cmd->uuid)); |
| 233 } |
| 234 offset += current_cmd->cmdsize; |
| 235 } |
| 236 return std::string(); |
| 237 } |
| 238 |
| 239 // Gets the index for the Module containing |instruction_pointer| in |
| 240 // |modules|, adding it if it's not already present. Returns |
| 241 // StackSamplingProfiler::Frame::kUnknownModuleIndex if no Module can be |
| 242 // determined for |module|. |
| 243 size_t GetModuleIndex(const uintptr_t instruction_pointer, |
| 244 std::vector<StackSamplingProfiler::Module>* modules, |
| 245 std::map<const void*, size_t>* profile_module_index) { |
| 246 Dl_info inf; |
| 247 if (!dladdr(reinterpret_cast<const void*>(instruction_pointer), &inf)) |
| 248 return StackSamplingProfiler::Frame::kUnknownModuleIndex; |
| 249 |
| 250 auto module_index = profile_module_index->find(inf.dli_fbase); |
| 251 if (module_index == profile_module_index->end()) { |
| 252 StackSamplingProfiler::Module module( |
| 253 reinterpret_cast<uintptr_t>(inf.dli_fbase), GetUniqueId(inf.dli_fbase), |
| 254 base::FilePath(inf.dli_fname)); |
| 255 modules->push_back(module); |
| 256 module_index = |
| 257 profile_module_index |
| 258 ->insert(std::make_pair(inf.dli_fbase, modules->size() - 1)) |
| 259 .first; |
| 260 } |
| 261 return module_index->second; |
| 262 } |
| 263 |
| 264 // ScopedSuspendThread -------------------------------------------------------- |
| 265 |
| 266 // Suspends a thread for the lifetime of the object. |
| 267 class ScopedSuspendThread { |
| 268 public: |
| 269 explicit ScopedSuspendThread(mach_port_t thread_port) |
| 270 : thread_port_(thread_suspend(thread_port) == KERN_SUCCESS |
| 271 ? thread_port |
| 272 : MACH_PORT_NULL) {} |
| 273 |
| 274 ~ScopedSuspendThread() { |
| 275 if (!was_successful()) |
| 276 return; |
| 277 |
| 278 kern_return_t kr = thread_resume(thread_port_); |
| 279 MACH_CHECK(kr == KERN_SUCCESS, kr) << "thread_resume"; |
| 280 } |
| 281 |
| 282 bool was_successful() const { return thread_port_ != MACH_PORT_NULL; } |
| 283 |
| 284 private: |
| 285 mach_port_t thread_port_; |
| 286 |
| 287 DISALLOW_COPY_AND_ASSIGN(ScopedSuspendThread); |
| 288 }; |
| 289 |
| 290 // NativeStackSamplerMac ------------------------------------------------------ |
| 291 |
// Mac (x86-64) implementation of NativeStackSampler: suspends the target
// thread, copies its stack, then unwinds the copy with libunwind.
class NativeStackSamplerMac : public NativeStackSampler {
 public:
  NativeStackSamplerMac(mach_port_t thread_port,
                        AnnotateCallback annotator,
                        NativeStackSamplerTestDelegate* test_delegate);
  ~NativeStackSamplerMac() override;

  // StackSamplingProfiler::NativeStackSampler:
  void ProfileRecordingStarting(
      std::vector<StackSamplingProfiler::Module>* modules) override;
  void RecordStackSample(StackSamplingProfiler::Sample* sample) override;
  void ProfileRecordingStopped() override;

 private:
  // Intended to hold the largest stack used by Chrome. The default macOS main
  // thread stack size is 8 MB, and this allows for expansion if it occurs.
  static constexpr size_t kStackCopyBufferSize = 12 * 1024 * 1024;

  // Suspends the thread with |thread_port_|, copies its stack and resumes the
  // thread, then records the stack frames and associated modules into |sample|.
  void SuspendThreadAndRecordStack(StackSamplingProfiler::Sample* sample);

  // Weak reference: Mach port for thread being profiled.
  mach_port_t thread_port_;

  // Invoked on each sample while the target thread is suspended; must not
  // allocate (see SuspendThreadAndRecordStack).
  const AnnotateCallback annotator_;

  // Weak; may be null. Notified before each stack walk, for tests.
  NativeStackSamplerTestDelegate* const test_delegate_;

  // The stack base address corresponding to |thread_port_|.
  const void* const thread_stack_base_address_;

  // Buffer to use for copies of the stack. We use the same buffer for all the
  // samples to avoid the overhead of multiple allocations and frees.
  const std::unique_ptr<unsigned char[]> stack_copy_buffer_;

  // Weak. Points to the modules associated with the profile being recorded
  // between ProfileRecordingStarting() and ProfileRecordingStopped().
  std::vector<StackSamplingProfiler::Module>* current_modules_ = nullptr;

  // Maps a module's base address to the corresponding Module's index within
  // current_modules_.
  std::map<const void*, size_t> profile_module_index_;

  DISALLOW_COPY_AND_ASSIGN(NativeStackSamplerMac);
};
| 338 |
| 339 NativeStackSamplerMac::NativeStackSamplerMac( |
| 340 mach_port_t thread_port, |
| 341 AnnotateCallback annotator, |
| 342 NativeStackSamplerTestDelegate* test_delegate) |
| 343 : thread_port_(thread_port), |
| 344 annotator_(annotator), |
| 345 test_delegate_(test_delegate), |
| 346 thread_stack_base_address_( |
| 347 pthread_get_stackaddr_np(pthread_from_mach_thread_np(thread_port))), |
| 348 stack_copy_buffer_(new unsigned char[kStackCopyBufferSize]) { |
| 349 DCHECK(annotator_); |
| 350 |
| 351 // This class suspends threads, and those threads might be suspended in dyld. |
| 352 // Therefore, for all the system functions that might be linked in dynamically |
| 353 // that are used while threads are suspended, make calls to them to make sure |
| 354 // that they are linked up. |
| 355 x86_thread_state64_t thread_state; |
| 356 GetThreadState(thread_port_, &thread_state); |
| 357 } |
| 358 |
| 359 NativeStackSamplerMac::~NativeStackSamplerMac() {} |
| 360 |
| 361 void NativeStackSamplerMac::ProfileRecordingStarting( |
| 362 std::vector<StackSamplingProfiler::Module>* modules) { |
| 363 current_modules_ = modules; |
| 364 profile_module_index_.clear(); |
| 365 } |
| 366 |
| 367 void NativeStackSamplerMac::RecordStackSample( |
| 368 StackSamplingProfiler::Sample* sample) { |
| 369 DCHECK(current_modules_); |
| 370 |
| 371 SuspendThreadAndRecordStack(sample); |
| 372 } |
| 373 |
| 374 void NativeStackSamplerMac::ProfileRecordingStopped() { |
| 375 current_modules_ = nullptr; |
| 376 } |
| 377 |
// Suspends the target thread just long enough to snapshot its registers and
// copy its stack into |stack_copy_buffer_|, then resumes it and walks the
// copy, appending one Frame per stack frame to |sample|.
void NativeStackSamplerMac::SuspendThreadAndRecordStack(
    StackSamplingProfiler::Sample* sample) {
  x86_thread_state64_t thread_state;

  // Copy the stack.

  uintptr_t new_stack_top = 0;
  {
    // IMPORTANT NOTE: Do not do ANYTHING in this scope that might allocate
    // memory, including indirectly via use of DCHECK/CHECK or other logging
    // statements. Otherwise this code can deadlock on heap locks in the
    // default heap acquired by the target thread before it was suspended.
    ScopedSuspendThread suspend_thread(thread_port_);
    if (!suspend_thread.was_successful())
      return;

    if (!GetThreadState(thread_port_, &thread_state))
      return;
    // The stack grows down: base address is the top, rsp is the live bottom.
    uintptr_t stack_top =
        reinterpret_cast<uintptr_t>(thread_stack_base_address_);
    uintptr_t stack_bottom = thread_state.__rsp;
    if (stack_bottom >= stack_top)
      return;
    uintptr_t stack_size = stack_top - stack_bottom;

    // Silently drop the sample rather than overflow the fixed copy buffer.
    if (stack_size > kStackCopyBufferSize)
      return;

    // The annotator runs inside the no-allocation window; it must obey the
    // same constraints as this scope.
    (*annotator_)(sample);

    // Copies the stack words and rewrites intra-stack pointers (including the
    // saved registers in |thread_state|) to point into the copy.
    CopyStackAndRewritePointers(
        reinterpret_cast<uintptr_t*>(stack_copy_buffer_.get()),
        reinterpret_cast<uintptr_t*>(stack_bottom),
        reinterpret_cast<uintptr_t*>(stack_top), &thread_state);

    new_stack_top =
        reinterpret_cast<uintptr_t>(stack_copy_buffer_.get()) + stack_size;
  }  // ScopedSuspendThread

  if (test_delegate_)
    test_delegate_->OnPreStackWalk();

  // Walk the stack and record it.

  // Reserve enough memory for most stacks, to avoid repeated allocations.
  // Approximately 99.9% of recorded stacks are 128 frames or fewer.
  sample->frames.reserve(128);

  // Capture raw pointers rather than |this| so the lambda's dependencies are
  // explicit.
  auto* current_modules = current_modules_;
  auto* profile_module_index = &profile_module_index_;
  WalkStack(
      thread_state, new_stack_top,
      [sample, current_modules, profile_module_index](uintptr_t frame_ip) {
        sample->frames.push_back(StackSamplingProfiler::Frame(
            frame_ip,
            GetModuleIndex(frame_ip, current_modules, profile_module_index)));
      });
}
| 436 |
| 437 } // namespace |
| 438 |
| 439 std::unique_ptr<NativeStackSampler> NativeStackSampler::Create( |
| 440 PlatformThreadId thread_id, |
| 441 AnnotateCallback annotator, |
| 442 NativeStackSamplerTestDelegate* test_delegate) { |
| 443 #if !defined(__x86_64__) |
| 444 // No. |
| 445 return nullptr; |
| 446 #endif |
| 447 return base::MakeUnique<NativeStackSamplerMac>(thread_id, annotator, |
| 448 test_delegate); |
| 449 } |
| 450 |
| 451 } // namespace base |
OLD | NEW |