Chromium Code Reviews
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/profiler/native_stack_sampler.h"

#include <dlfcn.h>
#include <libkern/OSByteOrder.h>
#include <libunwind.h>
#include <mach-o/swap.h>
#include <mach/kern_return.h>
#include <mach/mach.h>
#include <mach/thread_act.h>
#include <pthread.h>
#include <sys/resource.h>
#include <sys/syslimits.h>

#include <algorithm>
#include <cstring>
#include <map>
#include <memory>

#include "base/logging.h"
#include "base/mac/mach_logging.h"
#include "base/macros.h"
#include "base/memory/ptr_util.h"
#include "base/strings/string_number_conversions.h"

namespace base {

namespace {

// Miscellaneous --------------------------------------------------------------

size_t StackCopyBufferSize() {
  static size_t stack_size = 0;
  if (stack_size)
    return stack_size;

  // In platform_thread_mac's GetDefaultThreadStackSize(), RLIMIT_STACK is
  // used for all stacks, not just the main thread's, so it is good for use
  // here.
  struct rlimit stack_rlimit;
  if (getrlimit(RLIMIT_STACK, &stack_rlimit) == 0 &&
      stack_rlimit.rlim_cur != RLIM_INFINITY) {
    stack_size = stack_rlimit.rlim_cur;
    return stack_size;
  }

  // If getrlimit somehow fails, return the default macOS main thread stack
  // size of 8 MB (DFLSSIZ in <i386/vmparam.h>) with extra wiggle room.
  return 12 * 1024 * 1024;
}

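For reference, the fallback constant works out to the 8 MB DFLSSIZ default plus 4 MB of headroom: 12 × 1024 × 1024 = 12,582,912 bytes, versus 8 × 1024 × 1024 = 8,388,608 bytes for the default main thread stack.
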
// Stack walking --------------------------------------------------------------

// Fills |state| with |target_thread|'s context.
//
// Note that this is called while a thread is suspended. Make very very sure
// that no shared resources (e.g. memory allocators) are used for the duration
// of this function.
bool GetThreadState(thread_act_t target_thread, x86_thread_state64_t* state) {
  mach_msg_type_number_t count =
      static_cast<mach_msg_type_number_t>(x86_THREAD_STATE64_COUNT);
  return thread_get_state(target_thread, x86_THREAD_STATE64,
                          reinterpret_cast<thread_state_t>(state),
                          &count) == KERN_SUCCESS;
}

// If the value at |pointer| points to the original stack, rewrites it to point
// to the corresponding location in the copied stack.
//
// Note that this is called while a thread is suspended. Make very very sure
// that no shared resources (e.g. memory allocators) are used for the duration
// of this function.
uintptr_t RewritePointerIfInOriginalStack(
    const uintptr_t* original_stack_bottom,
    const uintptr_t* original_stack_top,
    uintptr_t* stack_copy_bottom,
    uintptr_t pointer) {
  uintptr_t original_stack_bottom_int =
      reinterpret_cast<uintptr_t>(original_stack_bottom);
  uintptr_t original_stack_top_int =
      reinterpret_cast<uintptr_t>(original_stack_top);
  uintptr_t stack_copy_bottom_int =
      reinterpret_cast<uintptr_t>(stack_copy_bottom);

  if ((pointer < original_stack_bottom_int) ||
      (pointer >= original_stack_top_int)) {
    return pointer;
  }

  return stack_copy_bottom_int + (pointer - original_stack_bottom_int);
}

// Copies the stack to a buffer while rewriting possible pointers to locations
// within the stack to point to the corresponding locations in the copy. This
// is necessary to handle stack frames with dynamic stack allocation, where a
// pointer to the beginning of the dynamic allocation area is stored on the
// stack and/or in a non-volatile register.
//
// Eager rewriting of anything that looks like a pointer to the stack, as done
// in this function, does not adversely affect the stack unwinding. The only
// other values on the stack the unwinding depends on are return addresses,
// which should not point within the stack memory. The rewriting is guaranteed
// to catch all pointers because the stacks are guaranteed by the ABI to be
// sizeof(void*) aligned.
//
// Note that this is called while a thread is suspended. Make very very sure
// that no shared resources (e.g. memory allocators) are used for the duration
// of this function.
void CopyStackAndRewritePointers(uintptr_t* stack_copy_bottom,
                                 const uintptr_t* original_stack_bottom,
                                 const uintptr_t* original_stack_top,
                                 x86_thread_state64_t* thread_state)
    NO_SANITIZE("address") {
  size_t count = original_stack_top - original_stack_bottom;
  for (size_t pos = 0; pos < count; ++pos) {
    stack_copy_bottom[pos] = RewritePointerIfInOriginalStack(
        original_stack_bottom, original_stack_top, stack_copy_bottom,
        original_stack_bottom[pos]);
  }

  uint64_t* rewrite_registers[] = {&thread_state->__rbx, &thread_state->__rbp,
                                   &thread_state->__rsp, &thread_state->__r12,
                                   &thread_state->__r13, &thread_state->__r14,
                                   &thread_state->__r15};
  for (auto* reg : rewrite_registers) {
    *reg = RewritePointerIfInOriginalStack(
        original_stack_bottom, original_stack_top, stack_copy_bottom, *reg);
  }
}

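To make the rewriting concrete, here is a minimal sketch (hypothetical, not part of the CL under review) of how the two functions above behave on a toy four-slot "stack": the slot that points into the original stack is relocated to the matching offset in the copy, while the ordinary value passes through untouched.

// Toy check of the pointer-rewriting behavior; CheckRewriteSketch is a
// hypothetical name, not in the CL.
void CheckRewriteSketch() {
  uintptr_t original[4] = {0, 0x42, 0, 0};
  original[0] = reinterpret_cast<uintptr_t>(&original[2]);  // into the stack
  uintptr_t copy[4];
  x86_thread_state64_t state = {};
  CopyStackAndRewritePointers(copy, original, original + 4, &state);
  DCHECK_EQ(reinterpret_cast<uintptr_t>(&copy[2]), copy[0]);  // relocated
  DCHECK_EQ(static_cast<uintptr_t>(0x42), copy[1]);           // unchanged
}
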
// Walks the stack represented by |unwind_context|, calling back to the
// provided lambda for each frame. Returns false if an error occurred,
// otherwise returns true.
template <typename StackFrameCallback>
bool WalkStackFromContext(unw_context_t* unwind_context,

    Mike Wittman (2017/04/28 22:33:10):
      nit: remove the unused bool return value
    Avi (use Gerrit) (2017/04/28 22:45:55):
      Done.

                          size_t* frame_count,
                          const StackFrameCallback& callback) {
  unw_cursor_t unwind_cursor;
  unw_init_local(&unwind_cursor, unwind_context);

  int step_result;
  unw_word_t ip;
  do {
    ++(*frame_count);
    unw_get_reg(&unwind_cursor, UNW_REG_IP, &ip);

    callback(static_cast<uintptr_t>(ip));

    step_result = unw_step(&unwind_cursor);
  } while (step_result > 0);

  if (step_result != 0)
    return false;

  return true;
}

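As a sanity check of the walker, one could have a thread walk its own stack: unw_getcontext captures the caller's register state, which is exactly the unw_context_t that unw_init_local expects. A minimal sketch (hypothetical, not part of the CL; printf would need <cstdio>):

// Prints the caller's own stack; DumpOwnStackSketch is a hypothetical name.
void DumpOwnStackSketch() {
  unw_context_t context;
  unw_getcontext(&context);  // snapshot of this thread's registers
  size_t frame_count = 0;
  WalkStackFromContext(&context, &frame_count, [](uintptr_t ip) {
    printf("frame ip: 0x%lx\n", static_cast<unsigned long>(ip));
  });
}
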
bool IsIPInValidImage(unw_context_t* unwind_context) {
  unw_cursor_t unwind_cursor;
  unw_init_local(&unwind_cursor, unwind_context);
  unw_proc_info_t proc_info;
  unw_get_proc_info(&unwind_cursor, &proc_info);
  return proc_info.extra != 0;
}

// Walks the stack represented by |thread_state|, calling back to the provided
// lambda for each frame.
template <typename StackFrameCallback>
void WalkStack(const x86_thread_state64_t& thread_state,
               uintptr_t stack_top,
               const StackFrameCallback& callback) {
  size_t frame_count = 0;
  // This uses libunwind to walk the stack. libunwind is designed to be used
  // for a thread to walk its own stack. This creates two problems.

  // Problem 1: There is no official way to create a unw_context other than to
  // create it from the current state of the current thread's stack. To get
  // around this, forge a context. A unw_context is just a copy of the 16 main
  // registers followed by the instruction pointer, nothing more.
  // Coincidentally, the first 17 items of the x86_thread_state64_t type are
  // exactly those registers in exactly the same order, so just bulk copy them
  // over.
  unw_context_t unwind_context;
  memcpy(&unwind_context, &thread_state, sizeof(uintptr_t) * 17);
  WalkStackFromContext(&unwind_context, &frame_count, callback);

  // The second problem is one-frame walks, but for now see if this one walk
  // crashes.
}

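The memcpy above leans on two layout facts: x86_thread_state64_t begins with the 16 general-purpose registers followed by __rip, and unw_context_t is at least large enough to hold those 17 slots. Compile-time guards of roughly this shape (hypothetical, not in the CL; offsetof needs <cstddef>) would document the assumption:

static_assert(offsetof(x86_thread_state64_t, __rip) == 16 * sizeof(uint64_t),
              "forged unw_context expects __rip in the 17th register slot");
static_assert(sizeof(unw_context_t) >= 17 * sizeof(uint64_t),
              "unw_context_t must hold the 16 GP registers plus RIP");
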
// Module identifiers ---------------------------------------------------------

// Returns the hex encoding of a 16-byte ID for the binary loaded at
// |module_addr|. Returns an empty string if the UUID cannot be found at
// |module_addr|.
std::string GetUniqueId(const void* module_addr) {
  const mach_header_64* mach_header =
      reinterpret_cast<const mach_header_64*>(module_addr);
  DCHECK_EQ(MH_MAGIC_64, mach_header->magic);

  size_t offset = sizeof(mach_header_64);
  size_t offset_limit = sizeof(mach_header_64) + mach_header->sizeofcmds;
  for (uint32_t i = 0; (i < mach_header->ncmds) &&
                       (offset + sizeof(load_command) < offset_limit);
       ++i) {
    const load_command* current_cmd = reinterpret_cast<const load_command*>(
        reinterpret_cast<const uint8_t*>(mach_header) + offset);

    if (offset + current_cmd->cmdsize > offset_limit) {
      // This command runs off the end of the command list. This is malformed.
      return std::string();
    }

    if (current_cmd->cmd == LC_UUID) {
      if (current_cmd->cmdsize < sizeof(uuid_command)) {
        // This "UUID command" is too small. This is malformed.
        return std::string();
      }

      const uuid_command* uuid_cmd =
          reinterpret_cast<const uuid_command*>(current_cmd);
      static_assert(sizeof(uuid_cmd->uuid) == sizeof(uuid_t),
                    "UUID field of UUID command should be 16 bytes.");
      return HexEncode(&uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
    }
    offset += current_cmd->cmdsize;
  }
  return std::string();
}

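As a usage sketch (hypothetical, not part of the CL), the ID of the main executable could be obtained by handing GetUniqueId the header that dyld reports for image 0; this would require <mach-o/dyld.h>, and only 64-bit images are valid input given the MH_MAGIC_64 check above:

// MainExecutableIdSketch is a hypothetical name, not in the CL.
std::string MainExecutableIdSketch() {
  return GetUniqueId(_dyld_get_image_header(0));
}
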
// Gets the index for the Module containing |instruction_pointer| in
// |modules|, adding it if it's not already present. Returns
// StackSamplingProfiler::Frame::kUnknownModuleIndex if no Module can be
// determined for |instruction_pointer|.
size_t GetModuleIndex(const uintptr_t instruction_pointer,
                      std::vector<StackSamplingProfiler::Module>* modules,
                      std::map<const void*, size_t>* profile_module_index) {
  Dl_info inf;
  if (!dladdr(reinterpret_cast<const void*>(instruction_pointer), &inf))
    return StackSamplingProfiler::Frame::kUnknownModuleIndex;

  auto module_index = profile_module_index->find(inf.dli_fbase);
  if (module_index == profile_module_index->end()) {
    StackSamplingProfiler::Module module(
        reinterpret_cast<uintptr_t>(inf.dli_fbase), GetUniqueId(inf.dli_fbase),
        base::FilePath(inf.dli_fname));
    modules->push_back(module);
    module_index =
        profile_module_index
            ->insert(std::make_pair(inf.dli_fbase, modules->size() - 1))
            .first;
  }
  return module_index->second;
}

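The map makes repeated lookups cheap: the first call for an address in a given module appends a Module and records its index, and subsequent calls for any address in that module hit the cache. A minimal sketch (hypothetical, not part of the CL):

// ModuleCachingSketch is a hypothetical name, not in the CL.
void ModuleCachingSketch() {
  std::vector<StackSamplingProfiler::Module> modules;
  std::map<const void*, size_t> index;
  const uintptr_t ip = reinterpret_cast<uintptr_t>(&StackCopyBufferSize);
  size_t first = GetModuleIndex(ip, &modules, &index);
  size_t second = GetModuleIndex(ip, &modules, &index);
  DCHECK_EQ(first, second);       // same cached index
  DCHECK_EQ(1u, modules.size());  // module appended only once
}
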
// ScopedSuspendThread --------------------------------------------------------

// Suspends a thread for the lifetime of the object.
class ScopedSuspendThread {
 public:
  explicit ScopedSuspendThread(mach_port_t thread_port)
      : thread_port_(thread_suspend(thread_port) == KERN_SUCCESS
                         ? thread_port
                         : MACH_PORT_NULL) {}

  ~ScopedSuspendThread() {
    if (!was_successful())
      return;

    kern_return_t kr = thread_resume(thread_port_);
    MACH_CHECK(kr == KERN_SUCCESS, kr) << "thread_resume";
  }

  bool was_successful() const { return thread_port_ != MACH_PORT_NULL; }

 private:
  mach_port_t thread_port_;

  DISALLOW_COPY_AND_ASSIGN(ScopedSuspendThread);
};

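Typical use is a short block where the target thread is known to be stopped, as in SuspendThreadAndRecordStack() below; the thread resumes when the object goes out of scope. A minimal sketch (hypothetical, not part of the CL):

// InspectThreadSketch is a hypothetical name, not in the CL.
void InspectThreadSketch(mach_port_t thread_port) {
  ScopedSuspendThread suspend_thread(thread_port);
  if (!suspend_thread.was_successful())
    return;
  // |thread_port|'s thread is stopped here; avoid heap allocation.
  x86_thread_state64_t state;
  GetThreadState(thread_port, &state);
}  // |suspend_thread| resumes the thread on destruction.
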
// NativeStackSamplerMac ------------------------------------------------------

class NativeStackSamplerMac : public NativeStackSampler {
 public:
  NativeStackSamplerMac(mach_port_t thread_port,
                        AnnotateCallback annotator,
                        NativeStackSamplerTestDelegate* test_delegate);
  ~NativeStackSamplerMac() override;

  // StackSamplingProfiler::NativeStackSampler:
  void ProfileRecordingStarting(
      std::vector<StackSamplingProfiler::Module>* modules) override;
  void RecordStackSample(StackSamplingProfiler::Sample* sample) override;
  void ProfileRecordingStopped() override;

 private:
  // Suspends the thread with |thread_port_|, copies its stack, resumes the
  // thread, and then records the stack frames and associated modules into
  // |sample|.
  void SuspendThreadAndRecordStack(StackSamplingProfiler::Sample* sample);

  // Weak reference: Mach port for thread being profiled.
  mach_port_t thread_port_;

  const AnnotateCallback annotator_;

  NativeStackSamplerTestDelegate* const test_delegate_;

  // The stack base address corresponding to |thread_port_|.
  const void* const thread_stack_base_address_;

  // The size of |stack_copy_buffer_|.
  const size_t stack_copy_buffer_size_;

  // Buffer to use for copies of the stack. We use the same buffer for all the
  // samples to avoid the overhead of multiple allocations and frees.
  const std::unique_ptr<unsigned char[]> stack_copy_buffer_;

  // Weak. Points to the modules associated with the profile being recorded
  // between ProfileRecordingStarting() and ProfileRecordingStopped().
  std::vector<StackSamplingProfiler::Module>* current_modules_ = nullptr;

  // Maps a module's base address to the corresponding Module's index within
  // current_modules_.
  std::map<const void*, size_t> profile_module_index_;

  DISALLOW_COPY_AND_ASSIGN(NativeStackSamplerMac);
};

NativeStackSamplerMac::NativeStackSamplerMac(
    mach_port_t thread_port,
    AnnotateCallback annotator,
    NativeStackSamplerTestDelegate* test_delegate)
    : thread_port_(thread_port),
      annotator_(annotator),
      test_delegate_(test_delegate),
      thread_stack_base_address_(
          pthread_get_stackaddr_np(pthread_from_mach_thread_np(thread_port))),
      stack_copy_buffer_size_(StackCopyBufferSize()),
      stack_copy_buffer_(new unsigned char[stack_copy_buffer_size_]) {
  DCHECK(annotator_);

  // This class suspends threads, and those threads might be suspended in
  // dyld. Therefore, call any dynamically linked system functions that are
  // used while threads are suspended now, so that dyld resolves them up front
  // rather than while a thread is suspended.
  x86_thread_state64_t thread_state;
  GetThreadState(thread_port_, &thread_state);
}

NativeStackSamplerMac::~NativeStackSamplerMac() {}

void NativeStackSamplerMac::ProfileRecordingStarting(
    std::vector<StackSamplingProfiler::Module>* modules) {
  current_modules_ = modules;
  profile_module_index_.clear();
}

void NativeStackSamplerMac::RecordStackSample(
    StackSamplingProfiler::Sample* sample) {
  DCHECK(current_modules_);

  SuspendThreadAndRecordStack(sample);
}

void NativeStackSamplerMac::ProfileRecordingStopped() {
  current_modules_ = nullptr;
}

void NativeStackSamplerMac::SuspendThreadAndRecordStack(
    StackSamplingProfiler::Sample* sample) {
  x86_thread_state64_t thread_state;

  // Copy the stack.

  uintptr_t new_stack_top = 0;
  {
    // IMPORTANT NOTE: Do not do ANYTHING in this scope that might allocate
    // memory, including indirectly via use of DCHECK/CHECK or other logging
    // statements. Otherwise this code can deadlock on heap locks in the
    // default heap acquired by the target thread before it was suspended.
    ScopedSuspendThread suspend_thread(thread_port_);
    if (!suspend_thread.was_successful())
      return;

    if (!GetThreadState(thread_port_, &thread_state))
      return;
    uintptr_t stack_top =
        reinterpret_cast<uintptr_t>(thread_stack_base_address_);
    uintptr_t stack_bottom = thread_state.__rsp;
    if (stack_bottom >= stack_top)
      return;
    uintptr_t stack_size = stack_top - stack_bottom;

    if (stack_size > stack_copy_buffer_size_)
      return;

    (*annotator_)(sample);

    CopyStackAndRewritePointers(
        reinterpret_cast<uintptr_t*>(stack_copy_buffer_.get()),
        reinterpret_cast<uintptr_t*>(stack_bottom),
        reinterpret_cast<uintptr_t*>(stack_top), &thread_state);

    new_stack_top =
        reinterpret_cast<uintptr_t>(stack_copy_buffer_.get()) + stack_size;
  }  // ScopedSuspendThread

  if (test_delegate_)
    test_delegate_->OnPreStackWalk();

  // Walk the stack and record it.

  // Reserve enough memory for most stacks, to avoid repeated allocations.
  // Approximately 99.9% of recorded stacks are 128 frames or fewer.
  sample->frames.reserve(128);

  auto* current_modules = current_modules_;
  auto* profile_module_index = &profile_module_index_;
  WalkStack(
      thread_state, new_stack_top,
      [sample, current_modules, profile_module_index](uintptr_t frame_ip) {
        sample->frames.push_back(StackSamplingProfiler::Frame(
            frame_ip,
            GetModuleIndex(frame_ip, current_modules, profile_module_index)));
      });
}

}  // namespace

std::unique_ptr<NativeStackSampler> NativeStackSampler::Create(
    PlatformThreadId thread_id,
    AnnotateCallback annotator,
    NativeStackSamplerTestDelegate* test_delegate) {
  return base::MakeUnique<NativeStackSamplerMac>(thread_id, annotator,
                                                 test_delegate);
}

}  // namespace base