OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2017 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "base/profiler/native_stack_sampler.h" | |
6 | |
7 #include <dlfcn.h> | |
8 #include <libkern/OSByteOrder.h> | |
9 #include <libunwind.h> | |
10 #include <mach-o/swap.h> | |
11 #include <mach/kern_return.h> | |
12 #include <mach/mach.h> | |
13 #include <mach/thread_act.h> | |
14 #include <pthread.h> | |
15 #include <sys/syslimits.h> | |
16 | |
17 #include <map> | |
18 #include <memory> | |
19 | |
20 #include "base/logging.h" | |
21 #include "base/macros.h" | |
22 #include "base/memory/ptr_util.h" | |
23 #include "base/strings/string_number_conversions.h" | |
24 | |
25 namespace base { | |
26 | |
27 namespace { | |
28 | |
29 // Stack walking -------------------------------------------------------------- | |
30 | |
// Copy of the x86_64 thread context structure from the x86_thread_state64_t
// type. The struct is copied rather than used directly because its fields can
// have different names on different versions of Darwin.
//
// The field order matters: code in this file fills the struct with a raw
// kernel copy and also copies its leading registers (rax through rip) as a
// flat array of 64-bit words.
struct ThreadContext {
  uint64_t rax;
  uint64_t rbx;
  uint64_t rcx;
  uint64_t rdx;
  uint64_t rdi;
  uint64_t rsi;
  uint64_t rbp;
  uint64_t rsp;
  uint64_t r8;
  uint64_t r9;
  uint64_t r10;
  uint64_t r11;
  uint64_t r12;
  uint64_t r13;
  uint64_t r14;
  uint64_t r15;
  uint64_t rip;
  uint64_t rflags;
  uint64_t cs;
  uint64_t fs;
  uint64_t gs;
};

// Guard the raw-copy assumptions above: 21 registers, 8 bytes each, and no
// padding inserted by the compiler.
static_assert(sizeof(ThreadContext) == 21 * sizeof(uint64_t),
              "ThreadContext must be exactly 21 packed 64-bit registers.");
57 | |
58 // Fills |state| with |target_thread|'s context. | |
Mike Wittman
2017/02/16 21:51:34
Should we have deadlock warnings analogous to the
Avi (use Gerrit)
2017/02/17 03:41:09
Done.
| |
59 bool GetThreadContext(thread_act_t target_thread, ThreadContext* state) { | |
60 mach_msg_type_number_t count = | |
61 static_cast<mach_msg_type_number_t>(MACHINE_THREAD_STATE_COUNT); | |
62 return thread_get_state(target_thread, x86_THREAD_STATE64, | |
63 reinterpret_cast<thread_state_t>(state), | |
64 &count) == KERN_SUCCESS; | |
65 } | |
66 | |
// Translates |pointer| from the original stack into the copied stack.
//
// If |pointer| lies in [original_stack_bottom, original_stack_top), returns
// the address at the same byte offset from |stack_copy_bottom|. Any other
// value is returned unchanged.
uint64_t RewritePointerIfInOriginalStack(uint64_t* original_stack_bottom,
                                         uint64_t* original_stack_top,
                                         uint64_t* stack_copy_bottom,
                                         uint64_t pointer) {
  const uint64_t range_begin =
      reinterpret_cast<uint64_t>(original_stack_bottom);
  const uint64_t range_end = reinterpret_cast<uint64_t>(original_stack_top);

  const bool points_into_original =
      (pointer >= range_begin) && (pointer < range_end);
  if (!points_into_original)
    return pointer;

  // Preserve the byte offset from the bottom of the stack.
  const uint64_t offset_in_stack = pointer - range_begin;
  return reinterpret_cast<uint64_t>(stack_copy_bottom) + offset_in_stack;
}
87 | |
88 void CopyStackAndRewritePointers(void* dest, | |
89 void* from, | |
90 void* to, | |
91 ThreadContext* thread_context) | |
92 NO_SANITIZE("address") { | |
93 uint64_t* original_stack_bottom = static_cast<uint64_t*>(from); | |
94 uint64_t* original_stack_top = static_cast<uint64_t*>(to); | |
95 uint64_t* stack_copy_bottom = static_cast<uint64_t*>(dest); | |
96 DCHECK_EQ( | |
97 0u, reinterpret_cast<uint64_t>(original_stack_bottom) % sizeof(uint64_t)); | |
98 DCHECK_EQ(0u, | |
99 reinterpret_cast<uint64_t>(original_stack_top) % sizeof(uint64_t)); | |
100 DCHECK_EQ(0u, | |
101 reinterpret_cast<uint64_t>(stack_copy_bottom) % sizeof(uint64_t)); | |
102 | |
103 size_t count = original_stack_top - original_stack_bottom; | |
104 for (size_t pos = 0; pos < count; ++pos) { | |
105 stack_copy_bottom[pos] = RewritePointerIfInOriginalStack( | |
106 original_stack_bottom, original_stack_top, stack_copy_bottom, | |
107 original_stack_bottom[pos]); | |
108 } | |
109 | |
110 thread_context->rbp = | |
111 RewritePointerIfInOriginalStack(original_stack_bottom, original_stack_top, | |
112 stack_copy_bottom, thread_context->rbp); | |
113 thread_context->rsp = | |
114 RewritePointerIfInOriginalStack(original_stack_bottom, original_stack_top, | |
115 stack_copy_bottom, thread_context->rsp); | |
116 } | |
117 | |
118 const char* LibSystemKernelName() { | |
119 static char path[PATH_MAX]; | |
120 static char* name = nullptr; | |
121 if (name) | |
122 return name; | |
123 | |
124 Dl_info info; | |
125 dladdr(reinterpret_cast<void*>(_exit), &info); | |
126 strncpy(path, info.dli_fname, PATH_MAX); | |
127 name = path; | |
128 DCHECK_EQ(std::string(name), | |
129 std::string("/usr/lib/system/libsystem_kernel.dylib")); | |
130 return name; | |
131 } | |
132 | |
// Outcome of a single stack walk attempt.
enum StackWalkResult : int {
  // The unwinder reported a failure.
  ERROR = -1,
  // The walk ran to completion.
  SUCCESS = 0,
  // The walk stopped after one frame located in the system kernel library.
  SYSCALL = 1,
};
138 | |
139 // Walks the stack represented by |unwind_context|, calling back to the provided | |
140 // lambda for each frame. | |
141 template <typename StackFrameCallback> | |
142 StackWalkResult WalkStackFromContext(unw_context_t* unwind_context, | |
143 const StackFrameCallback& callback) { | |
144 unw_cursor_t unwind_cursor; | |
145 unw_init_local(&unwind_cursor, unwind_context); | |
146 | |
147 int step_result; | |
148 unw_word_t ip; | |
149 size_t frames = 0; | |
150 do { | |
151 ++frames; | |
152 unw_get_reg(&unwind_cursor, UNW_REG_IP, &ip); | |
153 | |
154 callback(static_cast<uintptr_t>(ip)); | |
155 | |
156 step_result = unw_step(&unwind_cursor); | |
Mike Wittman
2017/02/16 21:51:34
General questions: How is unwinding of leaf functi
Avi (use Gerrit)
2017/02/17 03:41:09
I believe the compiler puts in full debug info eve
Mark Mentovai
2017/02/17 05:21:05
Avi wrote:
Mike Wittman
2017/02/17 17:09:15
Can we verify the behavior by running the profiler
Avi (use Gerrit)
2017/02/17 17:18:12
a) Didn't catch that the stack_sampling_configurat
Mike Wittman
2017/02/17 17:38:51
That's the only other thing necessary to enable fo
| |
157 } while (step_result > 0); | |
158 | |
159 if (step_result != 0) | |
160 return StackWalkResult::ERROR; | |
161 | |
162 Dl_info info; | |
163 if (frames == 1 && dladdr(reinterpret_cast<void*>(ip), &info) != 0 && | |
164 strcmp(info.dli_fname, LibSystemKernelName()) == 0) { | |
165 return StackWalkResult::SYSCALL; | |
166 } | |
167 | |
168 return StackWalkResult::SUCCESS; | |
169 } | |
170 | |
171 // Walks the stack represented by |thread_context|, calling back to the provided | |
172 // lambda for each frame. | |
173 template <typename StackFrameCallback> | |
174 void WalkStack(const ThreadContext& thread_context, | |
175 const StackFrameCallback& callback) { | |
176 // This uses libunwind to walk the stack. libunwind is designed to be used for | |
177 // a thread to walk its own stack. This creates two problems. | |
178 | |
179 // Problem 1: There is no official way to create a unw_context other than to | |
180 // create it from the current state of the current thread's stack. To get | |
181 // around this, forge a context. A unw_context is just a copy of the register | |
182 // file followed by the instruction pointer. Coincidentally, the first 17 | |
183 // items of the ThreadContext type are exactly that! | |
184 unw_context_t unwind_context; | |
185 memcpy(&unwind_context, &thread_context, sizeof(uint64_t) * 17); | |
186 StackWalkResult result = WalkStackFromContext(&unwind_context, callback); | |
187 | |
188 if (result == StackWalkResult::SYSCALL) { | |
189 // Problem 2: Because libunwind is designed to be triggered by user code on | |
190 // their own thread, if it hits a library that has no unwind info for the | |
191 // function that is being executed, it just stops. This isn't a problem in | |
192 // the normal case, but in this case, it's quite possible that the stack | |
193 // being walked is stopped in a function that bridges to the kernel and thus | |
194 // is missing the unwind info. | |
195 // | |
196 // If so, cheat by manually unwinding one stack frame and trying again. | |
197 unwind_context.data[7] = thread_context.rsp + 8; // rsp++ | |
198 unwind_context.data[16] = | |
199 *reinterpret_cast<uint64_t*>(thread_context.rsp); // rip = *rsp | |
200 WalkStackFromContext(&unwind_context, callback); | |
201 } | |
202 } | |
203 | |
204 // Module identifiers --------------------------------------------------------- | |
205 | |
206 // Helper that swaps byte order in |x| if |swap| flag is set. | |
207 uint32_t SwapIfBig32(uint32_t x, bool swap) { | |
208 if (swap) | |
209 return OSSwapBigToHostInt32(x); | |
210 return x; | |
211 } | |
212 | |
213 // Returns the offset in bytes where the x86_64 header is located in a binary | |
214 // loaded at |module_addr|. Returns 0 if |module_addr| is not a valid FAT | |
215 // Mach-O binary or has not been built for x86_64. | |
216 off_t GetMach64HeaderOffset(const void* module_addr) { | |
217 const fat_header* header = reinterpret_cast<const fat_header*>(module_addr); | |
218 if (header->magic != FAT_MAGIC && header->magic != FAT_CIGAM) | |
219 return 0; | |
220 | |
221 // Search all FAT architectures for x86_64. | |
222 const fat_arch* fat_arches = reinterpret_cast<const fat_arch*>( | |
223 reinterpret_cast<const uint8_t*>(module_addr) + sizeof(header)); | |
224 uint32_t n_arches = OSSwapBigToHostInt32(header->nfat_arch); | |
225 for (uint32_t i = 0; i < n_arches; ++i) { | |
226 const fat_arch& arch = fat_arches[i]; | |
227 if (OSSwapBigToHostInt32(arch.cputype) == CPU_TYPE_X86_64) | |
228 return OSSwapBigToHostInt32(arch.offset); | |
229 } | |
230 return 0; | |
231 } | |
232 | |
233 // Returns true if the Mach-O binary at |module_addr| was built specifically for | |
234 // the x86_64 CPU architecture. | |
235 bool IsX64Header(const void* module_addr) { | |
236 const mach_header_64* header = | |
237 reinterpret_cast<const mach_header_64*>(module_addr); | |
238 if (header->magic != MH_MAGIC_64 && header->magic != MH_CIGAM_64) | |
239 return false; | |
240 bool swap = header->magic == MH_CIGAM_64; | |
241 return SwapIfBig32(header->cputype, swap) == CPU_TYPE_X86_64; | |
242 } | |
243 | |
244 // Fills |id| with the UUID of the x86_64 Mach-O binary loaded at |module_addr|. | |
245 // |offset| is the offset in bytes into |module_addr| where the x86_64 header is | |
246 // located. |offset| is only relevant if the binary is FAT and contains multiple | |
247 // architecture headers. Returns false if the header is malformed or the header | |
248 // does not specify the UUID load command. | |
249 bool GetX64UUIDAt(const void* module_addr, unsigned char* id, off_t offset) { | |
250 const mach_header_64* header = reinterpret_cast<const mach_header_64*>( | |
251 reinterpret_cast<const uint8_t*>(module_addr) + offset); | |
252 if (header->magic != MH_MAGIC_64 && header->magic != MH_CIGAM_64) | |
253 return false; | |
254 | |
255 bool swap = header->magic == MH_CIGAM_64; | |
256 // Search all load commands for UUID command. | |
257 offset += sizeof(mach_header_64); | |
258 for (uint32_t i = 0; i < SwapIfBig32(header->ncmds, swap); ++i) { | |
259 const load_command* current_cmd = reinterpret_cast<const load_command*>( | |
260 reinterpret_cast<const uint8_t*>(module_addr) + offset); | |
261 | |
262 if (SwapIfBig32(current_cmd->cmd, swap) == LC_UUID) { | |
263 const uuid_command* uuid_cmd = | |
264 reinterpret_cast<const uuid_command*>(current_cmd); | |
265 static_assert(sizeof(uuid_cmd->uuid) == sizeof(uuid_t), | |
266 "UUID field of UUID command should be 16 bytes."); | |
267 memcpy(id, &uuid_cmd->uuid, sizeof(uuid_t)); | |
268 return true; | |
269 } | |
270 offset += SwapIfBig32(current_cmd->cmdsize, swap); | |
271 } | |
272 return false; | |
273 } | |
274 | |
275 // Fills |id| with the Mach-O UUID retrieved from Mach-O binary loaded at | |
276 // |module_addr|. This function returns false if the binary was not built for | |
277 // X86_64 or if the UUID cannot be found. | |
278 bool GetUUID(const void* module_addr, unsigned char* id) { | |
279 off_t offset = 0; | |
280 // If the module is not x86_64 exclusive, it could be a module that supports | |
281 // multiple architectures. In that case, the appropriate header will be at | |
282 // some non-zero offset. | |
283 if (!IsX64Header(module_addr) && | |
284 !(offset = GetMach64HeaderOffset(module_addr))) { | |
285 return false; | |
286 } | |
287 return GetX64UUIDAt(module_addr, id, offset); | |
288 } | |
289 | |
290 // Returns the hex encoding of a 16-byte ID for the binary loaded at | |
291 // |module_addr|. Returns an empty string if the UUID cannot be found at | |
292 // |module_addr|. | |
293 std::string GetUniqueId(const void* module_addr) { | |
294 unsigned char id[sizeof(uuid_t)]; | |
295 if (!GetUUID(module_addr, id)) | |
296 return ""; | |
297 return HexEncode(id, sizeof(uuid_t)); | |
298 } | |
299 | |
300 // Gets the index for the Module containing |instruction_pointer| in | |
301 // |modules|, adding it if it's not already present. Returns | |
302 // StackSamplingProfiler::Frame::kUnknownModuleIndex if no Module can be | |
303 // determined for |module|. | |
304 size_t GetModuleIndex(const uintptr_t instruction_pointer, | |
305 std::vector<StackSamplingProfiler::Module>* modules, | |
306 std::map<const void*, size_t>* profile_module_index) { | |
307 Dl_info inf; | |
308 if (!dladdr(reinterpret_cast<const void*>(instruction_pointer), &inf)) | |
309 return StackSamplingProfiler::Frame::kUnknownModuleIndex; | |
310 | |
311 auto module_index = profile_module_index->find(inf.dli_fbase); | |
312 if (module_index == profile_module_index->end()) { | |
313 StackSamplingProfiler::Module module( | |
314 reinterpret_cast<uintptr_t>(inf.dli_fbase), GetUniqueId(inf.dli_fbase), | |
315 base::FilePath(inf.dli_fname)); | |
316 modules->push_back(module); | |
317 module_index = | |
318 profile_module_index | |
319 ->insert(std::make_pair(inf.dli_fbase, modules->size() - 1)) | |
320 .first; | |
321 } | |
322 return module_index->second; | |
323 } | |
324 | |
325 // ScopedSuspendThread -------------------------------------------------------- | |
326 | |
327 // Suspends a thread for the lifetime of the object. | |
328 class ScopedSuspendThread { | |
329 public: | |
330 explicit ScopedSuspendThread(mach_port_t thread_port); | |
331 ~ScopedSuspendThread(); | |
332 | |
333 bool was_successful() const { return was_successful_; } | |
334 | |
335 private: | |
336 mach_port_t thread_port_; | |
337 bool was_successful_; | |
338 | |
339 DISALLOW_COPY_AND_ASSIGN(ScopedSuspendThread); | |
340 }; | |
341 | |
342 ScopedSuspendThread::ScopedSuspendThread(mach_port_t thread_port) | |
343 : thread_port_(thread_port), | |
344 was_successful_(thread_suspend(thread_port) == KERN_SUCCESS) {} | |
345 | |
346 ScopedSuspendThread::~ScopedSuspendThread() { | |
347 if (!was_successful_) | |
348 return; | |
349 | |
350 kern_return_t resume_result = thread_resume(thread_port_); | |
351 CHECK_EQ(KERN_SUCCESS, resume_result) << "thread_resume failed"; | |
352 } | |
353 | |
354 // NativeStackSamplerMac ------------------------------------------------------ | |
355 | |
356 class NativeStackSamplerMac : public NativeStackSampler { | |
357 public: | |
358 NativeStackSamplerMac(mach_port_t thread_port, | |
359 AnnotateCallback annotator, | |
360 NativeStackSamplerTestDelegate* test_delegate); | |
361 ~NativeStackSamplerMac() override; | |
362 | |
363 // StackSamplingProfiler::NativeStackSampler: | |
364 void ProfileRecordingStarting( | |
365 std::vector<StackSamplingProfiler::Module>* modules) override; | |
366 void RecordStackSample(StackSamplingProfiler::Sample* sample) override; | |
367 void ProfileRecordingStopped() override; | |
368 | |
369 private: | |
370 enum { | |
371 // Intended to hold the largest stack used by Chrome. The default macOS main | |
372 // thread stack size is 8 MB, and this allows for expansion if it occurs. | |
373 kStackCopyBufferSize = 12 * 1024 * 1024 | |
374 }; | |
375 | |
376 // Suspends the thread with |thread_port_|, copies its stack and resumes the | |
377 // thread, then records the stack frames and associated modules into |sample|. | |
378 void SuspendThreadAndRecordStack(StackSamplingProfiler::Sample* sample); | |
379 | |
380 // Weak reference: Mach port for thread being profiled. | |
381 mach_port_t thread_port_; | |
382 | |
383 const AnnotateCallback annotator_; | |
384 | |
385 NativeStackSamplerTestDelegate* const test_delegate_; | |
386 | |
387 // The stack base address corresponding to |thread_handle_|. | |
388 const void* const thread_stack_base_address_; | |
389 | |
390 // Buffer to use for copies of the stack. We use the same buffer for all the | |
391 // samples to avoid the overhead of multiple allocations and frees. | |
392 const std::unique_ptr<unsigned char[]> stack_copy_buffer_; | |
393 | |
394 // Weak. Points to the modules associated with the profile being recorded | |
395 // between ProfileRecordingStarting() and ProfileRecordingStopped(). | |
396 std::vector<StackSamplingProfiler::Module>* current_modules_ = nullptr; | |
397 | |
398 // Maps a module's base address to the corresponding Module's index within | |
399 // current_modules_. | |
400 std::map<const void*, size_t> profile_module_index_; | |
401 | |
402 DISALLOW_COPY_AND_ASSIGN(NativeStackSamplerMac); | |
403 }; | |
404 | |
405 NativeStackSamplerMac::NativeStackSamplerMac( | |
406 mach_port_t thread_port, | |
407 AnnotateCallback annotator, | |
408 NativeStackSamplerTestDelegate* test_delegate) | |
409 : thread_port_(thread_port), | |
410 annotator_(annotator), | |
411 test_delegate_(test_delegate), | |
412 thread_stack_base_address_( | |
413 pthread_get_stackaddr_np(pthread_from_mach_thread_np(thread_port))), | |
414 stack_copy_buffer_(new unsigned char[kStackCopyBufferSize]) { | |
415 DCHECK(annotator_); | |
416 | |
417 // This class suspends threads, and those threads might be suspended in dyld. | |
418 // Therefore, for all the system functions that might be linked in dynamically | |
419 // that are used while threads are suspended, make calls to them to make sure | |
420 // that they are linked up. | |
421 ThreadContext thread_context; | |
422 GetThreadContext(thread_port_, &thread_context); | |
423 } | |
424 | |
425 NativeStackSamplerMac::~NativeStackSamplerMac() {} | |
426 | |
427 void NativeStackSamplerMac::ProfileRecordingStarting( | |
428 std::vector<StackSamplingProfiler::Module>* modules) { | |
429 current_modules_ = modules; | |
430 profile_module_index_.clear(); | |
431 } | |
432 | |
433 void NativeStackSamplerMac::RecordStackSample( | |
434 StackSamplingProfiler::Sample* sample) { | |
435 DCHECK(current_modules_); | |
436 | |
437 if (!stack_copy_buffer_) | |
438 return; | |
439 | |
440 SuspendThreadAndRecordStack(sample); | |
441 } | |
442 | |
443 void NativeStackSamplerMac::ProfileRecordingStopped() { | |
444 current_modules_ = nullptr; | |
445 } | |
446 | |
447 void NativeStackSamplerMac::SuspendThreadAndRecordStack( | |
448 StackSamplingProfiler::Sample* sample) { | |
449 ThreadContext thread_context; | |
450 | |
451 // Copy the stack. | |
452 | |
453 { | |
454 ScopedSuspendThread suspend_thread(thread_port_); | |
455 if (!suspend_thread.was_successful()) | |
456 return; | |
457 | |
458 if (!GetThreadContext(thread_port_, &thread_context)) | |
459 return; | |
460 uint64_t stack_top = reinterpret_cast<uint64_t>(thread_stack_base_address_); | |
461 uint64_t stack_bottom = thread_context.rsp; | |
462 | |
463 if ((stack_top - stack_bottom) > kStackCopyBufferSize) | |
464 return; | |
465 | |
466 (*annotator_)(sample); | |
467 | |
468 CopyStackAndRewritePointers( | |
469 stack_copy_buffer_.get(), reinterpret_cast<void*>(stack_bottom), | |
470 reinterpret_cast<void*>(stack_top), &thread_context); | |
471 } // ScopedSuspendThread | |
472 | |
473 if (test_delegate_) | |
474 test_delegate_->OnPreStackWalk(); | |
475 | |
476 // Walk the stack and record it. | |
477 | |
478 auto current_modules = current_modules_; | |
479 auto profile_module_index = &profile_module_index_; | |
Mike Wittman
2017/02/16 21:51:34
Can we add a sample->frames.reserve() call here to
Avi (use Gerrit)
2017/02/17 03:41:09
Ah! Done.
| |
480 WalkStack(thread_context, [sample, current_modules, | |
Mike Wittman
2017/02/16 21:51:34
Nice, using lambdas makes for a much cleaner solut
Avi (use Gerrit)
2017/02/17 03:41:09
Acknowledged.
| |
481 profile_module_index](uintptr_t frame_ip) { | |
482 sample->frames.push_back(StackSamplingProfiler::Frame( | |
483 frame_ip, | |
484 GetModuleIndex(frame_ip, current_modules, profile_module_index))); | |
485 }); | |
486 } | |
487 | |
488 } // namespace | |
489 | |
490 std::unique_ptr<NativeStackSampler> NativeStackSampler::Create( | |
491 PlatformThreadId thread_id, | |
492 AnnotateCallback annotator, | |
493 NativeStackSamplerTestDelegate* test_delegate) { | |
494 #if defined(__i386__) | |
495 return nullptr; | |
496 #endif | |
497 return base::MakeUnique<NativeStackSamplerMac>(thread_id, annotator, | |
498 test_delegate); | |
499 } | |
500 | |
501 } // namespace base | |
OLD | NEW |