// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/profiler/native_stack_sampler.h"

#include <dlfcn.h>
#include <libkern/OSByteOrder.h>
#include <libunwind.h>
#include <mach-o/swap.h>
#include <mach/kern_return.h>
#include <mach/mach.h>
#include <mach/thread_act.h>
#include <pthread.h>
#include <sys/syslimits.h>

#include <algorithm>
#include <map>
#include <memory>

#include "base/logging.h"
#include "base/mac/mach_logging.h"
#include "base/macros.h"
#include "base/memory/ptr_util.h"
#include "base/strings/string_number_conversions.h"

namespace base {

namespace {

// Stack walking --------------------------------------------------------------

// Fills |state| with |target_thread|'s context.
//
// Note that this is called while a thread is suspended. Make very very sure
// that no shared resources (e.g. memory allocators) are used for the duration
// of this function.
bool GetThreadState(thread_act_t target_thread, x86_thread_state64_t* state) {
  mach_msg_type_number_t count =
      static_cast<mach_msg_type_number_t>(x86_THREAD_STATE64_COUNT);
  return thread_get_state(target_thread, x86_THREAD_STATE64,
                          reinterpret_cast<thread_state_t>(state),
                          &count) == KERN_SUCCESS;
}

// If the value at |pointer| points to the original stack, rewrites it to point
// to the corresponding location in the copied stack.
//
// Note that this is called while a thread is suspended. Make very very sure
// that no shared resources (e.g. memory allocators) are used for the duration
// of this function.
uintptr_t RewritePointerIfInOriginalStack(uintptr_t* original_stack_bottom,
                                          uintptr_t* original_stack_top,
                                          uintptr_t* stack_copy_bottom,
                                          uintptr_t pointer) {
  uintptr_t original_stack_bottom_int =
      reinterpret_cast<uintptr_t>(original_stack_bottom);
  uintptr_t original_stack_top_int =
      reinterpret_cast<uintptr_t>(original_stack_top);
  uintptr_t stack_copy_bottom_int =
      reinterpret_cast<uintptr_t>(stack_copy_bottom);

  if ((pointer < original_stack_bottom_int) ||
      (pointer >= original_stack_top_int)) {
    return pointer;
  }

  return stack_copy_bottom_int + (pointer - original_stack_bottom_int);
}

// Copies the stack to a buffer while rewriting possible pointers to locations
// within the stack to point to the corresponding locations in the copy. This is
// necessary to handle stack frames with dynamic stack allocation, where a
// pointer to the beginning of the dynamic allocation area is stored on the
// stack and/or in a non-volatile register.
//
// Eager rewriting of anything that looks like a pointer to the stack, as done
// in this function, does not adversely affect the stack unwinding. The only
// other values on the stack the unwinding depends on are return addresses,
// which should not point within the stack memory. The rewriting is guaranteed
// to catch all pointers because the stacks are guaranteed by the ABI to be
// sizeof(void*) aligned.
//
// Note that this is called while a thread is suspended. Make very very sure
// that no shared resources (e.g. memory allocators) are used for the duration
// of this function.
void CopyStackAndRewritePointers(uintptr_t* stack_copy_bottom,
                                 uintptr_t* original_stack_bottom,
                                 uintptr_t* original_stack_top,
                                 x86_thread_state64_t* thread_state)
    NO_SANITIZE("address") {
  size_t count = original_stack_top - original_stack_bottom;
  for (size_t pos = 0; pos < count; ++pos) {
    stack_copy_bottom[pos] = RewritePointerIfInOriginalStack(
        original_stack_bottom, original_stack_top, stack_copy_bottom,
        original_stack_bottom[pos]);
  }

  uint64_t* rewrite_registers[] = {&thread_state->__rbx, &thread_state->__rbp,
                                   &thread_state->__rsp, &thread_state->__r12,
                                   &thread_state->__r13, &thread_state->__r14,
                                   &thread_state->__r15};
  for (auto* reg : rewrite_registers) {
    *reg = RewritePointerIfInOriginalStack(
        original_stack_bottom, original_stack_top, stack_copy_bottom, *reg);
  }
}

// Walks the stack represented by |unwind_context|, calling back to the provided
// lambda for each frame. Returns false if an error occurred, otherwise returns
// true.
template <typename StackFrameCallback>
bool WalkStackFromContext(unw_context_t* unwind_context,
                          size_t* frame_count,
                          const StackFrameCallback& callback) {
  unw_cursor_t unwind_cursor;
  unw_init_local(&unwind_cursor, unwind_context);

  int step_result;
  unw_word_t ip;
  do {
    ++(*frame_count);
    unw_get_reg(&unwind_cursor, UNW_REG_IP, &ip);

    callback(static_cast<uintptr_t>(ip));

    step_result = unw_step(&unwind_cursor);
  } while (step_result > 0);

  if (step_result != 0)
    return false;

  return true;
}

bool IsIPInValidImage(unw_context_t* unwind_context) {
  unw_cursor_t unwind_cursor;
  unw_init_local(&unwind_cursor, unwind_context);
  unw_proc_info_t proc_info;
  unw_get_proc_info(&unwind_cursor, &proc_info);
  return proc_info.extra != 0;
}

// Walks the stack represented by |thread_state|, calling back to the provided
// lambda for each frame.
template <typename StackFrameCallback>
void WalkStack(const x86_thread_state64_t& thread_state,
               uintptr_t stack_top,
               const StackFrameCallback& callback) {
  size_t frame_count = 0;
  // This uses libunwind to walk the stack. libunwind is designed to be used for
  // a thread to walk its own stack. This creates two problems.

  // Problem 1: There is no official way to create a unw_context other than to
  // create it from the current state of the current thread's stack. To get
  // around this, forge a context. A unw_context is just a copy of the 16 main
  // registers followed by the instruction pointer, nothing more.
  // Coincidentally, the first 17 items of the x86_thread_state64_t type are
  // exactly those registers in exactly the same order, so just bulk copy them
  // over.
  unw_context_t unwind_context;
  memcpy(&unwind_context, &thread_state, sizeof(uintptr_t) * 17);
  bool result = WalkStackFromContext(&unwind_context, &frame_count, callback);

  if (!result)
    return;

  if (frame_count == 1) {
    // Problem 2: Because libunwind is designed to be triggered by user code on
    // their own thread, if it hits a library that has no unwind info for the
    // function that is being executed, it just stops. This isn't a problem in
    // the normal case, but in this case, it's quite possible that the stack
    // being walked is stopped in a function that bridges to the kernel and thus
    // is missing the unwind info.
    //
    // If so, cheat by scanning the stack and trying again. Only do this if the
    // first time using libunwind fails after one frame.
    bool ip_in_valid_image = false;
    auto& rsp = unwind_context.data[7];
    auto& rip = unwind_context.data[16];
    const uintptr_t kMaxScanDepth = 50;
    uintptr_t scan_limit = std::min<uintptr_t>(stack_top, rsp + kMaxScanDepth);
    do {
      rip = *reinterpret_cast<uintptr_t*>(rsp);  // rip = *rsp
      rsp += sizeof(uintptr_t);                  // rsp++
      if (rsp % sizeof(uintptr_t)) {
        // The "stack pointer" isn't aligned. Just give up.
        return;
      }

      ip_in_valid_image = IsIPInValidImage(&unwind_context);
    } while (!ip_in_valid_image && rsp < scan_limit);

    if (ip_in_valid_image)
      WalkStackFromContext(&unwind_context, &frame_count, callback);
  }
}

// Module identifiers ---------------------------------------------------------

// Returns the hex encoding of a 16-byte ID for the binary loaded at
// |module_addr|. Returns an empty string if the UUID cannot be found at
// |module_addr|.
std::string GetUniqueId(const void* module_addr) {
  const mach_header_64* mach_header =
      reinterpret_cast<const mach_header_64*>(module_addr);
  DCHECK_EQ(MH_MAGIC_64, mach_header->magic);

  size_t offset = sizeof(mach_header_64);
  size_t offset_limit = sizeof(mach_header_64) + mach_header->sizeofcmds;
  for (uint32_t i = 0; (i < mach_header->ncmds) &&
                       (offset + sizeof(load_command) < offset_limit);
       ++i) {
    const load_command* current_cmd = reinterpret_cast<const load_command*>(
        reinterpret_cast<const uint8_t*>(mach_header) + offset);

    if (offset + current_cmd->cmdsize > offset_limit) {
      // This command runs off the end of the command list. This is malformed.
      return std::string();
    }

    if (current_cmd->cmd == LC_UUID) {
      if (current_cmd->cmdsize < sizeof(uuid_command)) {
        // This "UUID command" is too small. This is malformed.
        return std::string();
      }

      const uuid_command* uuid_cmd =
          reinterpret_cast<const uuid_command*>(current_cmd);
      static_assert(sizeof(uuid_cmd->uuid) == sizeof(uuid_t),
                    "UUID field of UUID command should be 16 bytes.");
      return HexEncode(&uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
    }
    offset += current_cmd->cmdsize;
  }
  return std::string();
}

// Gets the index for the Module containing |instruction_pointer| in
// |modules|, adding it if it's not already present. Returns
// StackSamplingProfiler::Frame::kUnknownModuleIndex if no Module can be
// determined for |module|.
size_t GetModuleIndex(const uintptr_t instruction_pointer,
                      std::vector<StackSamplingProfiler::Module>* modules,
                      std::map<const void*, size_t>* profile_module_index) {
  Dl_info inf;
  if (!dladdr(reinterpret_cast<const void*>(instruction_pointer), &inf))
    return StackSamplingProfiler::Frame::kUnknownModuleIndex;

  auto module_index = profile_module_index->find(inf.dli_fbase);
  if (module_index == profile_module_index->end()) {
    StackSamplingProfiler::Module module(
        reinterpret_cast<uintptr_t>(inf.dli_fbase), GetUniqueId(inf.dli_fbase),
        base::FilePath(inf.dli_fname));
    modules->push_back(module);
    module_index =
        profile_module_index
            ->insert(std::make_pair(inf.dli_fbase, modules->size() - 1))
            .first;
  }
  return module_index->second;
}

// ScopedSuspendThread --------------------------------------------------------

// Suspends a thread for the lifetime of the object.
class ScopedSuspendThread {
 public:
  explicit ScopedSuspendThread(mach_port_t thread_port)
      : thread_port_(thread_suspend(thread_port) == KERN_SUCCESS
                         ? thread_port
                         : MACH_PORT_NULL) {}

  ~ScopedSuspendThread() {
    if (!was_successful())
      return;

    kern_return_t kr = thread_resume(thread_port_);
    MACH_CHECK(kr == KERN_SUCCESS, kr) << "thread_resume";
  }

  bool was_successful() const { return thread_port_ != MACH_PORT_NULL; }

 private:
  mach_port_t thread_port_;

  DISALLOW_COPY_AND_ASSIGN(ScopedSuspendThread);
};

// NativeStackSamplerMac ------------------------------------------------------

class NativeStackSamplerMac : public NativeStackSampler {
 public:
  NativeStackSamplerMac(mach_port_t thread_port,
                        AnnotateCallback annotator,
                        NativeStackSamplerTestDelegate* test_delegate);
  ~NativeStackSamplerMac() override;

  // StackSamplingProfiler::NativeStackSampler:
  void ProfileRecordingStarting(
      std::vector<StackSamplingProfiler::Module>* modules) override;
  void RecordStackSample(StackSamplingProfiler::Sample* sample) override;
  void ProfileRecordingStopped() override;

 private:
  // Intended to hold the largest stack used by Chrome. The default macOS main
  // thread stack size is 8 MB, and this allows for expansion if it occurs.
  static constexpr size_t kStackCopyBufferSize = 12 * 1024 * 1024;
Mark Mentovai (2017/03/27 18:24:55): You could also getrlimit(RLIMIT_STACK) to compute
Avi (use Gerrit) (2017/03/29 17:52:09): Right, but that's for the main thread and this is
Mike Wittman (2017/03/29 18:08:09): I'd like to keep this as a single buffer size that
Avi (use Gerrit) (2017/03/29 19:04:15): Acknowledged.
Mark Mentovai (2017/03/29 19:23:49): I don't really think a non-main thread would use m
Avi (use Gerrit) (2017/03/29 19:59:25): Yes, I see the use of getrlimit in platform_thread
Mike Wittman (2017/03/29 20:03:02): It's still a single buffer size across all threads
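For context on the suggestion above, here is a minimal sketch of what sizing the copy buffer from getrlimit(RLIMIT_STACK) might look like. It is not part of the change under review; the helper name and the fallback behavior are illustrative assumptions, and the 12 MB fallback simply mirrors the CL's kStackCopyBufferSize. As the later comments note, RLIMIT_STACK describes the main thread's stack, and the CL keeps a single fixed buffer size for all sampled threads.

// Illustrative sketch only (not part of this CL): derive the stack copy buffer
// size from the process stack limit instead of a hard-coded constant.
#include <sys/resource.h>

#include <algorithm>
#include <cstddef>

size_t ComputeStackCopyBufferSize() {
  constexpr size_t kDefaultSize = 12 * 1024 * 1024;  // the CL's fixed size
  rlimit limit;
  if (getrlimit(RLIMIT_STACK, &limit) != 0 || limit.rlim_cur == RLIM_INFINITY)
    return kDefaultSize;  // query failed or the stack is "unlimited"
  // RLIMIT_STACK covers the main thread; other threads get their stack size
  // from pthread attributes, so never shrink below the default.
  return std::max(kDefaultSize, static_cast<size_t>(limit.rlim_cur));
}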

  // Suspends the thread with |thread_port_|, copies its stack and resumes the
  // thread, then records the stack frames and associated modules into |sample|.
  void SuspendThreadAndRecordStack(StackSamplingProfiler::Sample* sample);

  // Weak reference: Mach port for thread being profiled.
  mach_port_t thread_port_;

  const AnnotateCallback annotator_;

  NativeStackSamplerTestDelegate* const test_delegate_;

  // The stack base address corresponding to |thread_handle_|.
  const void* const thread_stack_base_address_;

  // Buffer to use for copies of the stack. We use the same buffer for all the
  // samples to avoid the overhead of multiple allocations and frees.
  const std::unique_ptr<unsigned char[]> stack_copy_buffer_;

  // Weak. Points to the modules associated with the profile being recorded
  // between ProfileRecordingStarting() and ProfileRecordingStopped().
  std::vector<StackSamplingProfiler::Module>* current_modules_ = nullptr;

  // Maps a module's base address to the corresponding Module's index within
  // current_modules_.
  std::map<const void*, size_t> profile_module_index_;

  DISALLOW_COPY_AND_ASSIGN(NativeStackSamplerMac);
};

NativeStackSamplerMac::NativeStackSamplerMac(
    mach_port_t thread_port,
    AnnotateCallback annotator,
    NativeStackSamplerTestDelegate* test_delegate)
    : thread_port_(thread_port),
      annotator_(annotator),
      test_delegate_(test_delegate),
      thread_stack_base_address_(
          pthread_get_stackaddr_np(pthread_from_mach_thread_np(thread_port))),
      stack_copy_buffer_(new unsigned char[kStackCopyBufferSize]) {
  DCHECK(annotator_);

  // This class suspends threads, and those threads might be suspended in dyld.
  // Therefore, for all the system functions that might be linked in dynamically
  // that are used while threads are suspended, make calls to them to make sure
  // that they are linked up.
  x86_thread_state64_t thread_state;
  GetThreadState(thread_port_, &thread_state);
}

NativeStackSamplerMac::~NativeStackSamplerMac() {}

void NativeStackSamplerMac::ProfileRecordingStarting(
    std::vector<StackSamplingProfiler::Module>* modules) {
  current_modules_ = modules;
  profile_module_index_.clear();
}

void NativeStackSamplerMac::RecordStackSample(
    StackSamplingProfiler::Sample* sample) {
  DCHECK(current_modules_);

  SuspendThreadAndRecordStack(sample);
}

void NativeStackSamplerMac::ProfileRecordingStopped() {
  current_modules_ = nullptr;
}

void NativeStackSamplerMac::SuspendThreadAndRecordStack(
    StackSamplingProfiler::Sample* sample) {
  x86_thread_state64_t thread_state;

  // Copy the stack.

  uintptr_t new_stack_top = 0;
  {
    // IMPORTANT NOTE: Do not do ANYTHING in this scope that might allocate
    // memory, including indirectly via use of DCHECK/CHECK or other logging
    // statements. Otherwise this code can deadlock on heap locks in the
    // default heap acquired by the target thread before it was suspended.
    ScopedSuspendThread suspend_thread(thread_port_);
    if (!suspend_thread.was_successful())
      return;

    if (!GetThreadState(thread_port_, &thread_state))
      return;
    uintptr_t stack_top =
        reinterpret_cast<uintptr_t>(thread_stack_base_address_);
    uintptr_t stack_bottom = thread_state.__rsp;
    if (stack_bottom >= stack_top)
      return;
    uintptr_t stack_size = stack_top - stack_bottom;

    if (stack_size > kStackCopyBufferSize)
      return;

    (*annotator_)(sample);

    CopyStackAndRewritePointers(
        reinterpret_cast<uintptr_t*>(stack_copy_buffer_.get()),
        reinterpret_cast<uintptr_t*>(stack_bottom),
        reinterpret_cast<uintptr_t*>(stack_top), &thread_state);

    new_stack_top =
        reinterpret_cast<uintptr_t>(stack_copy_buffer_.get()) + stack_size;
  }  // ScopedSuspendThread

  if (test_delegate_)
    test_delegate_->OnPreStackWalk();

  // Walk the stack and record it.

  // Reserve enough memory for most stacks, to avoid repeated allocations.
  // Approximately 99.9% of recorded stacks are 128 frames or fewer.
  sample->frames.reserve(128);

  auto* current_modules = current_modules_;
  auto* profile_module_index = &profile_module_index_;
  WalkStack(
      thread_state, new_stack_top,
      [sample, current_modules, profile_module_index](uintptr_t frame_ip) {
        sample->frames.push_back(StackSamplingProfiler::Frame(
            frame_ip,
            GetModuleIndex(frame_ip, current_modules, profile_module_index)));
      });
}

}  // namespace

std::unique_ptr<NativeStackSampler> NativeStackSampler::Create(
    PlatformThreadId thread_id,
    AnnotateCallback annotator,
    NativeStackSamplerTestDelegate* test_delegate) {
  return base::MakeUnique<NativeStackSamplerMac>(thread_id, annotator,
                                                 test_delegate);
}

}  // namespace base