Chromium Code Reviews

Issue 323893002: [Android] Introduce libheap_profiler for memory profiling.  (Closed)
Base URL: https://chromium.googlesource.com/chromium/src.git@master

| Index: tools/android/heap_profiler/heap_profiler.c | 
| diff --git a/tools/android/heap_profiler/heap_profiler.c b/tools/android/heap_profiler/heap_profiler.c | 
| new file mode 100644 | 
| index 0000000000000000000000000000000000000000..1d60a687321be34524415fc5713182da16137089 | 
| --- /dev/null | 
| +++ b/tools/android/heap_profiler/heap_profiler.c | 
| @@ -0,0 +1,393 @@ | 
| +// Copyright 2014 The Chromium Authors. All rights reserved. | 
| +// Use of this source code is governed by a BSD-style license that can be | 
| +// found in the LICENSE file. | 
| + | 
| +// This is an OS-independent* module whose purpose is to track allocations and | 
| +// their call sites (stack traces). It is able to deal with hole punching | 
| +// (read: munmap). Also, it is very fast and its presence in the system is | 
| 
pasko
2014/06/23 20:09:23
nit (sorry, I'm picky in inappropriate places quit
 
Primiano Tucci (use gerrit)
2014/06/24 08:43:32
Done.
 | 
| +// barely noticeable, even if tracing *all* the processes. | 
| +// This module does NOT know how to deal with stack unwinding. The caller must | 
| +// do that and pass the addresses of the unwound stack. | 
| +// * (Modulo three lines for mutexes.) | 
| +// | 
| +// Exposed API: | 
| +// void heap_profiler_init(HeapStats*); | 
| +// void heap_profiler_alloc(addr, size, stack_frames, depth, flags); | 
| +// void heap_profiler_free(addr, size, old_flags); (size == 0 means free the entire region). | 
| +// | 
| +// The profiling information is tracked in two data structures: | 
| +// 1) An RB-Tree of non-overlapping virtual memory regions (VMAs) sorted by their | 
| 
pasko
2014/06/23 20:09:23
nit: s/non overlapping/non-overlapping/
 
Primiano Tucci (use gerrit)
2014/06/24 08:43:33
Done.
 | 
| +// start addr. Each entry tracks the start-end addresses and points to the | 
| +// stack trace which created that allocation (see below). | 
| +// 2) A (hash) table of stack traces. In general the #allocations >> #call sites | 
| +// which create those allocations. In order to avoid duplicating the latter, | 
| +// they are stored distinctly in this hash table and used by reference. | 
| +// | 
| +// / Process virtual address space \ | 
| +// +------+ +------+ +------+ | 
| +// | VMA1 | | VMA2 | | VMA3 | <- VMAs (an RB-Tree underneath) | 
| +// +------+ +------+ +------+ | 
| +// Len: 12 Len: 4 Len: 4 | 
| +// | | | stack_traces | 
| +// | | | +-----------+--------------+ | 
| +// | | | | Alloc tot | stack frames + | 
| +// | | | +-----------+--------------+ | 
| +// +------------|-------------+------------> | 16 | 0x1234 .... | | 
| +// | +-----------+--------------+ | 
| +// +--------------------------> | 4 | 0x5678 .... | | 
| +// +-----------+--------------+ | 
| +// (A hash-table underneath) | 
| +// | 
| +// Final note: the memory for both 1) and 2) entries is carved out from two | 
| +// static pools (i.e. stack_traces and vmas). The pools are treated essentially | 
| +// as an sbrk, and are kept compact by reusing freed elements (hence | 
| +// having a freelist for each of them). | 
| +// | 
| +// All the internal (static) functions here assume that the |lock| is held. | 
| + | 
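For reference, a minimal sketch of how a caller might drive the API described above. This is illustrative only and not part of the patch; it assumes the HeapStats fields and the function signatures declared in tools/android/heap_profiler/heap_profiler.h in this CL, and fakes the unwound stack frames since unwinding is the caller's job. Addresses are only tracked, never dereferenced.

```c
#include <assert.h>
#include <stdint.h>

#include "tools/android/heap_profiler/heap_profiler.h"

int main(void) {
  static HeapStats stats;
  heap_profiler_init(&stats);

  // The caller is responsible for unwinding; these frames are just made up.
  uintptr_t frames[] = {0x1234, 0x5678};

  // Two allocations from the same (fake) call site share one StacktraceEntry.
  heap_profiler_alloc((void*) 0x1000, 4096, frames, 2, 0);
  heap_profiler_alloc((void*) 0x9000, 4096, frames, 2, 0);
  assert(stats.num_allocs == 2);
  assert(stats.num_stack_traces == 1);
  assert(stats.total_alloc_bytes == 8192);

  // size == 0 frees the whole region starting exactly at the given address.
  heap_profiler_free((void*) 0x1000, 0, NULL);
  heap_profiler_free((void*) 0x9000, 0, NULL);
  assert(stats.num_allocs == 0);
  assert(stats.total_alloc_bytes == 0);

  heap_profiler_cleanup();
  return 0;
}
```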
| +#include <assert.h> | 
| +#include <string.h> | 
| + | 
| +// Platform-dependent mutex boilerplate. | 
| +#if defined(__linux__) || defined(__ANDROID__) | 
| +#include <pthread.h> | 
| +#define DEFINE_MUTEX(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER | 
| +#define LOCK_MUTEX(x) pthread_mutex_lock(&x) | 
| +#define UNLOCK_MUTEX(x) pthread_mutex_unlock(&x) | 
| +#else | 
| +#error OS not supported. | 
| +#endif | 
| + | 
| +#include "tools/android/heap_profiler/heap_profiler.h" | 
| + | 
| + | 
| +static DEFINE_MUTEX(lock); | 
| + | 
| +// |stats| contains the global tracking metadata and is the entry point which | 
| +// is read by the heap_dump tool. | 
| +static HeapStats* stats; | 
| + | 
| +// +---------------------------------------------------------------------------+ | 
| +// + Stack traces hash-table + | 
| +// +---------------------------------------------------------------------------+ | 
| +#define ST_ENTRIES_MAX (64 * 1024) | 
| +#define ST_HASHTABLE_BUCKETS (64 * 1024) /* Must be a power of 2. */ | 
| + | 
| +static StacktraceEntry stack_traces[ST_ENTRIES_MAX]; | 
| +static StacktraceEntry* stack_traces_freelist; | 
| +static StacktraceEntry* stack_traces_ht[ST_HASHTABLE_BUCKETS]; | 
| + | 
| +// Looks up a stack trace from the stack frames. Creates a new one if necessary. | 
| +static StacktraceEntry* record_stacktrace(uintptr_t* frames, uint32_t depth) { | 
| + if (depth == 0) | 
| + return NULL; | 
| + | 
| + if (depth > HEAP_PROFILER_MAX_DEPTH) | 
| + depth = HEAP_PROFILER_MAX_DEPTH; | 
| + | 
| + uint32_t i; | 
| + uintptr_t hash = 0; | 
| + for (i = 0; i < depth; ++i) | 
| + hash = (hash << 1) ^ (frames[i]); | 
| + const uint32_t slot = hash & (ST_HASHTABLE_BUCKETS - 1); | 
| + StacktraceEntry* st = stack_traces_ht[slot]; | 
| + | 
| + // Look for an existing entry in the hash-table. | 
| + const size_t frames_length = depth * sizeof(uintptr_t); | 
| + while (st != NULL && st->hash != hash && | 
| + memcmp(frames, st->frames, frames_length) != 0) { | 
| + st = st->next; | 
| + } | 
| + | 
| + // If none is found, create a new one from the stack_traces array and add it | 
| 
pasko
2014/06/23 20:09:23
None is found
 
Primiano Tucci (use gerrit)
2014/06/24 08:43:32
Actually that was an "If"
 | 
| + // to the hash-table. | 
| + if (st == NULL) { | 
| + // Get a free element either from the freelist or from the pool. | 
| + if (stack_traces_freelist != NULL) { | 
| + st = stack_traces_freelist; | 
| + stack_traces_freelist = stack_traces_freelist->next; | 
| + } else if (stats->max_stack_traces < ST_ENTRIES_MAX) { | 
| + st = &stack_traces[stats->max_stack_traces]; | 
| + ++stats->max_stack_traces; | 
| + } else { | 
| + return NULL; | 
| + } | 
| + | 
| + memset(st, 0, sizeof(*st)); | 
| + memcpy(st->frames, frames, frames_length); | 
| + st->hash = hash; | 
| + st->next = stack_traces_ht[slot]; | 
| + stack_traces_ht[slot] = st; | 
| + ++stats->num_stack_traces; | 
| + } | 
| + | 
| + return st; | 
| +} | 
| + | 
| +// Frees up a stack trace and appends it to the corresponding freelist. | 
| +static void free_stacktrace(StacktraceEntry* st) { | 
| + assert(st->alloc_bytes == 0); | 
| + const uint32_t slot = st->hash & (ST_HASHTABLE_BUCKETS - 1); | 
| + | 
| + // The expected load factor of the hash-table is very low. Frees should be | 
| +// pretty rare. Hence we don't bother with a doubly-linked list, which might cost more. | 
| + StacktraceEntry** prev = &stack_traces_ht[slot]; | 
| + while (*prev != st) | 
| + prev = &((*prev)->next); | 
| + | 
| + // Remove from the hash-table bucket. | 
| + assert(*prev == st); | 
| + *prev = st->next; | 
| + | 
| + // Add to the freelist. | 
| + st->next = stack_traces_freelist; | 
| + stack_traces_freelist = st; | 
| + --stats->num_stack_traces; | 
| +} | 
| + | 
| +// +---------------------------------------------------------------------------+ | 
| +// + VMAs RB-tree + | 
| +// +---------------------------------------------------------------------------+ | 
| +#define VMA_ENTRIES_MAX (256 * 1024) | 
| + | 
| +static VMA vmas[VMA_ENTRIES_MAX]; | 
| +static VMA* vmas_freelist; | 
| +static RB_HEAD(HeapEntriesTree, VMA) vmas_tree = RB_INITIALIZER(&vmas_tree); | 
| + | 
| +// Comparator used by the RB-Tree (mind the overflow, avoid arith on addresses). | 
| +static int vmas_tree_cmp(VMA *e1, VMA *e2) { | 
| + if (e1->start < e2->start) | 
| + return -1; | 
| + if (e1->start > e2->start) | 
| + return 1; | 
| + return 0; | 
| +} | 
| + | 
| +RB_PROTOTYPE(HeapEntriesTree, VMA, rb_node, vmas_tree_cmp); | 
| +RB_GENERATE(HeapEntriesTree, VMA, rb_node, vmas_tree_cmp); | 
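The "mind the overflow" note above refers to the tempting shortcut of returning a pointer subtraction from the comparator. A sketch of why that would be wrong (illustrative only; assumes the VMA type from heap_profiler.h):

```c
#include <stdint.h>

#include "tools/android/heap_profiler/heap_profiler.h"

// BROKEN alternative, shown only to illustrate the comment above: subtracting
// two uintptr_t values and truncating to int can flip the sign. E.g. in a
// 32-bit process, e1->start = 0x10 and e2->start = 0x90000000 give
// 0x10 - 0x90000000 == 0x70000010 (mod 2^32), a positive int, so e1 would
// wrongly compare as greater than e2.
static int broken_vmas_tree_cmp(VMA* e1, VMA* e2) {
  return (int)(e1->start - e2->start);
}
```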
| + | 
| +// Allocates a new VMA and inserts it in the tree. | 
| +static VMA* insert_vma( | 
| + uintptr_t start, uintptr_t end, StacktraceEntry* st, uint32_t flags) { | 
| + VMA* vma = NULL; | 
| + | 
| + // First of all, get a free element either from the freelist or from the pool. | 
| + if (vmas_freelist != NULL) { | 
| + vma = vmas_freelist; | 
| + vmas_freelist = vma->next_free; | 
| + } else if (stats->max_allocs < VMA_ENTRIES_MAX) { | 
| + vma = &vmas[stats->max_allocs]; | 
| + ++stats->max_allocs; | 
| + } else { | 
| + return NULL; // OOM. | 
| + } | 
| + | 
| + vma->start = start; | 
| + vma->end = end; | 
| + vma->st = st; | 
| + vma->flags = flags; | 
| + vma->next_free = NULL; | 
| + RB_INSERT(HeapEntriesTree, &vmas_tree, vma); | 
| + ++stats->num_allocs; | 
| + return vma; | 
| +} | 
| + | 
| +// Deletes all the vmas in the range [addr, addr+size[, dealing with partial | 
| +// frees and hole punching. Note that in the general case this function might | 
| +// need to deal with very unfortunate cases, as below: | 
| +// | 
| +// VMA tree @ begin: [ VMA 1 ]----[ VMA 2 ]-------[ VMA 3 ][ VMA 4 ]---[ VMA 5 ] | 
| +// Deletion range: [xxxxxxxxxxxxxxxxxxxx] | 
| +// VMA tree @ end: [ VMA 1 ]----[VMA2]----------------------[VMA4]---[ VMA 5 ] | 
| +// VMA 3 has to be deleted and VMAs 2 and 4 shrunk. | 
| +static uint32_t delete_vmas_in_range(void* addr, size_t size) { | 
| + uintptr_t del_start = (uintptr_t) addr; | 
| + uintptr_t del_end = del_start + size - 1; | 
| + uint32_t flags = 0; | 
| + | 
| + VMA* vma = NULL; | 
| + VMA* next_vma = RB_ROOT(&vmas_tree); | 
| + | 
| + // Look up the first (by address) relevant VMA to initiate the deletion walk. | 
| + // At the end of the loop next_vma is either: | 
| + // - the closest VMA starting before (or exactly at) the start of the deletion | 
| + // range (i.e. addr == del_start). | 
| + // - the first VMA inside the deletion range. | 
| + // - the first VMA after the deletion range iff the range was already empty | 
| + // (in this case the next loop will just bail out doing nothing). | 
| + // - NULL: iff the entire tree is empty (as above). | 
| + while (next_vma != NULL) { | 
| + vma = next_vma; | 
| + if (vma->start > del_start) { | 
| + next_vma = RB_LEFT(vma, rb_node); | 
| + } else if (vma->end < del_start) { | 
| + next_vma = RB_RIGHT(vma, rb_node); | 
| + } else { // vma->start <= del_start && vma->end >= del_start | 
| + break; | 
| + } | 
| + } | 
| + | 
| + // Now scan the VMAs linearly, deleting chunks (or possibly entire VMAs) | 
| + // until we pass the end of the deletion range. | 
| + next_vma = vma; | 
| + while (next_vma != NULL) { | 
| + vma = next_vma; | 
| + next_vma = RB_NEXT(HeapEntriesTree, &vmas_tree, vma); | 
| + | 
| + if (size != 0) { | 
| + // In the general case we stop past the end of the deletion range. | 
| + if (vma->start > del_end) | 
| + break; | 
| + | 
| + // This deals with the case of the first VMA lying before the range. | 
| + if (vma->end < del_start) | 
| + continue; | 
| + } else { | 
| + // size == 0 is a special case. It means deleting only the vma which | 
| + // starts exactly at |del_start|, if any (for dealing with free(ptr)). | 
| 
pasko
2014/06/23 20:09:23
nit:
s/@/at/ please
 
Primiano Tucci (use gerrit)
2014/06/24 08:43:33
Done.
 | 
| + if (vma->start > del_start) | 
| + break; | 
| + if (vma->start < del_start) | 
| + continue; | 
| + del_end = vma->end; | 
| + } | 
| + | 
| + // At this point the VMA must overlap (partially or completely) with | 
| + // the deletion range. | 
| + assert(!(vma->start > del_end || vma->end < del_start)); | 
| + | 
| + StacktraceEntry* st = vma->st; | 
| + flags |= vma->flags; | 
| + uintptr_t freed_bytes = 0; // Bytes freed in this iteration. | 
| + | 
| + if (del_start <= vma->start) { | 
| + if (del_end >= vma->end) { | 
| + // Complete overlap. Delete the full VMA. Note: the range might still | 
| + // overlap with the next vmas. | 
| + // Begin: ------[vma.start vma.end]-[next vma] | 
| + // Del range: [xxxxxxxxxxxxxxxxxxxxxxxxxxxx] | 
| + // Result: -----------------------------[next vma] | 
| + // Note: at the next iteration we'll deal with the next vma. | 
| + freed_bytes = vma->end - vma->start + 1; | 
| + RB_REMOVE(HeapEntriesTree, &vmas_tree, vma); | 
| + | 
| + // Clean-up, so heap_dump can tell this is a free entry and skip it. | 
| + vma->start = vma->end = 0; | 
| + vma->st = NULL; | 
| + | 
| + // Put in the freelist. | 
| + vma->next_free = vmas_freelist; | 
| + vmas_freelist = vma; | 
| + --stats->num_allocs; | 
| + } else { | 
| + // Partial overlap at beginning. Cut first part and shrink the vma. | 
| + // Begin: ------[vma.start vma.end]-[next vma] | 
| + // Del range: [xxxxxx] | 
| + // Result: ------------[start vma.end]-[next vma] | 
| + freed_bytes = del_end - vma->start + 1; | 
| + vma->start = del_end + 1; | 
| + // No need to update the tree even if we changed the key. The keys are | 
| + // still monotonic (because the ranges are guaranteed to not overlap). | 
| + } | 
| + } else { | 
| + if (del_end >= vma->end) { | 
| + // Partial overlap at end. Cut the last part and shrink the vma from the right. | 
| + // Begin: ------[vma.start vma.end]-[next vma] | 
| + // Del range: [xxxxxx] | 
| + // Result: ------[vma.start vma.end]-[next vma] | 
| + // Note: at the next iteration we'll deal with the next vma. | 
| + freed_bytes = vma->end - del_start + 1; | 
| + vma->end = del_start - 1; | 
| + } else { | 
| + // Hole punching. Requires creating an extra vma. | 
| + // Begin: ------[vma.start vma.end]-[next vma] | 
| + // Del range: [xxx] | 
| + // Result: ------[ vma 1 ]-----[ vma 2 ]-[next vma] | 
| + freed_bytes = del_end - del_start + 1; | 
| + const uintptr_t old_end = vma->end; | 
| + vma->end = del_start - 1; | 
| + insert_vma(del_end + 1, old_end, st, vma->flags); | 
| 
pasko
2014/06/23 20:09:23
we may OOM here, the worst consequence seems to be
 
Primiano Tucci (use gerrit)
2014/06/24 08:43:33
Hmm which assert? This shouldn't hit asserts in ca
 
pasko
2014/06/24 13:21:45
I thought it would assert on the line 257 when we
 
Primiano Tucci (use gerrit)
2014/06/24 15:10:56
Hmm no, as a general principle we never *barf* if
 | 
| + } | 
| + } | 
| + // Now update the StacktraceEntry the VMA was pointing to, possibly | 
| + // freeing it up. | 
| + assert(st->alloc_bytes >= freed_bytes); | 
| + st->alloc_bytes -= freed_bytes; | 
| + if (st->alloc_bytes == 0) | 
| + free_stacktrace(st); | 
| + stats->total_alloc_bytes -= freed_bytes; | 
| + } | 
| + return flags; | 
| +} | 
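A sketch of the hole-punching case handled above, exercised through the public entry points (illustrative only; assumes the HeapStats fields from heap_profiler.h): freeing the middle of a tracked region splits its VMA in two and decrements the byte counters accordingly.

```c
#include <assert.h>
#include <stdint.h>

#include "tools/android/heap_profiler/heap_profiler.h"

int main(void) {
  static HeapStats stats;
  heap_profiler_init(&stats);

  uintptr_t frames[] = {0xbadcafe};
  heap_profiler_alloc((void*) 0x10000, 4096, frames, 1, 0);

  // Punch a 1024-byte hole in the middle: [0x10400, 0x107FF].
  heap_profiler_free((void*) 0x10400, 1024, NULL);

  // The original VMA is split into [0x10000, 0x103FF] and [0x10800, 0x10FFF].
  assert(stats.num_allocs == 2);
  assert(stats.total_alloc_bytes == 3072);
  assert(stats.num_stack_traces == 1);  // Both halves share the stack trace.

  heap_profiler_cleanup();
  return 0;
}
```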
| + | 
| +// +---------------------------------------------------------------------------+ | 
| +// + Library entry points (refer to heap_profiler.h for API doc). + | 
| +// +---------------------------------------------------------------------------+ | 
| +void heap_profiler_free(void* addr, size_t size, uint32_t* old_flags) { | 
| + assert(size == 0 || ((uintptr_t) addr + (size - 1)) >= (uintptr_t) addr); | 
| + | 
| + LOCK_MUTEX(lock); | 
| + uint32_t flags = delete_vmas_in_range(addr, size); | 
| + UNLOCK_MUTEX(lock); | 
| + | 
| + if (old_flags != NULL) | 
| + *old_flags = flags; | 
| +} | 
| + | 
| +void heap_profiler_alloc(void* addr, size_t size, uintptr_t* frames, | 
| + uint32_t depth, uint32_t flags) { | 
| + if (depth > HEAP_PROFILER_MAX_DEPTH) | 
| + depth = HEAP_PROFILER_MAX_DEPTH; | 
| + | 
| + if (size == 0) // Some apps call malloc(0); it happens. | 
| + return; | 
| + | 
| + const uintptr_t start = (uintptr_t) addr; | 
| + const uintptr_t end = start + (size - 1); | 
| + assert(start <= end); | 
| + | 
| + LOCK_MUTEX(lock); | 
| + | 
| + delete_vmas_in_range(addr, size); | 
| 
pasko
2014/06/23 20:09:23
ugh, having to remove ranges is counter-intuitive
 
Primiano Tucci (use gerrit)
2014/06/24 08:43:33
This is to be paranoid in the case we miss some fr
 
pasko
2014/06/24 13:21:45
I would argue that it can be actionable. If the me
 
Primiano Tucci (use gerrit)
2014/06/24 15:10:56
Really? You stop using a device just because you k
 | 
| + | 
| + StacktraceEntry* st = record_stacktrace(frames, depth); | 
| + if (st != NULL) { | 
| + VMA* vma = insert_vma(start, end, st, flags); | 
| + if (vma != NULL) { | 
| + st->alloc_bytes += size; | 
| + stats->total_alloc_bytes += size; | 
| + } | 
| + } | 
| + | 
| + UNLOCK_MUTEX(lock); | 
| +} | 
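Regarding the delete_vmas_in_range() call at the top of heap_profiler_alloc() discussed above: a sketch of the behaviour it provides (illustrative only; assumes the HeapStats fields from heap_profiler.h). If a free is missed, e.g. an uninstrumented munmap, and the same range is allocated again, the stale VMA is dropped first, so the counters do not double-count.

```c
#include <assert.h>
#include <stdint.h>

#include "tools/android/heap_profiler/heap_profiler.h"

int main(void) {
  static HeapStats stats;
  heap_profiler_init(&stats);

  uintptr_t frames[] = {0xdeadbeef};
  heap_profiler_alloc((void*) 0x1000, 4096, frames, 1, 0);

  // Same range allocated again without an intervening heap_profiler_free()
  // (i.e. a free we failed to intercept): the stale VMA is wiped first.
  heap_profiler_alloc((void*) 0x1000, 4096, frames, 1, 0);
  assert(stats.num_allocs == 1);
  assert(stats.total_alloc_bytes == 4096);  // Not 8192.

  heap_profiler_cleanup();
  return 0;
}
```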
| + | 
| +void heap_profiler_init(HeapStats* heap_stats) { | 
| + LOCK_MUTEX(lock); | 
| + | 
| + assert(stats == NULL); | 
| + stats = heap_stats; | 
| + memset(stats, 0, sizeof(HeapStats)); | 
| 
pasko
2014/06/23 20:09:23
isn't this memory mapped from /dev/zero?
 
Primiano Tucci (use gerrit)
2014/06/24 08:43:33
This is essentially for the tests, which init/clea
 | 
| + stats->magic_start = HEAP_PROFILER_MAGIC_MARKER; | 
| + stats->allocs = &vmas[0]; | 
| + stats->stack_traces = &stack_traces[0]; | 
| + | 
| + UNLOCK_MUTEX(lock); | 
| +} | 
| + | 
| +void heap_profiler_cleanup(void) { | 
| + LOCK_MUTEX(lock); | 
| + | 
| + assert(stats != NULL); | 
| + memset(stack_traces, 0, sizeof(StacktraceEntry) * stats->max_stack_traces); | 
| + memset(stack_traces_ht, 0, sizeof(stack_traces_ht)); | 
| + stack_traces_freelist = NULL; | 
| + | 
| + memset(vmas, 0, sizeof(VMA) * stats->max_allocs); | 
| + vmas_freelist = NULL; | 
| + RB_INIT(&vmas_tree); | 
| + | 
| + stats = NULL; | 
| + | 
| + UNLOCK_MUTEX(lock); | 
| +} |