/**************************************************************************/
/*                                                                        */
/*                                 OCaml                                  */
/*                                                                        */
/*          Jacques-Henri Jourdan, projet Gallium, INRIA Paris            */
/*                                                                        */
/*   Copyright 2016 Institut National de Recherche en Informatique et     */
/*     en Automatique.                                                    */
/*                                                                        */
/*   All rights reserved. This file is distributed under the terms of     */
/*   the GNU Lesser General Public License version 2.1, with the          */
/*   special exception on linking described in the file LICENSE.          */
/*                                                                        */
/**************************************************************************/

#define CAML_INTERNALS

#include <string.h>
#include "caml/memprof.h"
#include "caml/fail.h"
#include "caml/alloc.h"
#include "caml/callback.h"
#include "caml/signals.h"
#include "caml/memory.h"
#include "caml/minor_gc.h"
#include "caml/backtrace_prim.h"
#include "caml/weak.h"
#include "caml/stack.h"
#include "caml/misc.h"
#include "caml/compact.h"
#include "caml/printexc.h"
#include "caml/eventlog.h"

#define RAND_BLOCK_SIZE 64

static uint32_t xoshiro_state[4][RAND_BLOCK_SIZE];
static uintnat rand_geom_buff[RAND_BLOCK_SIZE];
static uint32_t rand_pos;

/* [lambda] is the mean number of samples for each allocated word
   (including block headers). */
static double lambda = 0;

/* Precomputed value of [1/log(1-lambda)], for fast sampling of
   the geometric distribution. Dummy if [lambda = 0]. */
static float one_log1m_lambda;

/* [caml_memprof_suspended] is used for masking memprof callbacks
   when a callback is running or when an uncaught exception handler
   is called. */
int caml_memprof_suspended = 0;

/* [callback_running] is used to trigger a fatal error whenever
   [Thread.exit] is called from a callback. */
static int callback_running = 0;

static intnat callstack_size;

/* Accessors for the OCaml type [Gc.Memprof.tracker], which is the
   type of the [tracker] global below. */
#define Alloc_minor(tracker) (Field(tracker, 0))
#define Alloc_major(tracker) (Field(tracker, 1))
#define Promote(tracker) (Field(tracker, 2))
#define Dealloc_minor(tracker) (Field(tracker, 3))
#define Dealloc_major(tracker) (Field(tracker, 4))

static value tracker;

/* Pointer to the word following the next sample in the minor
   heap. Equals [Caml_state->young_alloc_start] if no sampling is
   planned in the current minor heap.
   Invariant: [caml_memprof_young_trigger <= Caml_state->young_ptr]. */
value* caml_memprof_young_trigger;

/* Whether memprof has been initialized. */
static int init = 0;

/* Whether memprof is started. */
static int started = 0;
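/* Illustration (not compiled): the accessor macros above assume that
   [Gc.Memprof.tracker] is a record whose fields are declared in the
   following order, so that [Field(tracker, 0)] is [alloc_minor],
   [Field(tracker, 1)] is [alloc_major], and so on. A sketch of the
   corresponding OCaml declaration (see [gc.mli]; the exact type of
   [allocation] may differ slightly between OCaml versions):

     type ('minor, 'major) tracker = {
       alloc_minor : allocation -> 'minor option;
       alloc_major : allocation -> 'major option;
       promote : 'minor -> 'major option;
       dealloc_minor : 'minor -> unit;
       dealloc_major : 'major -> unit;
     }
*/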
/* Buffer used to compute backtraces. */
static value* callstack_buffer = NULL;
static intnat callstack_buffer_len = 0;

/**** Statistical sampling ****/

Caml_inline uint64_t splitmix64_next(uint64_t* x)
{
  uint64_t z = (*x += 0x9E3779B97F4A7C15ull);
  z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
  z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
  return z ^ (z >> 31);
}

static void xoshiro_init(void)
{
  int i;
  uint64_t splitmix64_state = 42;
  rand_pos = RAND_BLOCK_SIZE;
  for (i = 0; i < RAND_BLOCK_SIZE; i++) {
    uint64_t t = splitmix64_next(&splitmix64_state);
    xoshiro_state[0][i] = t & 0xFFFFFFFF;
    xoshiro_state[1][i] = t >> 32;
    t = splitmix64_next(&splitmix64_state);
    xoshiro_state[2][i] = t & 0xFFFFFFFF;
    xoshiro_state[3][i] = t >> 32;
  }
}

Caml_inline uint32_t xoshiro_next(int i)
{
  uint32_t res = xoshiro_state[0][i] + xoshiro_state[3][i];
  uint32_t t = xoshiro_state[1][i] << 9;
  xoshiro_state[2][i] ^= xoshiro_state[0][i];
  xoshiro_state[3][i] ^= xoshiro_state[1][i];
  xoshiro_state[1][i] ^= xoshiro_state[2][i];
  xoshiro_state[0][i] ^= xoshiro_state[3][i];
  xoshiro_state[2][i] ^= t;
  t = xoshiro_state[3][i];
  xoshiro_state[3][i] = (t << 11) | (t >> 21);
  return res;
}

/* Computes [log((y+0.5)/2^32)], up to a relatively good precision,
   and guarantees that the result is negative. The average absolute
   error is very close to 0. */
Caml_inline float log_approx(uint32_t y)
{
  union { float f; int32_t i; } u;
  float exp, x;
  u.f = y + 0.5f;        /* We convert y to a float ... */
  exp = u.i >> 23;       /* ... of which we extract the exponent ... */
  u.i = (u.i & 0x7FFFFF) | 0x3F800000;
  x = u.f;               /* ... and the mantissa. */
  return
    /* This polynomial computes the logarithm of the mantissa (which
       is in [1, 2]), up to an additive constant. It is chosen such
       that:
        - Its degree is 3.
        - Its average value is that of log in [1, 2] (the sampling
          has the right mean when lambda is small).
        - f(1) = f(2) - log(2) = -159*log(2) - 1e-5 (this guarantees
          that log_approx(y) is always <= -1e-5 < 0).
        - The maximum of abs(f(x)-log(x)+159*log(2)) is minimized. */
    x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))

    /* Then, we add the term corresponding to the exponent, and the
       additive constants. */
    + (-111.701724334061f + 0.6931471805f*exp);
}
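/* Illustrative reference (not used by the runtime): the batched
   sampler below ([rand_batch]/[rand_geom]) is equivalent, lane by
   lane, to the following scalar draw of a geometric variable of
   parameter [lambda] by inversion of its CDF:
   geom = floor(1 + log(U) / log(1-lambda)), with U uniform in (0,1).
   The helper name [rand_geom_scalar_sketch] is ours and is not used
   anywhere else in the runtime. */
Caml_inline uintnat rand_geom_scalar_sketch(int i)
{
  double f;
  CAMLassert(lambda > 0.);
  /* [log_approx] plays the role of [log(U)], and [one_log1m_lambda]
     is the precomputed [1/log(1-lambda)]. */
  f = 1 + log_approx(xoshiro_next(i)) * one_log1m_lambda;
  /* Clip to [1..Max_long], as [rand_batch] does. */
  if (f > Max_long) return Max_long;
  return (uintnat)f;
}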
/* This function regenerates [RAND_BLOCK_SIZE] geometric random
   variables at once. Doing this in batches helps us gain performance:
   many compilers (e.g., GCC, Clang, ICC) are able to use SIMD
   instructions to get a performance boost. */
#ifdef SUPPORTS_TREE_VECTORIZE
__attribute__((optimize("tree-vectorize")))
#endif
static void rand_batch(void)
{
  int i;

  /* Instead of using temporary buffers, we could use one big loop,
     but it turns out the SIMD optimizations of compilers are more
     fragile when using larger loops. */
  static uint32_t A[RAND_BLOCK_SIZE];
  static float B[RAND_BLOCK_SIZE];

  CAMLassert(lambda > 0.);

  /* Shuffle the xoshiro samplers, and generate uniform variables in A. */
  for (i = 0; i < RAND_BLOCK_SIZE; i++)
    A[i] = xoshiro_next(i);

  /* Generate exponential random variables by computing logarithms.
     We do not use math.h library functions, which are slow and
     prevent the compiler from using SIMD instructions. */
  for (i = 0; i < RAND_BLOCK_SIZE; i++)
    B[i] = 1 + log_approx(A[i]) * one_log1m_lambda;

  /* We do the final flooring for generating geometric variables.
     Compilers are unlikely to use SIMD instructions for this loop,
     because it involves a conditional and variables of different
     sizes (32 and 64 bits). */
  for (i = 0; i < RAND_BLOCK_SIZE; i++) {
    double f = B[i];
    CAMLassert (f >= 1);
    if (f > Max_long) rand_geom_buff[i] = Max_long;
    else rand_geom_buff[i] = (uintnat)f;
  }

  rand_pos = 0;
}

/* Simulate a geometric variable of parameter [lambda]. The result is
   clipped in [1..Max_long]. */
static uintnat rand_geom(void)
{
  uintnat res;
  CAMLassert(lambda > 0.);
  if (rand_pos == RAND_BLOCK_SIZE) rand_batch();
  res = rand_geom_buff[rand_pos++];
  CAMLassert(1 <= res && res <= Max_long);
  return res;
}

static uintnat next_rand_geom;

/* Simulate a binomial variable of parameters [len] and [lambda].
   This sampling algorithm has running time linear in [len * lambda].
   We could use a more involved algorithm, but this should be good
   enough since, in the average use case, [lambda] <= 0.01 and
   therefore the generation of the binomial variable is amortized by
   the initialization of the corresponding block.
   If needed, we could use algorithm BTRS from the paper:
     Hormann, Wolfgang. "The generation of binomial random variates."
     Journal of Statistical Computation and Simulation 46.1-2 (1993),
     pp. 101-110. */
static uintnat rand_binom(uintnat len)
{
  uintnat res;
  CAMLassert(lambda > 0. && len < Max_long);
  for (res = 0; next_rand_geom < len; res++) next_rand_geom += rand_geom();
  next_rand_geom -= len;
  return res;
}

/**** Capturing the call stack *****/

/* This function is called in, e.g., [caml_alloc_shr], which
   guarantees that the GC is not called. Clients may use it in a
   context where the heap is in an invalid state, or when the roots
   are not properly registered. Therefore, we do not use
   [caml_alloc], which may call the GC, but prefer
   [caml_alloc_shr_no_track_noexc], which gives this guarantee.
   The return value is either a valid callstack or 0 in out-of-memory
   scenarios. */
static value capture_callstack_postponed(void)
{
  value res;
  intnat callstack_len =
    caml_collect_current_callstack(&callstack_buffer, &callstack_buffer_len,
                                   callstack_size, -1);
  if (callstack_len == 0) return Atom(0);
  res = caml_alloc_shr_no_track_noexc(callstack_len, 0);
  if (res == 0) return 0;
  memcpy(Op_val(res), callstack_buffer, sizeof(value) * callstack_len);
  if (callstack_buffer_len > 256 && callstack_buffer_len > callstack_len * 8) {
    caml_stat_free(callstack_buffer);
    callstack_buffer = NULL;
    callstack_buffer_len = 0;
  }
  return res;
}

/* In this version, we are allowed to call the GC, so we use
   [caml_alloc], which is more efficient since it uses the minor
   heap.
   Should be called with [caml_memprof_suspended == 1]. */
static value capture_callstack(int alloc_idx)
{
  value res;
  intnat callstack_len =
    caml_collect_current_callstack(&callstack_buffer, &callstack_buffer_len,
                                   callstack_size, alloc_idx);
  CAMLassert(caml_memprof_suspended);
  res = caml_alloc(callstack_len, 0);
  memcpy(Op_val(res), callstack_buffer, sizeof(value) * callstack_len);
  if (callstack_buffer_len > 256 && callstack_buffer_len > callstack_len * 8) {
    caml_stat_free(callstack_buffer);
    callstack_buffer = NULL;
    callstack_buffer_len = 0;
  }
  return res;
}
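/* Illustration (not compiled): the callstack captured above is
   exposed to OCaml code as the [callstack] field of
   [Gc.Memprof.allocation], a [Printexc.raw_backtrace]. A sketch of an
   allocation callback that renders it (the exact record fields may
   differ slightly between OCaml versions):

     let alloc_minor (a : Gc.Memprof.allocation) =
       Printf.eprintf "sampled %d word(s), %d sample(s):\n%s\n"
         a.size a.n_samples
         (Printexc.raw_backtrace_to_string a.callstack);
       None
*/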
/**** Data structures for tracked blocks. ****/

struct tracked {
  /* Memory block being sampled. This is a weak GC root. */
  value block;

  /* Number of samples in this block. */
  uintnat n_samples;

  /* The size of this block. */
  uintnat wosize;

  /* The value returned by the previous callback for this block, or
     the callstack if the alloc callback has not been called yet.
     This is a strong GC root. */
  value user_data;

  /* Whether this block has been initially allocated in the minor heap. */
  unsigned int alloc_young : 1;

  /* Whether this block comes from unmarshalling. */
  unsigned int unmarshalled : 1;

  /* Whether this block has been promoted. Implies [alloc_young]. */
  unsigned int promoted : 1;

  /* Whether this block has been deallocated. */
  unsigned int deallocated : 1;

  /* Whether the allocation callback has been called. */
  unsigned int cb_alloc_called : 1;

  /* Whether the promotion callback has been called. */
  unsigned int cb_promote_called : 1;

  /* Whether the deallocation callback has been called. */
  unsigned int cb_dealloc_called : 1;

  /* Whether this entry is deleted. */
  unsigned int deleted : 1;

  /* Whether a callback is currently running for this entry. */
  unsigned int callback_running : 1;

  /* Pointer to the [t_idx] variable in the [run_callback] frame which
     is currently running the callback for this entry. This is needed
     to make [run_callback] reentrant, in case it is called
     simultaneously by several threads. */
  uintnat* idx_ptr;
};

/* During the alloc callback for a minor allocation, the block being
   sampled is not yet allocated. Instead, we place in the [block]
   field a value computed with the following macros: */
#define Placeholder_magic 0x04200000
#define Placeholder_offs(offset) (Val_long(offset + Placeholder_magic))
#define Offs_placeholder(block) (Long_val(block) & 0xFFFF)
#define Is_placeholder(block) \
  (Is_long(block) && (Long_val(block) & ~(uintnat)0xFFFF) == Placeholder_magic)

/* When an entry is deleted, its index is replaced by that integer. */
#define Invalid_index (~(uintnat)0)
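/* Illustrative check (not used by the runtime): for any offset that
   fits in the low 16 bits, the encoding above round-trips, and the
   placeholder is distinguishable from a genuine heap pointer because
   it is an immediate. The helper name [placeholder_roundtrip_sketch]
   is ours. */
Caml_inline value placeholder_roundtrip_sketch(intnat offs)
{
  value v = Placeholder_offs(offs);
  CAMLassert(offs >= 0 && offs <= 0xFFFF); /* encoding assumes 16-bit offsets */
  CAMLassert(Is_long(v));                  /* an immediate, not a pointer */
  CAMLassert(Is_placeholder(v) && Offs_placeholder(v) == offs);
  return v;
}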
static struct tracking_state {
  struct tracked* entries;
  /* The allocated capacity of the [entries] array. */
  uintnat alloc_len;
  /* The number of active entries. (len <= alloc_len) */
  uintnat len;
  /* Before this position, the [block] and [user_data] fields point to
     the major heap (young <= len). */
  uintnat young;
  /* There are no pending callbacks before this position
     (callback <= len). */
  uintnat callback;
  /* There are no blocks to be deleted before this position. */
  uintnat delete;
} trackst;

#define MIN_TRACKST_ALLOC_LEN 128

/* Reallocate the [trackst] array if it is either too small or too
   large.
   Returns 1 if reallocation succeeded ([trackst.alloc_len] is at
   least [trackst.len]), and 0 otherwise. */
static int realloc_trackst(void)
{
  uintnat new_alloc_len;
  struct tracked* new_entries;
  if (trackst.len <= trackst.alloc_len &&
      (4*trackst.len >= trackst.alloc_len ||
       trackst.alloc_len == MIN_TRACKST_ALLOC_LEN))
    return 1;
  new_alloc_len = trackst.len * 2;
  if (new_alloc_len < MIN_TRACKST_ALLOC_LEN)
    new_alloc_len = MIN_TRACKST_ALLOC_LEN;
  new_entries = caml_stat_resize_noexc(trackst.entries,
                                       new_alloc_len * sizeof(struct tracked));
  if (new_entries == NULL) return 0;
  trackst.entries = new_entries;
  trackst.alloc_len = new_alloc_len;
  return 1;
}

Caml_inline uintnat new_tracked(uintnat n_samples, uintnat wosize,
                                int is_unmarshalled, int is_young,
                                value block, value user_data)
{
  struct tracked *t;
  trackst.len++;
  if (!realloc_trackst()) {
    trackst.len--;
    return Invalid_index;
  }
  t = &trackst.entries[trackst.len - 1];
  t->block = block;
  t->n_samples = n_samples;
  t->wosize = wosize;
  t->user_data = user_data;
  t->idx_ptr = NULL;
  t->alloc_young = is_young;
  t->unmarshalled = is_unmarshalled;
  t->promoted = 0;
  t->deallocated = 0;
  t->cb_alloc_called = t->cb_promote_called = t->cb_dealloc_called = 0;
  t->deleted = 0;
  t->callback_running = 0;
  return trackst.len - 1;
}

static void mark_deleted(uintnat t_idx)
{
  struct tracked* t = &trackst.entries[t_idx];
  t->deleted = 1;
  t->user_data = Val_unit;
  t->block = Val_unit;
  if (t_idx < trackst.delete) trackst.delete = t_idx;
  CAMLassert(t->idx_ptr == NULL);
}

/* The return value is an exception or [Val_unit] iff [*t_idx] is set
   to [Invalid_index]. In this case, the entry is deleted.
   Otherwise, the return value is a [Some(...)] block. */
Caml_inline value run_callback_exn(uintnat *t_idx, value cb, value param)
{
  struct tracked* t = &trackst.entries[*t_idx];
  value res;
  CAMLassert(!t->callback_running && t->idx_ptr == NULL);
  CAMLassert(lambda > 0.);

  callback_running = t->callback_running = 1;
  t->idx_ptr = t_idx;
  res = caml_callback_exn(cb, param);
  callback_running = 0;

  /* The call above can modify [*t_idx] and thus invalidate [t]. */
  if (*t_idx == Invalid_index) {
    /* Make sure this entry has not been removed by [caml_memprof_stop]. */
    return Val_unit;
  }
  t = &trackst.entries[*t_idx];
  t->idx_ptr = NULL;
  t->callback_running = 0;
  if (Is_exception_result(res) || res == Val_unit) {
    /* The callback raised an exception or returned [None] or [()];
       discard this entry. */
    mark_deleted(*t_idx);
    *t_idx = Invalid_index;
  }
  return res;
}
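/* Illustration (not compiled): [run_callback_exn] implements the
   [option] contract of the OCaml-level callbacks. A callback that
   returns [None] (or raises) stops tracking its block, while
   returning [Some v] threads [v] to the later promote and dealloc
   callbacks. A sketch, assuming the [Gc.Memprof] interface of recent
   OCaml versions:

     let tracker = {
       Gc.Memprof.null_tracker with
       alloc_minor =
         (fun a -> if a.n_samples > 1 then Some a.size else None);
       dealloc_minor =
         (fun size -> Printf.eprintf "freed a %d-word block\n" size);
     }
*/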
/* Run all the needed callbacks for a given entry.
   In case of a thread context switch during a callback, this can be
   called in a reentrant way.
   If [*t_idx] equals [trackst.callback], then this function
   increments [trackst.callback].
   The index of the entry may change. It is set to [Invalid_index] if
   the entry is discarded.
   Returns:
    - an exception result if a callback raised an exception;
    - Val_long(0) == Val_unit == None otherwise. */
static value handle_entry_callbacks_exn(uintnat* t_idx)
{
  value sample_info, res, user_data; /* No need to make these roots */
  struct tracked* t = &trackst.entries[*t_idx];
  if (*t_idx == trackst.callback) trackst.callback++;

  if (t->deleted || t->callback_running) return Val_unit;

  if (!t->cb_alloc_called) {
    t->cb_alloc_called = 1;
    CAMLassert(Is_block(t->block) || Is_placeholder(t->block)
               || t->deallocated);
    sample_info = caml_alloc_small(4, 0);
    Field(sample_info, 0) = Val_long(t->n_samples);
    Field(sample_info, 1) = Val_long(t->wosize);
    Field(sample_info, 2) = Val_long(t->unmarshalled);
    Field(sample_info, 3) = t->user_data;
    t->user_data = Val_unit;
    res = run_callback_exn(t_idx,
        t->alloc_young ? Alloc_minor(tracker) : Alloc_major(tracker),
        sample_info);
    if (*t_idx == Invalid_index) return res;
    CAMLassert(!Is_exception_result(res) && Is_block(res)
               && Tag_val(res) == 0 && Wosize_val(res) == 1);
    t = &trackst.entries[*t_idx];
    t->user_data = Field(res, 0);
    if (Is_block(t->user_data) && Is_young(t->user_data)
        && *t_idx < trackst.young)
      trackst.young = *t_idx;
  }

  if (t->promoted && !t->cb_promote_called) {
    t->cb_promote_called = 1;
    user_data = t->user_data;
    t->user_data = Val_unit;
    res = run_callback_exn(t_idx, Promote(tracker), user_data);
    if (*t_idx == Invalid_index) return res;
    CAMLassert(!Is_exception_result(res) && Is_block(res)
               && Tag_val(res) == 0 && Wosize_val(res) == 1);
    t = &trackst.entries[*t_idx];
    t->user_data = Field(res, 0);
    if (Is_block(t->user_data) && Is_young(t->user_data)
        && *t_idx < trackst.young)
      trackst.young = *t_idx;
  }

  if (t->deallocated && !t->cb_dealloc_called) {
    value cb = (t->promoted || !t->alloc_young) ?
      Dealloc_major(tracker) : Dealloc_minor(tracker);
    t->cb_dealloc_called = 1;
    user_data = t->user_data;
    t->user_data = Val_unit;
    res = run_callback_exn(t_idx, cb, user_data);
    /* [t] is invalid, but we no longer use it. */
    CAMLassert(*t_idx == Invalid_index);
    CAMLassert(Is_exception_result(res) || res == Val_unit);
    return res;
  }

  return Val_unit;
}

/* Remove any deleted entries, updating [callback] and [young]. */
static void flush_deleted(void)
{
  uintnat i = trackst.delete, j = i;
  while (i < trackst.len) {
    if (!trackst.entries[i].deleted) {
      if (trackst.entries[i].idx_ptr != NULL)
        *trackst.entries[i].idx_ptr = j;
      trackst.entries[j] = trackst.entries[i];
      j++;
    }
    i++;
    if (trackst.young == i) trackst.young = j;
    if (trackst.callback == i) trackst.callback = j;
  }
  trackst.delete = trackst.len = j;
  CAMLassert(trackst.callback <= trackst.len);
  CAMLassert(trackst.young <= trackst.len);
  realloc_trackst();
}

static void check_action_pending(void)
{
  if (!caml_memprof_suspended && trackst.callback < trackst.len)
    caml_set_action_pending();
}

/* In case of a thread context switch during a callback, this can be
   called in a reentrant way. */
value caml_memprof_handle_postponed_exn(void)
{
  value res = Val_unit;
  if (caml_memprof_suspended) return res;
  caml_memprof_suspended = 1;
  while (trackst.callback < trackst.len) {
    uintnat i = trackst.callback;
    res = handle_entry_callbacks_exn(&i);
    if (Is_exception_result(res)) break;
  }
  caml_memprof_suspended = 0;
  check_action_pending(); /* Needed in case of an exception. */
  flush_deleted();
  return res;
}

void caml_memprof_oldify_young_roots(void)
{
  uintnat i;
  /* This loop should always have a small number of iterations (when
     compared to the size of the minor heap), because the [young]
     pointer should always be close to the end of the array. Indeed,
     it is only moved back when returning from a callback triggered
     by an allocation or a promotion, which can only happen for
     blocks allocated recently, which are close to the end of the
     [trackst] array. */
  for (i = trackst.young; i < trackst.len; i++)
    caml_oldify_one(trackst.entries[i].user_data,
                    &trackst.entries[i].user_data);
}
void caml_memprof_minor_update(void)
{
  uintnat i;
  /* See the comment in [caml_memprof_oldify_young_roots] for the
     number of iterations of this loop. */
  for (i = trackst.young; i < trackst.len; i++) {
    struct tracked *t = &trackst.entries[i];
    CAMLassert(Is_block(t->block) || t->deleted || t->deallocated ||
               Is_placeholder(t->block));
    if (Is_block(t->block) && Is_young(t->block)) {
      if (Hd_val(t->block) == 0) {
        /* Block has been promoted. */
        t->block = Field(t->block, 0);
        t->promoted = 1;
      } else {
        /* Block is dead. */
        CAMLassert_young_header(Hd_val(t->block));
        t->block = Val_unit;
        t->deallocated = 1;
      }
    }
  }

  if (trackst.callback > trackst.young) {
    trackst.callback = trackst.young;
    check_action_pending();
  }
  trackst.young = trackst.len;
}

void caml_memprof_do_roots(scanning_action f)
{
  uintnat i;
  for (i = 0; i < trackst.len; i++)
    f(trackst.entries[i].user_data, &trackst.entries[i].user_data);
}

void caml_memprof_update_clean_phase(void)
{
  uintnat i;
  for (i = 0; i < trackst.len; i++) {
    struct tracked *t = &trackst.entries[i];
    if (Is_block(t->block) && !Is_young(t->block)) {
      CAMLassert(Is_in_heap(t->block));
      CAMLassert(!t->alloc_young || t->promoted);
      if (Is_white_val(t->block)) {
        t->block = Val_unit;
        t->deallocated = 1;
      }
    }
  }
  trackst.callback = 0;
  check_action_pending();
}

void caml_memprof_invert_tracked(void)
{
  uintnat i;
  for (i = 0; i < trackst.len; i++)
    caml_invert_root(trackst.entries[i].block, &trackst.entries[i].block);
}

/**** Sampling procedures ****/

void caml_memprof_track_alloc_shr(value block)
{
  uintnat n_samples;
  value callstack = 0;
  CAMLassert(Is_in_heap(block));

  /* This test also makes sure memprof is initialized. */
  if (lambda == 0 || caml_memprof_suspended) return;

  n_samples = rand_binom(Whsize_val(block));
  if (n_samples == 0) return;

  callstack = capture_callstack_postponed();
  if (callstack == 0) return;

  new_tracked(n_samples, Wosize_val(block), 0, 0, block, callstack);
  check_action_pending();
}

/* Shifts the next sample in the minor heap by [n] words. Essentially,
   this tells the sampler to ignore the next [n] words of the minor
   heap. */
static void shift_sample(uintnat n)
{
  if (caml_memprof_young_trigger - Caml_state->young_alloc_start > n)
    caml_memprof_young_trigger -= n;
  else
    caml_memprof_young_trigger = Caml_state->young_alloc_start;
  caml_update_young_limit();
}

/* Renew the next sample in the minor heap. This needs to be called
   after each minor sampling and after each minor collection. In
   practice, this is called at each sampling in the minor heap and at
   each minor collection. Extra calls do not change the statistical
   properties of the sampling because of the memorylessness of the
   geometric distribution. */
void caml_memprof_renew_minor_sample(void)
{
  if (lambda == 0) /* No trigger in the current minor heap. */
    caml_memprof_young_trigger = Caml_state->young_alloc_start;
  else {
    uintnat geom = rand_geom();
    if (Caml_state->young_ptr - Caml_state->young_alloc_start < geom)
      /* No trigger in the current minor heap. */
      caml_memprof_young_trigger = Caml_state->young_alloc_start;
    else
      caml_memprof_young_trigger = Caml_state->young_ptr - (geom - 1);
  }

  caml_update_young_limit();
}
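/* Illustrative helper (not used by the runtime): since every word has
   an independent probability [lambda] of being sampled, a block of
   [wosize] words plus its header receives a
   Binomial(Whsize_wosize(wosize), lambda) number of samples, whose
   mean is computed below. For example, with [lambda] = 1e-3, a
   99-word block is sampled 0.1 times on average. The helper name
   [mean_samples_sketch] is ours. */
Caml_inline double mean_samples_sketch(uintnat wosize)
{
  return lambda * (double)Whsize_wosize(wosize);
}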
/* Called when exceeding the threshold for the next sample in the
   minor heap, from the C code (the handling is different when called
   from natively compiled OCaml code). */
void caml_memprof_track_young(uintnat wosize, int from_caml,
                              int nallocs, unsigned char* encoded_alloc_lens)
{
  uintnat whsize = Whsize_wosize(wosize);
  value callstack, res = Val_unit;
  int alloc_idx = 0, i, allocs_sampled = 0, has_delete = 0;
  intnat alloc_ofs, trigger_ofs;
  /* Usually, only one allocation is sampled, even when the block
     contains multiple combined allocations. So we delay allocating
     the full [idx_tab] array until we discover that we actually need
     two entries. */
  uintnat first_idx, *idx_tab = &first_idx;
  double saved_lambda = lambda;

  if (caml_memprof_suspended) {
    caml_memprof_renew_minor_sample();
    return;
  }

  /* If [lambda == 0], then [caml_memprof_young_trigger] should be
     equal to [Caml_state->young_alloc_start]. But this function is
     only called with [Caml_state->young_alloc_start <=
     Caml_state->young_ptr < caml_memprof_young_trigger], which is
     contradictory. */
  CAMLassert(lambda > 0);

  if (!from_caml) {
    unsigned n_samples = 1 +
      rand_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
    CAMLassert(encoded_alloc_lens == NULL);  /* No Comballoc in C! */
    caml_memprof_renew_minor_sample();
    callstack = capture_callstack_postponed();
    if (callstack == 0) return;
    new_tracked(n_samples, wosize, 0, 1, Val_hp(Caml_state->young_ptr),
                callstack);
    check_action_pending();
    return;
  }

  /* We need to call the callbacks for this sampled block. Since each
     callback can potentially allocate, the sampled block will *not*
     be the one pointed to by [caml_memprof_young_trigger]. Instead,
     we remember that we need to sample the next allocated word, call
     the callback and use as a sample the block which will be
     allocated right after the callback. */
  CAMLassert(Caml_state->young_ptr < caml_memprof_young_trigger &&
             caml_memprof_young_trigger <= Caml_state->young_ptr + whsize);
  trigger_ofs = caml_memprof_young_trigger - Caml_state->young_ptr;
  alloc_ofs = whsize;

  /* Restore the minor heap in a valid state for calling the
     callbacks. We should not call the GC before these two
     instructions. */
  Caml_state->young_ptr += whsize;
  caml_memprof_renew_minor_sample();
  caml_memprof_suspended = 1;

  /* Perform the sampling of the block in the set of Comballoc'd
     blocks, insert them in the entries array, and run the
     callbacks. */
  for (alloc_idx = nallocs - 1; alloc_idx >= 0; alloc_idx--) {
    unsigned alloc_wosz = encoded_alloc_lens == NULL ? wosize :
      Wosize_encoded_alloc_len(encoded_alloc_lens[alloc_idx]);
    unsigned n_samples = 0;
    alloc_ofs -= Whsize_wosize(alloc_wosz);
    while (alloc_ofs < trigger_ofs) {
      n_samples++;
      trigger_ofs -= rand_geom();
    }

    if (n_samples > 0) {
      uintnat *idx_ptr, t_idx;

      callstack = capture_callstack(alloc_idx);
      t_idx = new_tracked(n_samples, alloc_wosz, 0, 1,
                          Placeholder_offs(alloc_ofs), callstack);
      if (t_idx == Invalid_index) continue;
      res = handle_entry_callbacks_exn(&t_idx);
      if (t_idx == Invalid_index) {
        has_delete = 1;
        if (saved_lambda != lambda) {
          /* [lambda] changed during the callback. We need to refresh
             [trigger_ofs]. */
          saved_lambda = lambda;
          trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (rand_geom() - 1);
        }
      }
      if (Is_exception_result(res)) break;
      if (t_idx == Invalid_index) continue;

      if (allocs_sampled == 1) {
        /* Found a second sampled allocation! Allocate a buffer for
           them. */
        idx_tab = caml_stat_alloc_noexc(sizeof(uintnat) * nallocs);
        if (idx_tab == NULL) {
          alloc_ofs = 0;
          idx_tab = &first_idx;
          break;
        }
        idx_tab[0] = first_idx;
        if (idx_tab[0] != Invalid_index)
          trackst.entries[idx_tab[0]].idx_ptr = &idx_tab[0];
      }

      /* Usually, [trackst.entries[...].idx_ptr] is owned by the
         thread running a callback for the entry, if any. Here, we
         take ownership of [idx_ptr] until the end of the function.
         This does not conflict with the usual use of [idx_ptr]
         because no callbacks can run on this entry until the end of
         the function: the allocation callback has already run, and
         the other callbacks do not run on placeholder values. */
      idx_ptr = &idx_tab[allocs_sampled];
      *idx_ptr = t_idx;
      trackst.entries[*idx_ptr].idx_ptr = idx_ptr;
      allocs_sampled++;
    }
  }

  CAMLassert(alloc_ofs == 0 || Is_exception_result(res));
  CAMLassert(allocs_sampled <= nallocs);

  caml_memprof_suspended = 0;
  check_action_pending();
  /* We need to call [check_action_pending] since we reset
     [caml_memprof_suspended] to 0 (a GC collection may have triggered
     some new callbacks). We also need to make sure that the action
     pending flag is not set systematically: [new_tracked] created new
     entries without updating [trackst.callback], which could leave a
     spurious pending callback. Fortunately,
     [handle_entry_callbacks_exn] increments [trackst.callback] if it
     is equal to [t_idx]. */

  /* This condition arises either in the case of an exception or when
     one of the callbacks returned [None]. If these cases happen
     frequently, then we need to call [flush_deleted] somewhere to
     prevent a leak. */
  if (has_delete) flush_deleted();

  if (Is_exception_result(res)) {
    for (i = 0; i < allocs_sampled; i++)
      if (idx_tab[i] != Invalid_index) {
        struct tracked* t = &trackst.entries[idx_tab[i]];
        /* The allocations are cancelled because of the exception,
           but this callback has already been called. We simulate a
           deallocation. */
        t->block = Val_unit;
        t->deallocated = 1;
        if (trackst.callback > idx_tab[i]) {
          trackst.callback = idx_tab[i];
          check_action_pending();
        }
      }
    if (idx_tab != &first_idx) caml_stat_free(idx_tab);
    caml_raise(Extract_exception(res));
  }

  /* We can now restore the minor heap in the state needed by
     [Alloc_small_aux]. */
  if (Caml_state->young_ptr - whsize < Caml_state->young_trigger) {
    CAML_EV_COUNTER(EV_C_FORCE_MINOR_MEMPROF, 1);
    caml_gc_dispatch();
  }

  /* Re-allocate the blocks in the minor heap. We should not call the
     GC after this. */
  Caml_state->young_ptr -= whsize;

  /* Make sure this block is not going to be sampled again. */
  shift_sample(whsize);

  for (i = 0; i < allocs_sampled; i++) {
    if (idx_tab[i] != Invalid_index) {
      /* If the execution of the callback has succeeded, then we start
         the tracking of this block.

         Subtlety: we are actually writing [t->block] with an invalid
         (uninitialized) block. This is correct because the allocation
         and initialization happen right after returning from
         [caml_memprof_track_young]. */
      struct tracked *t = &trackst.entries[idx_tab[i]];
      t->block = Val_hp(Caml_state->young_ptr + Offs_placeholder(t->block));
      t->idx_ptr = NULL;
      CAMLassert(t->cb_alloc_called);
      if (idx_tab[i] < trackst.young) trackst.young = idx_tab[i];
    }
  }

  if (idx_tab != &first_idx) caml_stat_free(idx_tab);

  /* /!\ Since the heap is in an invalid state before initialization,
     very few heap operations are allowed until then. */
  return;
}

void caml_memprof_track_interned(header_t* block, header_t* blockend)
{
  header_t *p;
  value callstack = 0;
  int is_young = Is_young(Val_hp(block));

  if (lambda == 0 || caml_memprof_suspended) return;

  p = block;
  while (1) {
    uintnat next_sample = rand_geom();
    header_t *next_sample_p, *next_p;
    if (next_sample > blockend - p) break;
    /* [next_sample_p] is the block *following* the next sampled
       block! */
    next_sample_p = p + next_sample;
    while (1) {
      next_p = p + Whsize_hp(p);
      if (next_p >= next_sample_p) break;
      p = next_p;
    }

    if (callstack == 0) callstack = capture_callstack_postponed();
    if (callstack == 0) break;  /* OOM */
    new_tracked(rand_binom(next_p - next_sample_p) + 1, Wosize_hp(p),
                1, is_young, Val_hp(p), callstack);
    p = next_p;
  }
  check_action_pending();
}

/**** Interface with the OCaml code. ****/

static void caml_memprof_init(void)
{
  init = 1;
  xoshiro_init();
}

void caml_memprof_shutdown(void)
{
  init = 0;
  started = 0;
  lambda = 0.;
  caml_memprof_suspended = 0;
  trackst.len = 0;
  trackst.callback = trackst.young = trackst.delete = 0;
  caml_stat_free(trackst.entries);
  trackst.entries = NULL;
  trackst.alloc_len = 0;
  caml_stat_free(callstack_buffer);
  callstack_buffer = NULL;
  callstack_buffer_len = 0;
}

CAMLprim value caml_memprof_start(value lv, value szv, value tracker_param)
{
  CAMLparam3(lv, szv, tracker_param);
  double l = Double_val(lv);
  intnat sz = Long_val(szv);

  if (started) caml_failwith("Gc.Memprof.start: already started.");

  if (sz < 0 || !(l >= 0.) || l > 1.) /* Checks that [l] is not NaN. */
    caml_invalid_argument("Gc.Memprof.start");

  if (!init) caml_memprof_init();

  lambda = l;
  if (l > 0) {
    one_log1m_lambda = l == 1 ? 0 : 1/caml_log1p(-l);
    rand_pos = RAND_BLOCK_SIZE;
    next_rand_geom = rand_geom();
  }

  caml_memprof_renew_minor_sample();

  callstack_size = sz;

  started = 1;
  tracker = tracker_param;
  caml_register_generational_global_root(&tracker);

  CAMLreturn(Val_unit);
}

CAMLprim value caml_memprof_stop(value unit)
{
  uintnat i;
  if (!started) caml_failwith("Gc.Memprof.stop: not started.");

  /* This call to [caml_memprof_stop] will discard all the previously
     tracked blocks. We try one last time to call the postponed
     callbacks. */
  caml_raise_if_exception(caml_memprof_handle_postponed_exn());

  /* Discard the tracked blocks. */
  for (i = 0; i < trackst.len; i++)
    if (trackst.entries[i].idx_ptr != NULL)
      *trackst.entries[i].idx_ptr = Invalid_index;
  trackst.len = 0;
  trackst.callback = trackst.young = trackst.delete = 0;
  caml_stat_free(trackst.entries);
  trackst.entries = NULL;
  trackst.alloc_len = 0;

  lambda = 0;
  caml_memprof_renew_minor_sample();
  started = 0;
  caml_remove_generational_global_root(&tracker);

  caml_stat_free(callstack_buffer);
  callstack_buffer = NULL;
  callstack_buffer_len = 0;

  return Val_unit;
}

/**** Interface with systhread. ****/

void caml_memprof_init_th_ctx(struct caml_memprof_th_ctx* ctx)
{
  ctx->suspended = 0;
  ctx->callback_running = 0;
}

void caml_memprof_stop_th_ctx(struct caml_memprof_th_ctx* ctx)
{
  /* Make sure that no memprof callback is being executed in this
     thread. If so, memprof data structures may have pointers to the
     thread's stack. */
  if (ctx->callback_running)
    caml_fatal_error("Thread.exit called from a memprof callback.");
}

void caml_memprof_save_th_ctx(struct caml_memprof_th_ctx* ctx)
{
  ctx->suspended = caml_memprof_suspended;
  ctx->callback_running = callback_running;
}

void caml_memprof_restore_th_ctx(const struct caml_memprof_th_ctx* ctx)
{
  caml_memprof_suspended = ctx->suspended;
  callback_running = ctx->callback_running;
  check_action_pending();
}
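/* Illustration (not compiled): [caml_memprof_start] and
   [caml_memprof_stop] are the C side of [Gc.Memprof.start] and
   [Gc.Memprof.stop]. A sketch of a profiling session at the OCaml
   level, assuming the interface of recent OCaml versions (the
   [sampling_rate] is [lambda], and [callstack_size] is the limit used
   by the callstack capture above):

     let hits = ref 0 in
     Gc.Memprof.start ~sampling_rate:1e-4 ~callstack_size:16
       { Gc.Memprof.null_tracker with
         alloc_minor = (fun _ -> incr hits; None);
         alloc_major = (fun _ -> incr hits; None) };
     (* ... run the code being profiled ... *)
     Gc.Memprof.stop ();
     Printf.printf "sampled allocations: %d\n" !hits
*/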