Merge pull request #9279 from stedolan/optimise-callstacks-again
Avoid scanning the stack twice when collecting callstacks in Memprof.master
commit
edee8cea3e
3
Changes
3
Changes
|
@ -44,6 +44,9 @@ Working version
|
|||
- #9230: Memprof support for native allocations.
|
||||
(Jacques-Henri Jourdan and Stephen Dolan, review by Gabriel Scherer)
|
||||
|
||||
- #9279: Memprof optimisation.
|
||||
(Stephen Dolan, review by Jacques-Henri Jourdan)
|
||||
|
||||
### Code generation and optimizations:
|
||||
|
||||
- #8637, #8805, #9247: Record debug info for each allocation.
|
||||
|
|
|
@ -325,12 +325,17 @@ CAMLprim value caml_get_exception_backtrace(value unit)
|
|||
CAMLreturn(res);
|
||||
}
|
||||
|
||||
CAMLprim value caml_get_current_callstack(value max_frames_value) {
|
||||
CAMLprim value caml_get_current_callstack(value max_frames_value)
|
||||
{
|
||||
CAMLparam1(max_frames_value);
|
||||
CAMLlocal1(res);
|
||||
|
||||
res = caml_alloc(caml_current_callstack_size(Long_val(max_frames_value)), 0);
|
||||
caml_current_callstack_write(res, -1);
|
||||
|
||||
value* callstack = NULL;
|
||||
intnat callstack_alloc_len = 0;
|
||||
intnat callstack_len =
|
||||
caml_collect_current_callstack(&callstack, &callstack_alloc_len,
|
||||
Long_val(max_frames_value), -1);
|
||||
res = caml_alloc(callstack_len, 0);
|
||||
memcpy(Op_val(res), callstack, sizeof(value) * callstack_len);
|
||||
caml_stat_free(callstack);
|
||||
CAMLreturn(res);
|
||||
}
|
||||
|
|
|
@ -275,34 +275,38 @@ code_t caml_next_frame_pointer(value ** sp, value ** trsp)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
intnat caml_current_callstack_size(intnat max_frames)
|
||||
{
|
||||
intnat trace_size;
|
||||
value * sp = Caml_state->extern_sp;
|
||||
value * trsp = Caml_state->trapsp;
|
||||
|
||||
for (trace_size = 0; trace_size < max_frames; trace_size++) {
|
||||
code_t p = caml_next_frame_pointer(&sp, &trsp);
|
||||
if (p == NULL) break;
|
||||
}
|
||||
|
||||
return trace_size;
|
||||
}
|
||||
|
||||
void caml_current_callstack_write(value trace, int alloc_idx)
|
||||
#define Default_callstack_size 32
|
||||
intnat caml_collect_current_callstack(value** ptrace, intnat* plen,
|
||||
intnat max_frames, int alloc_idx)
|
||||
{
|
||||
value * sp = Caml_state->extern_sp;
|
||||
value * trsp = Caml_state->trapsp;
|
||||
uintnat trace_pos, trace_size = Wosize_val(trace);
|
||||
intnat trace_pos = 0;
|
||||
CAMLassert(alloc_idx == 0 || alloc_idx == -1);
|
||||
|
||||
for (trace_pos = 0; trace_pos < trace_size; trace_pos++) {
|
||||
code_t p = caml_next_frame_pointer(&sp, &trsp);
|
||||
CAMLassert(p != NULL);
|
||||
/* [Val_backtrace_slot(...)] is always a long, no need to call
|
||||
[caml_modify]. */
|
||||
Field(trace, trace_pos) = Val_backtrace_slot(p);
|
||||
if (max_frames <= 0) return 0;
|
||||
if (*plen == 0) {
|
||||
value* trace =
|
||||
caml_stat_alloc_noexc(Default_callstack_size * sizeof(value));
|
||||
if (trace == NULL) return 0;
|
||||
*ptrace = trace;
|
||||
*plen = Default_callstack_size;
|
||||
}
|
||||
|
||||
while (trace_pos < max_frames) {
|
||||
code_t p = caml_next_frame_pointer(&sp, &trsp);
|
||||
if (p == NULL) break;
|
||||
if (trace_pos == *plen) {
|
||||
intnat new_len = *plen * 2;
|
||||
value * trace = caml_stat_resize_noexc(*ptrace, new_len * sizeof(value));
|
||||
if (trace == NULL) break;
|
||||
*ptrace = trace;
|
||||
*plen = new_len;
|
||||
}
|
||||
(*ptrace)[trace_pos++] = Val_backtrace_slot(p);
|
||||
}
|
||||
|
||||
return trace_pos;
|
||||
}
|
||||
|
||||
/* Read the debugging info contained in the current bytecode executable. */
|
||||
|
|
|
@ -104,23 +104,6 @@ void caml_stash_backtrace(value exn, uintnat pc, char * sp, char * trapsp)
|
|||
}
|
||||
}
|
||||
|
||||
intnat caml_current_callstack_size(intnat max_frames) {
|
||||
intnat trace_size = 0;
|
||||
uintnat pc = Caml_state->last_return_address;
|
||||
char * sp = Caml_state->bottom_of_stack;
|
||||
|
||||
while (1) {
|
||||
frame_descr * descr = caml_next_frame_descriptor(&pc, &sp);
|
||||
if (descr == NULL) break;
|
||||
if (trace_size >= max_frames) break;
|
||||
++trace_size;
|
||||
|
||||
if (sp > Caml_state->top_of_stack) break;
|
||||
}
|
||||
|
||||
return trace_size;
|
||||
}
|
||||
|
||||
/* A backtrace_slot is either a debuginfo or a frame_descr* */
|
||||
#define Slot_is_debuginfo(s) ((uintnat)(s) & 2)
|
||||
#define Debuginfo_slot(s) ((debuginfo)((uintnat)(s) - 2))
|
||||
|
@ -128,27 +111,49 @@ intnat caml_current_callstack_size(intnat max_frames) {
|
|||
#define Frame_descr_slot(s) ((frame_descr*)(s))
|
||||
#define Slot_frame_descr(f) ((backtrace_slot)(f))
|
||||
static debuginfo debuginfo_extract(frame_descr* d, int alloc_idx);
|
||||
void caml_current_callstack_write(value trace, int alloc_idx)
|
||||
|
||||
#define Default_callstack_size 32
|
||||
intnat caml_collect_current_callstack(value** ptrace, intnat* plen,
|
||||
intnat max_frames, int alloc_idx)
|
||||
{
|
||||
uintnat pc = Caml_state->last_return_address;
|
||||
char * sp = Caml_state->bottom_of_stack;
|
||||
intnat trace_pos = 0, trace_size = Wosize_val(trace);
|
||||
intnat trace_pos = 0;
|
||||
|
||||
if (alloc_idx >= 0 && trace_size > 0) {
|
||||
if (max_frames <= 0) return 0;
|
||||
if (*plen == 0) {
|
||||
value* trace =
|
||||
caml_stat_alloc_noexc(Default_callstack_size * sizeof(value));
|
||||
if (trace == NULL) return 0;
|
||||
*ptrace = trace;
|
||||
*plen = Default_callstack_size;
|
||||
}
|
||||
|
||||
if (alloc_idx >= 0) {
|
||||
/* First frame has a Comballoc selector */
|
||||
frame_descr * descr = caml_next_frame_descriptor(&pc, &sp);
|
||||
debuginfo info = debuginfo_extract(descr, alloc_idx);
|
||||
debuginfo info;
|
||||
if (descr == NULL) return 0;
|
||||
info = debuginfo_extract(descr, alloc_idx);
|
||||
CAMLassert(((uintnat)info & 3) == 0);
|
||||
Field(trace, 0) = Val_backtrace_slot(Slot_debuginfo(info));
|
||||
trace_pos++;
|
||||
(*ptrace)[trace_pos++] = Val_backtrace_slot(Slot_debuginfo(info));
|
||||
}
|
||||
|
||||
for (; trace_pos < trace_size; trace_pos++) {
|
||||
while (trace_pos < max_frames) {
|
||||
frame_descr * descr = caml_next_frame_descriptor(&pc, &sp);
|
||||
CAMLassert(descr != NULL && ((uintnat)descr & 3) == 0);
|
||||
/* [Val_backtrace_slot(...)] is always a long, no need to call
|
||||
[caml_modify]. */
|
||||
Field(trace, trace_pos) = Val_backtrace_slot(Slot_frame_descr(descr));
|
||||
if (descr == NULL) break;
|
||||
CAMLassert(((uintnat)descr & 3) == 0);
|
||||
if (trace_pos == *plen) {
|
||||
intnat new_len = *plen * 2;
|
||||
value * trace = caml_stat_resize_noexc(*ptrace, new_len * sizeof(value));
|
||||
if (trace == NULL) break;
|
||||
*ptrace = trace;
|
||||
*plen = new_len;
|
||||
}
|
||||
(*ptrace)[trace_pos++] = Val_backtrace_slot(Slot_frame_descr(descr));
|
||||
}
|
||||
|
||||
return trace_pos;
|
||||
}
|
||||
|
||||
static debuginfo debuginfo_extract(frame_descr* d, int alloc_idx)
|
||||
|
|
|
@ -90,33 +90,21 @@ value caml_remove_debug_info(code_t start);
|
|||
* It defines the [caml_stash_backtrace] function, which is called to quickly
|
||||
* fill the backtrace buffer by walking the stack when an exception is raised.
|
||||
*
|
||||
* It also defines the two following functions, which makes it possible
|
||||
* to store upto [max_frames_value] frames of the current call
|
||||
* stack. This is not used in an exception-raising context, but only
|
||||
* when the user requests to save the trace (hopefully less often), or
|
||||
* the context of profiling. Instead of using a bounded buffer as
|
||||
* [caml_stash_backtrace], we first traverse the stack to compute the
|
||||
* right size, then allocate space for the trace.
|
||||
* It also defines [caml_collect_current_callstack], which stores up
|
||||
* to [max_frames] frames of the current call stack into the
|
||||
* statically allocated buffer [*pbuffer] of length [*plen]. If the
|
||||
* buffer is not long enough, it will be reallocated. The number of
|
||||
* frames collected is returned.
|
||||
*
|
||||
* The first function, [caml_current_callstack_size] computes the size
|
||||
* (in words) of the needed buffer, while the second actually writes
|
||||
* the call stack to the buffer as an object of type
|
||||
* [raw_backtrace]. It should always be called with a buffer of the
|
||||
* size predicted by [caml_current_callstack_size]. The reason we use
|
||||
* two separated functions is to allow using either [caml_alloc] (for
|
||||
* performance) or [caml_alloc_shr] (when we need to avoid a call to
|
||||
* the GC, in memprof.c).
|
||||
*
|
||||
* The alloc_idx parameter to [caml_current_callstack_write] is used
|
||||
* to select between the backtraces of different allocation sites which
|
||||
* were combined by Comballoc. Passing -1 here means the caller doesn't
|
||||
* care which is chosen.
|
||||
* The alloc_idx parameter is used to select between the backtraces of
|
||||
* different allocation sites which were combined by Comballoc.
|
||||
* Passing -1 here means the caller doesn't care which is chosen.
|
||||
*
|
||||
* We use `intnat` for max_frames because, were it only `int`, passing
|
||||
* `max_int` from the OCaml side would overflow on 64bits machines. */
|
||||
|
||||
intnat caml_current_callstack_size(intnat max_frames);
|
||||
void caml_current_callstack_write(value trace, int alloc_idx);
|
||||
intnat caml_collect_current_callstack(value** pbuffer, intnat* plen,
|
||||
intnat max_frames, int alloc_idx);
|
||||
|
||||
#endif /* CAML_INTERNALS */
|
||||
|
||||
|
|
|
@ -63,6 +63,10 @@ static int init = 0;
|
|||
/* Whether memprof is started. */
|
||||
static int started = 0;
|
||||
|
||||
/* Buffer used to compute backtraces */
|
||||
static value* callstack_buffer = NULL;
|
||||
static intnat callstack_buffer_len = 0;
|
||||
|
||||
/**** Statistical sampling ****/
|
||||
|
||||
static double mt_generate_uniform(void)
|
||||
|
@ -147,10 +151,20 @@ static uintnat mt_generate_binom(uintnat len)
|
|||
static value capture_callstack_postponed()
|
||||
{
|
||||
value res;
|
||||
uintnat wosize = caml_current_callstack_size(callstack_size);
|
||||
if (wosize == 0) return Atom(0);
|
||||
res = caml_alloc_shr_no_track_noexc(wosize, 0);
|
||||
if (res != 0) caml_current_callstack_write(res, -1);
|
||||
intnat callstack_len =
|
||||
caml_collect_current_callstack(&callstack_buffer, &callstack_buffer_len,
|
||||
callstack_size, -1);
|
||||
if (callstack_len == 0)
|
||||
return Atom(0);
|
||||
res = caml_alloc_shr_no_track_noexc(callstack_len, 0);
|
||||
if (res == 0)
|
||||
return Atom(0);
|
||||
memcpy(Op_val(res), callstack_buffer, sizeof(value) * callstack_len);
|
||||
if (callstack_buffer_len > 256 && callstack_buffer_len > callstack_len * 8) {
|
||||
caml_stat_free(callstack_buffer);
|
||||
callstack_buffer = NULL;
|
||||
callstack_buffer_len = 0;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -161,10 +175,17 @@ static value capture_callstack_postponed()
|
|||
static value capture_callstack(int alloc_idx)
|
||||
{
|
||||
value res;
|
||||
uintnat wosize = caml_current_callstack_size(callstack_size);
|
||||
intnat callstack_len =
|
||||
caml_collect_current_callstack(&callstack_buffer, &callstack_buffer_len,
|
||||
callstack_size, alloc_idx);
|
||||
CAMLassert(caml_memprof_suspended);
|
||||
res = caml_alloc(wosize, 0);
|
||||
caml_current_callstack_write(res, alloc_idx);
|
||||
res = caml_alloc(callstack_len, 0);
|
||||
memcpy(Op_val(res), callstack_buffer, sizeof(value) * callstack_len);
|
||||
if (callstack_buffer_len > 256 && callstack_buffer_len > callstack_len * 8) {
|
||||
caml_stat_free(callstack_buffer);
|
||||
callstack_buffer = NULL;
|
||||
callstack_buffer_len = 0;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -841,6 +862,9 @@ void caml_memprof_shutdown(void) {
|
|||
caml_stat_free(trackst.entries);
|
||||
trackst.entries = NULL;
|
||||
trackst.alloc_len = 0;
|
||||
caml_stat_free(callstack_buffer);
|
||||
callstack_buffer = NULL;
|
||||
callstack_buffer_len = 0;
|
||||
}
|
||||
|
||||
CAMLprim value caml_memprof_start(value lv, value szv,
|
||||
|
@ -926,5 +950,9 @@ CAMLprim value caml_memprof_stop(value unit)
|
|||
caml_remove_generational_global_root(&callback_dealloc_minor);
|
||||
caml_remove_generational_global_root(&callback_dealloc_major);
|
||||
|
||||
caml_stat_free(callstack_buffer);
|
||||
callstack_buffer = NULL;
|
||||
callstack_buffer_len = 0;
|
||||
|
||||
return Val_unit;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue