/**************************************************************************/
/*                                                                        */
/*                                 OCaml                                  */
/*                                                                        */
/*          Jacques-Henri Jourdan, projet Gallium, INRIA Paris            */
/*                                                                        */
/*   Copyright 2016 Institut National de Recherche en Informatique et     */
/*     en Automatique.                                                    */
/*                                                                        */
/*   All rights reserved. This file is distributed under the terms of     */
/*   the GNU Lesser General Public License version 2.1, with the          */
/*   special exception on linking described in the file LICENSE.          */
/*                                                                        */
/**************************************************************************/

#define CAML_INTERNALS

#include <string.h>
#include "caml/memprof.h"
#include "caml/fail.h"
#include "caml/alloc.h"
#include "caml/callback.h"
#include "caml/signals.h"
#include "caml/memory.h"
#include "caml/minor_gc.h"
#include "caml/backtrace_prim.h"
#include "caml/weak.h"
#include "caml/stack.h"
#include "caml/misc.h"
#include "caml/compact.h"
#include "caml/printexc.h"
#include "caml/eventlog.h"

#define RAND_BLOCK_SIZE 64

static uint32_t xoshiro_state[4][RAND_BLOCK_SIZE];
static uintnat rand_geom_buff[RAND_BLOCK_SIZE];
static uint32_t rand_pos;

/* [lambda] is the mean number of samples for each allocated word
   (including block headers). */
static double lambda = 0;

/* Precomputed value of [1/log(1-lambda)], for fast sampling of
   the geometric distribution. Dummy if [lambda = 0]. */
static float one_log1m_lambda;

/* [caml_memprof_suspended] is used for masking memprof callbacks
   when a callback is running or when an uncaught exception handler
   is called. */
int caml_memprof_suspended = 0;

/* [callback_running] is used to trigger a fatal error whenever
   [Thread.exit] is called from a callback. */
static int callback_running = 0;

static intnat callstack_size;

/* Accessors for the OCaml type [Gc.Memprof.tracker], which is the
   type of the [tracker] global below. */
#define Alloc_minor(tracker) (Field(tracker, 0))
#define Alloc_major(tracker) (Field(tracker, 1))
#define Promote(tracker) (Field(tracker, 2))
#define Dealloc_minor(tracker) (Field(tracker, 3))
#define Dealloc_major(tracker) (Field(tracker, 4))

static value tracker;

/* Pointer to the word following the next sample in the minor
   heap. Equals [Caml_state->young_alloc_start] if no sampling is
   planned in the current minor heap.
   Invariant: [caml_memprof_young_trigger <= Caml_state->young_ptr]. */
value* caml_memprof_young_trigger;

/* Whether memprof has been initialized. */
static int init = 0;

/* Whether memprof is started. */
static int started = 0;
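/* Illustration (not compiled): the accessor macros above assume that
   [Gc.Memprof.tracker] is a record whose fields are declared in the
   following order, so that [Field(tracker, 0)] is [alloc_minor],
   [Field(tracker, 1)] is [alloc_major], and so on. A sketch of the
   corresponding OCaml declaration (see [gc.mli]; the exact type of
   [allocation] may differ slightly between OCaml versions):

     type ('minor, 'major) tracker = {
       alloc_minor : allocation -> 'minor option;
       alloc_major : allocation -> 'major option;
       promote : 'minor -> 'major option;
       dealloc_minor : 'minor -> unit;
       dealloc_major : 'major -> unit;
     }
*/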
/* Buffer used to compute backtraces. */
static value* callstack_buffer = NULL;
static intnat callstack_buffer_len = 0;

/**** Statistical sampling ****/

Caml_inline uint64_t splitmix64_next(uint64_t* x)
{
  uint64_t z = (*x += 0x9E3779B97F4A7C15ull);
  z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
  z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
  return z ^ (z >> 31);
}

static void xoshiro_init(void)
{
  int i;
  uint64_t splitmix64_state = 42;
  rand_pos = RAND_BLOCK_SIZE;
  for (i = 0; i < RAND_BLOCK_SIZE; i++) {
    uint64_t t = splitmix64_next(&splitmix64_state);
    xoshiro_state[0][i] = t & 0xFFFFFFFF;
    xoshiro_state[1][i] = t >> 32;
    t = splitmix64_next(&splitmix64_state);
    xoshiro_state[2][i] = t & 0xFFFFFFFF;
    xoshiro_state[3][i] = t >> 32;
  }
}

Caml_inline uint32_t xoshiro_next(int i)
{
  uint32_t res = xoshiro_state[0][i] + xoshiro_state[3][i];
  uint32_t t = xoshiro_state[1][i] << 9;
  xoshiro_state[2][i] ^= xoshiro_state[0][i];
  xoshiro_state[3][i] ^= xoshiro_state[1][i];
  xoshiro_state[1][i] ^= xoshiro_state[2][i];
  xoshiro_state[0][i] ^= xoshiro_state[3][i];
  xoshiro_state[2][i] ^= t;
  t = xoshiro_state[3][i];
  xoshiro_state[3][i] = (t << 11) | (t >> 21);
  return res;
}

/* Computes [log((y+0.5)/2^32)], up to a relatively good precision,
   and guarantees that the result is negative. The average absolute
   error is very close to 0. */
Caml_inline float log_approx(uint32_t y)
{
  union { float f; int32_t i; } u;
  float exp, x;
  u.f = y + 0.5f;        /* We convert y to a float ... */
  exp = u.i >> 23;       /* ... of which we extract the exponent ... */
  u.i = (u.i & 0x7FFFFF) | 0x3F800000;
  x = u.f;               /* ... and the mantissa. */
  return
    /* This polynomial computes the logarithm of the mantissa (which
       is in [1, 2]), up to an additive constant. It is chosen such
       that:
        - Its degree is 3.
        - Its average value is that of log in [1, 2] (the sampling
          has the right mean when lambda is small).
        - f(1) = f(2) - log(2) = -159*log(2) - 1e-5 (this guarantees
          that log_approx(y) is always <= -1e-5 < 0).
        - The maximum of abs(f(x)-log(x)+159*log(2)) is minimized. */
    x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))

    /* Then, we add the term corresponding to the exponent, and the
       additive constants. */
    + (-111.701724334061f + 0.6931471805f*exp);
}
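/* Illustrative reference (not used by the runtime): the batched
   sampler below ([rand_batch]/[rand_geom]) is equivalent, lane by
   lane, to the following scalar draw of a geometric variable of
   parameter [lambda] by inversion of its CDF:
   geom = floor(1 + log(U) / log(1-lambda)), with U uniform in (0,1).
   The helper name [rand_geom_scalar_sketch] is ours and is not used
   anywhere else in the runtime. */
Caml_inline uintnat rand_geom_scalar_sketch(int i)
{
  double f;
  CAMLassert(lambda > 0.);
  /* [log_approx] plays the role of [log(U)], and [one_log1m_lambda]
     is the precomputed [1/log(1-lambda)]. */
  f = 1 + log_approx(xoshiro_next(i)) * one_log1m_lambda;
  /* Clip to [1..Max_long], as [rand_batch] does. */
  if (f > Max_long) return Max_long;
  return (uintnat)f;
}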
/* This function regenerates [RAND_BLOCK_SIZE] geometric random
   variables at once. Doing this in batches helps us gain performance:
   many compilers (e.g., GCC, Clang, ICC) are able to use SIMD
   instructions to get a performance boost. */
#ifdef SUPPORTS_TREE_VECTORIZE
__attribute__((optimize("tree-vectorize")))
#endif
static void rand_batch(void)
{
  int i;

  /* Instead of using temporary buffers, we could use one big loop,
     but it turns out the SIMD optimizations of compilers are more
     fragile when using larger loops. */
  static uint32_t A[RAND_BLOCK_SIZE];
  static float B[RAND_BLOCK_SIZE];

  CAMLassert(lambda > 0.);

  /* Shuffle the xoshiro samplers, and generate uniform variables in A. */
  for (i = 0; i < RAND_BLOCK_SIZE; i++)
    A[i] = xoshiro_next(i);

  /* Generate exponential random variables by computing logarithms.
     We do not use math.h library functions, which are slow and
     prevent the compiler from using SIMD instructions. */
  for (i = 0; i < RAND_BLOCK_SIZE; i++)
    B[i] = 1 + log_approx(A[i]) * one_log1m_lambda;

  /* We do the final flooring for generating geometric variables.
     Compilers are unlikely to use SIMD instructions for this loop,
     because it involves a conditional and variables of different
     sizes (32 and 64 bits). */
  for (i = 0; i < RAND_BLOCK_SIZE; i++) {
    double f = B[i];
    CAMLassert (f >= 1);
    if (f > Max_long) rand_geom_buff[i] = Max_long;
    else rand_geom_buff[i] = (uintnat)f;
  }

  rand_pos = 0;
}

/* Simulate a geometric variable of parameter [lambda]. The result is
   clipped in [1..Max_long]. */
static uintnat rand_geom(void)
{
  uintnat res;
  CAMLassert(lambda > 0.);
  if (rand_pos == RAND_BLOCK_SIZE) rand_batch();
  res = rand_geom_buff[rand_pos++];
  CAMLassert(1 <= res && res <= Max_long);
  return res;
}

static uintnat next_rand_geom;

/* Simulate a binomial variable of parameters [len] and [lambda].
   This sampling algorithm has running time linear in [len * lambda].
   We could use a more involved algorithm, but this should be good
   enough since, in the average use case, [lambda] <= 0.01 and
   therefore the generation of the binomial variable is amortized by
   the initialization of the corresponding block.
   If needed, we could use algorithm BTRS from the paper:
     Hormann, Wolfgang. "The generation of binomial random variates."
     Journal of Statistical Computation and Simulation 46.1-2 (1993),
     pp. 101-110. */
static uintnat rand_binom(uintnat len)
{
  uintnat res;
  CAMLassert(lambda > 0. && len < Max_long);
  for (res = 0; next_rand_geom < len; res++) next_rand_geom += rand_geom();
  next_rand_geom -= len;
  return res;
}

/**** Capturing the call stack *****/

/* This function is called in, e.g., [caml_alloc_shr], which
   guarantees that the GC is not called. Clients may use it in a
   context where the heap is in an invalid state, or when the roots
   are not properly registered. Therefore, we do not use
   [caml_alloc], which may call the GC, but prefer
   [caml_alloc_shr_no_track_noexc], which gives this guarantee.
   The return value is either a valid callstack or 0 in out-of-memory
   scenarios. */
static value capture_callstack_postponed(void)
{
  value res;
  intnat callstack_len =
    caml_collect_current_callstack(&callstack_buffer, &callstack_buffer_len,
                                   callstack_size, -1);
  if (callstack_len == 0) return Atom(0);
  res = caml_alloc_shr_no_track_noexc(callstack_len, 0);
  if (res == 0) return 0;
  memcpy(Op_val(res), callstack_buffer, sizeof(value) * callstack_len);
  if (callstack_buffer_len > 256 && callstack_buffer_len > callstack_len * 8) {
    caml_stat_free(callstack_buffer);
    callstack_buffer = NULL;
    callstack_buffer_len = 0;
  }
  return res;
}

/* In this version, we are allowed to call the GC, so we use
   [caml_alloc], which is more efficient since it uses the minor
   heap.
   Should be called with [caml_memprof_suspended == 1]. */
static value capture_callstack(int alloc_idx)
{
  value res;
  intnat callstack_len =
    caml_collect_current_callstack(&callstack_buffer, &callstack_buffer_len,
                                   callstack_size, alloc_idx);
  CAMLassert(caml_memprof_suspended);
  res = caml_alloc(callstack_len, 0);
  memcpy(Op_val(res), callstack_buffer, sizeof(value) * callstack_len);
  if (callstack_buffer_len > 256 && callstack_buffer_len > callstack_len * 8) {
    caml_stat_free(callstack_buffer);
    callstack_buffer = NULL;
    callstack_buffer_len = 0;
  }
  return res;
}
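/* Illustration (not compiled): the callstack captured above is
   exposed to OCaml code as the [callstack] field of
   [Gc.Memprof.allocation], a [Printexc.raw_backtrace]. A sketch of an
   allocation callback that renders it (the exact record fields may
   differ slightly between OCaml versions):

     let alloc_minor (a : Gc.Memprof.allocation) =
       Printf.eprintf "sampled %d word(s), %d sample(s):\n%s\n"
         a.size a.n_samples
         (Printexc.raw_backtrace_to_string a.callstack);
       None
*/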
/**** Data structures for tracked blocks. ****/

struct tracked {
  /* Memory block being sampled. This is a weak GC root. */
  value block;

  /* Number of samples in this block. */
  uintnat n_samples;

  /* The size of this block. */
  uintnat wosize;

  /* The value returned by the previous callback for this block, or
     the callstack if the alloc callback has not been called yet.
     This is a strong GC root. */
  value user_data;

  /* Whether this block has been initially allocated in the minor heap. */
  unsigned int alloc_young : 1;

  /* Whether this block comes from unmarshalling. */
  unsigned int unmarshalled : 1;

  /* Whether this block has been promoted. Implies [alloc_young]. */
  unsigned int promoted : 1;

  /* Whether this block has been deallocated. */
  unsigned int deallocated : 1;

  /* Whether the allocation callback has been called. */
  unsigned int cb_alloc_called : 1;

  /* Whether the promotion callback has been called. */
  unsigned int cb_promote_called : 1;

  /* Whether the deallocation callback has been called. */
  unsigned int cb_dealloc_called : 1;

  /* Whether this entry is deleted. */
  unsigned int deleted : 1;

  /* Whether a callback is currently running for this entry. */
  unsigned int callback_running : 1;

  /* Pointer to the [t_idx] variable in the [run_callback] frame which
     is currently running the callback for this entry. This is needed
     to make [run_callback] reentrant, in case it is called
     simultaneously by several threads. */
  uintnat* idx_ptr;
};

/* During the alloc callback for a minor allocation, the block being
   sampled is not yet allocated. Instead, we place in the [block]
   field a value computed with the following macros: */
#define Placeholder_magic 0x04200000
#define Placeholder_offs(offset) (Val_long(offset + Placeholder_magic))
#define Offs_placeholder(block) (Long_val(block) & 0xFFFF)
#define Is_placeholder(block) \
  (Is_long(block) && (Long_val(block) & ~(uintnat)0xFFFF) == Placeholder_magic)

/* When an entry is deleted, its index is replaced by that integer. */
#define Invalid_index (~(uintnat)0)
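/* Illustrative check (not used by the runtime): for any offset that
   fits in the low 16 bits, the encoding above round-trips, and the
   placeholder is distinguishable from a genuine heap pointer because
   it is an immediate. The helper name [placeholder_roundtrip_sketch]
   is ours. */
Caml_inline value placeholder_roundtrip_sketch(intnat offs)
{
  value v = Placeholder_offs(offs);
  CAMLassert(offs >= 0 && offs <= 0xFFFF); /* encoding assumes 16-bit offsets */
  CAMLassert(Is_long(v));                  /* an immediate, not a pointer */
  CAMLassert(Is_placeholder(v) && Offs_placeholder(v) == offs);
  return v;
}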
static struct tracking_state {
  struct tracked* entries;
  /* The allocated capacity of the [entries] array. */
  uintnat alloc_len;
  /* The number of active entries. (len <= alloc_len) */
  uintnat len;
  /* Before this position, the [block] and [user_data] fields point to
     the major heap (young <= len). */
  uintnat young;
  /* There are no pending callbacks before this position
     (callback <= len). */
  uintnat callback;
  /* There are no blocks to be deleted before this position. */
  uintnat delete;
} trackst;

#define MIN_TRACKST_ALLOC_LEN 128

/* Reallocate the [trackst] array if it is either too small or too
   large.
   Returns 1 if reallocation succeeded ([trackst.alloc_len] is at
   least [trackst.len]), and 0 otherwise. */
static int realloc_trackst(void)
{
  uintnat new_alloc_len;
  struct tracked* new_entries;
  if (trackst.len <= trackst.alloc_len &&
      (4*trackst.len >= trackst.alloc_len ||
       trackst.alloc_len == MIN_TRACKST_ALLOC_LEN))
    return 1;
  new_alloc_len = trackst.len * 2;
  if (new_alloc_len < MIN_TRACKST_ALLOC_LEN)
    new_alloc_len = MIN_TRACKST_ALLOC_LEN;
  new_entries = caml_stat_resize_noexc(trackst.entries,
                                       new_alloc_len * sizeof(struct tracked));
  if (new_entries == NULL) return 0;
  trackst.entries = new_entries;
  trackst.alloc_len = new_alloc_len;
  return 1;
}

Caml_inline uintnat new_tracked(uintnat n_samples, uintnat wosize,
                                int is_unmarshalled, int is_young,
                                value block, value user_data)
{
  struct tracked *t;
  trackst.len++;
  if (!realloc_trackst()) {
    trackst.len--;
    return Invalid_index;
  }
  t = &trackst.entries[trackst.len - 1];
  t->block = block;
  t->n_samples = n_samples;
  t->wosize = wosize;
  t->user_data = user_data;
  t->idx_ptr = NULL;
  t->alloc_young = is_young;
  t->unmarshalled = is_unmarshalled;
  t->promoted = 0;
  t->deallocated = 0;
  t->cb_alloc_called = t->cb_promote_called = t->cb_dealloc_called = 0;
  t->deleted = 0;
  t->callback_running = 0;
  return trackst.len - 1;
}

static void mark_deleted(uintnat t_idx)
{
  struct tracked* t = &trackst.entries[t_idx];
  t->deleted = 1;
  t->user_data = Val_unit;
  t->block = Val_unit;
  if (t_idx < trackst.delete) trackst.delete = t_idx;
  CAMLassert(t->idx_ptr == NULL);
}

/* The return value is an exception or [Val_unit] iff [*t_idx] is set
   to [Invalid_index]. In this case, the entry is deleted.
   Otherwise, the return value is a [Some(...)] block. */
Caml_inline value run_callback_exn(uintnat *t_idx, value cb, value param)
{
  struct tracked* t = &trackst.entries[*t_idx];
  value res;
  CAMLassert(!t->callback_running && t->idx_ptr == NULL);
  CAMLassert(lambda > 0.);

  callback_running = t->callback_running = 1;
  t->idx_ptr = t_idx;
  res = caml_callback_exn(cb, param);
  callback_running = 0;

  /* The call above can modify [*t_idx] and thus invalidate [t]. */
  if (*t_idx == Invalid_index) {
    /* Make sure this entry has not been removed by [caml_memprof_stop]. */
    return Val_unit;
  }
  t = &trackst.entries[*t_idx];
  t->idx_ptr = NULL;
  t->callback_running = 0;
  if (Is_exception_result(res) || res == Val_unit) {
    /* The callback raised an exception or returned [None] or [()];
       discard this entry. */
    mark_deleted(*t_idx);
    *t_idx = Invalid_index;
  }
  return res;
}
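/* Illustration (not compiled): [run_callback_exn] implements the
   [option] contract of the OCaml-level callbacks. A callback that
   returns [None] (or raises) stops tracking its block, while
   returning [Some v] threads [v] to the later promote and dealloc
   callbacks. A sketch, assuming the [Gc.Memprof] interface of recent
   OCaml versions:

     let tracker = {
       Gc.Memprof.null_tracker with
       alloc_minor =
         (fun a -> if a.n_samples > 1 then Some a.size else None);
       dealloc_minor =
         (fun size -> Printf.eprintf "freed a %d-word block\n" size);
     }
*/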
/* Run all the needed callbacks for a given entry.
   In case of a thread context switch during a callback, this can be
   called in a reentrant way.
   If [*t_idx] equals [trackst.callback], then this function
   increments [trackst.callback].
   The index of the entry may change. It is set to [Invalid_index] if
   the entry is discarded.
   Returns:
    - an exception result if a callback raised an exception;
    - Val_long(0) == Val_unit == None otherwise. */
static value handle_entry_callbacks_exn(uintnat* t_idx)
{
  value sample_info, res, user_data; /* No need to make these roots */
  struct tracked* t = &trackst.entries[*t_idx];
  if (*t_idx == trackst.callback) trackst.callback++;

  if (t->deleted || t->callback_running) return Val_unit;

  if (!t->cb_alloc_called) {
    t->cb_alloc_called = 1;
    CAMLassert(Is_block(t->block) || Is_placeholder(t->block)
               || t->deallocated);
    sample_info = caml_alloc_small(4, 0);
    Field(sample_info, 0) = Val_long(t->n_samples);
    Field(sample_info, 1) = Val_long(t->wosize);
    Field(sample_info, 2) = Val_long(t->unmarshalled);
    Field(sample_info, 3) = t->user_data;
    t->user_data = Val_unit;
    res = run_callback_exn(t_idx,
        t->alloc_young ? Alloc_minor(tracker) : Alloc_major(tracker),
        sample_info);
    if (*t_idx == Invalid_index) return res;
    CAMLassert(!Is_exception_result(res) && Is_block(res)
               && Tag_val(res) == 0 && Wosize_val(res) == 1);
    t = &trackst.entries[*t_idx];
    t->user_data = Field(res, 0);
    if (Is_block(t->user_data) && Is_young(t->user_data)
        && *t_idx < trackst.young)
      trackst.young = *t_idx;
  }

  if (t->promoted && !t->cb_promote_called) {
    t->cb_promote_called = 1;
    user_data = t->user_data;
    t->user_data = Val_unit;
    res = run_callback_exn(t_idx, Promote(tracker), user_data);
    if (*t_idx == Invalid_index) return res;
    CAMLassert(!Is_exception_result(res) && Is_block(res)
               && Tag_val(res) == 0 && Wosize_val(res) == 1);
    t = &trackst.entries[*t_idx];
    t->user_data = Field(res, 0);
    if (Is_block(t->user_data) && Is_young(t->user_data)
        && *t_idx < trackst.young)
      trackst.young = *t_idx;
  }

  if (t->deallocated && !t->cb_dealloc_called) {
    value cb = (t->promoted || !t->alloc_young) ?
      Dealloc_major(tracker) : Dealloc_minor(tracker);
    t->cb_dealloc_called = 1;
    user_data = t->user_data;
    t->user_data = Val_unit;
    res = run_callback_exn(t_idx, cb, user_data);
    /* [t] is invalid, but we no longer use it. */
    CAMLassert(*t_idx == Invalid_index);
    CAMLassert(Is_exception_result(res) || res == Val_unit);
    return res;
  }

  return Val_unit;
}

/* Remove any deleted entries, updating [callback] and [young]. */
static void flush_deleted(void)
{
  uintnat i = trackst.delete, j = i;
  while (i < trackst.len) {
    if (!trackst.entries[i].deleted) {
      if (trackst.entries[i].idx_ptr != NULL)
        *trackst.entries[i].idx_ptr = j;
      trackst.entries[j] = trackst.entries[i];
      j++;
    }
    i++;
    if (trackst.young == i) trackst.young = j;
    if (trackst.callback == i) trackst.callback = j;
  }
  trackst.delete = trackst.len = j;
  CAMLassert(trackst.callback <= trackst.len);
  CAMLassert(trackst.young <= trackst.len);
  realloc_trackst();
}

static void check_action_pending(void)
{
  if (!caml_memprof_suspended && trackst.callback < trackst.len)
    caml_set_action_pending();
}

/* In case of a thread context switch during a callback, this can be
   called in a reentrant way. */
value caml_memprof_handle_postponed_exn(void)
{
  value res = Val_unit;
  if (caml_memprof_suspended) return res;
  caml_memprof_suspended = 1;
  while (trackst.callback < trackst.len) {
    uintnat i = trackst.callback;
    res = handle_entry_callbacks_exn(&i);
    if (Is_exception_result(res)) break;
  }
  caml_memprof_suspended = 0;
  check_action_pending(); /* Needed in case of an exception. */
  flush_deleted();
  return res;
}

void caml_memprof_oldify_young_roots(void)
{
  uintnat i;
  /* This loop should always have a small number of iterations (when
     compared to the size of the minor heap), because the [young]
     pointer should always be close to the end of the array. Indeed,
     it is only moved back when returning from a callback triggered
     by an allocation or a promotion, which can only happen for
     blocks allocated recently, which are close to the end of the
     [trackst] array. */
  for (i = trackst.young; i < trackst.len; i++)
    caml_oldify_one(trackst.entries[i].user_data,
                    &trackst.entries[i].user_data);
}
void caml_memprof_minor_update(void)
{
  uintnat i;
  /* See the comment in [caml_memprof_oldify_young_roots] for the
     number of iterations of this loop. */
  for (i = trackst.young; i < trackst.len; i++) {
    struct tracked *t = &trackst.entries[i];
    CAMLassert(Is_block(t->block) || t->deleted || t->deallocated ||
               Is_placeholder(t->block));
    if (Is_block(t->block) && Is_young(t->block)) {
      if (Hd_val(t->block) == 0) {
        /* Block has been promoted. */
        t->block = Field(t->block, 0);
        t->promoted = 1;
      } else {
        /* Block is dead. */
        CAMLassert_young_header(Hd_val(t->block));
        t->block = Val_unit;
        t->deallocated = 1;
      }
    }
  }

  if (trackst.callback > trackst.young) {
    trackst.callback = trackst.young;
    check_action_pending();
  }
  trackst.young = trackst.len;
}

void caml_memprof_do_roots(scanning_action f)
{
  uintnat i;
  for (i = 0; i < trackst.len; i++)
    f(trackst.entries[i].user_data, &trackst.entries[i].user_data);
}

void caml_memprof_update_clean_phase(void)
{
  uintnat i;
  for (i = 0; i < trackst.len; i++) {
    struct tracked *t = &trackst.entries[i];
    if (Is_block(t->block) && !Is_young(t->block)) {
      CAMLassert(Is_in_heap(t->block));
      CAMLassert(!t->alloc_young || t->promoted);
      if (Is_white_val(t->block)) {
        t->block = Val_unit;
        t->deallocated = 1;
      }
    }
  }
  trackst.callback = 0;
  check_action_pending();
}

void caml_memprof_invert_tracked(void)
{
  uintnat i;
  for (i = 0; i < trackst.len; i++)
    caml_invert_root(trackst.entries[i].block, &trackst.entries[i].block);
}

/**** Sampling procedures ****/

void caml_memprof_track_alloc_shr(value block)
{
  uintnat n_samples;
  value callstack = 0;
  CAMLassert(Is_in_heap(block));

  /* This test also makes sure memprof is initialized. */
  if (lambda == 0 || caml_memprof_suspended) return;

  n_samples = rand_binom(Whsize_val(block));
  if (n_samples == 0) return;

  callstack = capture_callstack_postponed();
  if (callstack == 0) return;

  new_tracked(n_samples, Wosize_val(block), 0, 0, block, callstack);
  check_action_pending();
}

/* Shifts the next sample in the minor heap by [n] words. Essentially,
   this tells the sampler to ignore the next [n] words of the minor
   heap. */
static void shift_sample(uintnat n)
{
  if (caml_memprof_young_trigger - Caml_state->young_alloc_start > n)
    caml_memprof_young_trigger -= n;
  else
    caml_memprof_young_trigger = Caml_state->young_alloc_start;
  caml_update_young_limit();
}

/* Renew the next sample in the minor heap. This needs to be called
   after each minor sampling and after each minor collection. In
   practice, this is called at each sampling in the minor heap and at
   each minor collection. Extra calls do not change the statistical
   properties of the sampling because of the memorylessness of the
   geometric distribution. */
void caml_memprof_renew_minor_sample(void)
{
  if (lambda == 0) /* No trigger in the current minor heap. */
    caml_memprof_young_trigger = Caml_state->young_alloc_start;
  else {
    uintnat geom = rand_geom();
    if (Caml_state->young_ptr - Caml_state->young_alloc_start < geom)
      /* No trigger in the current minor heap. */
      caml_memprof_young_trigger = Caml_state->young_alloc_start;
    else
      caml_memprof_young_trigger = Caml_state->young_ptr - (geom - 1);
  }

  caml_update_young_limit();
}
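/* Illustrative helper (not used by the runtime): since every word has
   an independent probability [lambda] of being sampled, a block of
   [wosize] words plus its header receives a
   Binomial(Whsize_wosize(wosize), lambda) number of samples, whose
   mean is computed below. For example, with [lambda] = 1e-3, a
   99-word block is sampled 0.1 times on average. The helper name
   [mean_samples_sketch] is ours. */
Caml_inline double mean_samples_sketch(uintnat wosize)
{
  return lambda * (double)Whsize_wosize(wosize);
}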
/* Called when exceeding the threshold for the next sample in the
   minor heap, from the C code (the handling is different when called
   from natively compiled OCaml code). */
void caml_memprof_track_young(uintnat wosize, int from_caml,
                              int nallocs, unsigned char* encoded_alloc_lens)
{
  uintnat whsize = Whsize_wosize(wosize);
  value callstack, res = Val_unit;
  int alloc_idx = 0, i, allocs_sampled = 0, has_delete = 0;
  intnat alloc_ofs, trigger_ofs;
  /* Usually, only one allocation is sampled, even when the block
     contains multiple combined allocations. So we delay allocating
     the full [idx_tab] array until we discover that we actually need
     two entries. */
  uintnat first_idx, *idx_tab = &first_idx;
  double saved_lambda = lambda;

  if (caml_memprof_suspended) {
    caml_memprof_renew_minor_sample();
    return;
  }

  /* If [lambda == 0], then [caml_memprof_young_trigger] should be
     equal to [Caml_state->young_alloc_start]. But this function is
     only called with [Caml_state->young_alloc_start <=
     Caml_state->young_ptr < caml_memprof_young_trigger], which is
     contradictory. */
  CAMLassert(lambda > 0);

  if (!from_caml) {
    unsigned n_samples = 1 +
      rand_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
    CAMLassert(encoded_alloc_lens == NULL);  /* No Comballoc in C! */
    caml_memprof_renew_minor_sample();
    callstack = capture_callstack_postponed();
    if (callstack == 0) return;
    new_tracked(n_samples, wosize, 0, 1, Val_hp(Caml_state->young_ptr),
                callstack);
    check_action_pending();
    return;
  }

  /* We need to call the callbacks for this sampled block. Since each
     callback can potentially allocate, the sampled block will *not*
     be the one pointed to by [caml_memprof_young_trigger]. Instead,
     we remember that we need to sample the next allocated word, call
     the callback and use as a sample the block which will be
     allocated right after the callback. */
  CAMLassert(Caml_state->young_ptr < caml_memprof_young_trigger &&
             caml_memprof_young_trigger <= Caml_state->young_ptr + whsize);
  trigger_ofs = caml_memprof_young_trigger - Caml_state->young_ptr;
  alloc_ofs = whsize;

  /* Restore the minor heap in a valid state for calling the
     callbacks. We should not call the GC before these two
     instructions. */
  Caml_state->young_ptr += whsize;
  caml_memprof_renew_minor_sample();
  caml_memprof_suspended = 1;

  /* Perform the sampling of the block in the set of Comballoc'd
     blocks, insert them in the entries array, and run the
     callbacks. */
  for (alloc_idx = nallocs - 1; alloc_idx >= 0; alloc_idx--) {
    unsigned alloc_wosz = encoded_alloc_lens == NULL ? wosize :
      Wosize_encoded_alloc_len(encoded_alloc_lens[alloc_idx]);
    unsigned n_samples = 0;
    alloc_ofs -= Whsize_wosize(alloc_wosz);
    while (alloc_ofs < trigger_ofs) {
      n_samples++;
      trigger_ofs -= rand_geom();
    }

    if (n_samples > 0) {
      uintnat *idx_ptr, t_idx;

      callstack = capture_callstack(alloc_idx);
      t_idx = new_tracked(n_samples, alloc_wosz, 0, 1,
                          Placeholder_offs(alloc_ofs), callstack);
      if (t_idx == Invalid_index) continue;
      res = handle_entry_callbacks_exn(&t_idx);
      if (t_idx == Invalid_index) {
        has_delete = 1;
        if (saved_lambda != lambda) {
          /* [lambda] changed during the callback. We need to refresh
             [trigger_ofs]. */
          saved_lambda = lambda;
          trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (rand_geom() - 1);
        }
      }
      if (Is_exception_result(res)) break;
      if (t_idx == Invalid_index) continue;

      if (allocs_sampled == 1) {
        /* Found a second sampled allocation! Allocate a buffer for
           them. */
        idx_tab = caml_stat_alloc_noexc(sizeof(uintnat) * nallocs);
        if (idx_tab == NULL) {
          alloc_ofs = 0;
          idx_tab = &first_idx;
          break;
        }
        idx_tab[0] = first_idx;
        if (idx_tab[0] != Invalid_index)
          trackst.entries[idx_tab[0]].idx_ptr = &idx_tab[0];
      }

      /* Usually, [trackst.entries[...].idx_ptr] is owned by the
         thread running a callback for the entry, if any. Here, we
         take ownership of [idx_ptr] until the end of the function.
         This does not conflict with the usual use of [idx_ptr]
         because no callbacks can run on this entry until the end of
         the function: the allocation callback has already run, and
         the other callbacks do not run on placeholder values. */
      idx_ptr = &idx_tab[allocs_sampled];
      *idx_ptr = t_idx;
      trackst.entries[*idx_ptr].idx_ptr = idx_ptr;
      allocs_sampled++;
    }
  }

  CAMLassert(alloc_ofs == 0 || Is_exception_result(res));
  CAMLassert(allocs_sampled <= nallocs);

  caml_memprof_suspended = 0;
  check_action_pending();
  /* We need to call [check_action_pending] since we reset
     [caml_memprof_suspended] to 0 (a GC collection may have triggered
     some new callbacks). We also need to make sure that the action
     pending flag is not set systematically: [new_tracked] created new
     entries without updating [trackst.callback], which could leave a
     spurious pending callback. Fortunately,
     [handle_entry_callbacks_exn] increments [trackst.callback] if it
     is equal to [t_idx]. */

  /* This condition arises either in the case of an exception or when
     one of the callbacks returned [None]. If these cases happen
     frequently, then we need to call [flush_deleted] somewhere to
     prevent a leak. */
  if (has_delete) flush_deleted();

  if (Is_exception_result(res)) {
    for (i = 0; i < allocs_sampled; i++)
      if (idx_tab[i] != Invalid_index) {
        struct tracked* t = &trackst.entries[idx_tab[i]];
        /* The allocations are cancelled because of the exception,
           but this callback has already been called. We simulate a
           deallocation. */
        t->block = Val_unit;
        t->deallocated = 1;
        if (trackst.callback > idx_tab[i]) {
          trackst.callback = idx_tab[i];
          check_action_pending();
        }
      }
    if (idx_tab != &first_idx) caml_stat_free(idx_tab);
    caml_raise(Extract_exception(res));
  }

  /* We can now restore the minor heap in the state needed by
     [Alloc_small_aux]. */
  if (Caml_state->young_ptr - whsize < Caml_state->young_trigger) {
    CAML_EV_COUNTER(EV_C_FORCE_MINOR_MEMPROF, 1);
    caml_gc_dispatch();
  }

  /* Re-allocate the blocks in the minor heap. We should not call the
     GC after this. */
  Caml_state->young_ptr -= whsize;

  /* Make sure this block is not going to be sampled again. */
  shift_sample(whsize);

  for (i = 0; i < allocs_sampled; i++) {
    if (idx_tab[i] != Invalid_index) {
      /* If the execution of the callback has succeeded, then we start
         the tracking of this block.

         Subtlety: we are actually writing [t->block] with an invalid
         (uninitialized) block. This is correct because the allocation
         and initialization happen right after returning from
         [caml_memprof_track_young]. */
      struct tracked *t = &trackst.entries[idx_tab[i]];
      t->block = Val_hp(Caml_state->young_ptr + Offs_placeholder(t->block));
      t->idx_ptr = NULL;
      CAMLassert(t->cb_alloc_called);
      if (idx_tab[i] < trackst.young) trackst.young = idx_tab[i];
    }
  }

  if (idx_tab != &first_idx) caml_stat_free(idx_tab);

  /* /!\ Since the heap is in an invalid state before initialization,
     very few heap operations are allowed until then. */
  return;
}

void caml_memprof_track_interned(header_t* block, header_t* blockend)
{
  header_t *p;
  value callstack = 0;
  int is_young = Is_young(Val_hp(block));

  if (lambda == 0 || caml_memprof_suspended) return;

  p = block;
  while (1) {
    uintnat next_sample = rand_geom();
    header_t *next_sample_p, *next_p;
    if (next_sample > blockend - p) break;
    /* [next_sample_p] is the block *following* the next sampled
       block! */
    next_sample_p = p + next_sample;
    while (1) {
      next_p = p + Whsize_hp(p);
      if (next_p >= next_sample_p) break;
      p = next_p;
    }

    if (callstack == 0) callstack = capture_callstack_postponed();
    if (callstack == 0) break;  /* OOM */
    new_tracked(rand_binom(next_p - next_sample_p) + 1, Wosize_hp(p),
                1, is_young, Val_hp(p), callstack);
    p = next_p;
  }
  check_action_pending();
}

/**** Interface with the OCaml code. ****/

static void caml_memprof_init(void)
{
  init = 1;
  xoshiro_init();
}

void caml_memprof_shutdown(void)
{
  init = 0;
  started = 0;
  lambda = 0.;
  caml_memprof_suspended = 0;
  trackst.len = 0;
  trackst.callback = trackst.young = trackst.delete = 0;
  caml_stat_free(trackst.entries);
  trackst.entries = NULL;
  trackst.alloc_len = 0;
  caml_stat_free(callstack_buffer);
  callstack_buffer = NULL;
  callstack_buffer_len = 0;
}

CAMLprim value caml_memprof_start(value lv, value szv, value tracker_param)
{
  CAMLparam3(lv, szv, tracker_param);
  double l = Double_val(lv);
  intnat sz = Long_val(szv);

  if (started) caml_failwith("Gc.Memprof.start: already started.");

  if (sz < 0 || !(l >= 0.) || l > 1.) /* Checks that [l] is not NaN. */
    caml_invalid_argument("Gc.Memprof.start");

  if (!init) caml_memprof_init();

  lambda = l;
  if (l > 0) {
    one_log1m_lambda = l == 1 ? 0 : 1/caml_log1p(-l);
    rand_pos = RAND_BLOCK_SIZE;
    next_rand_geom = rand_geom();
  }

  caml_memprof_renew_minor_sample();

  callstack_size = sz;

  started = 1;
  tracker = tracker_param;
  caml_register_generational_global_root(&tracker);

  CAMLreturn(Val_unit);
}

CAMLprim value caml_memprof_stop(value unit)
{
  uintnat i;
  if (!started) caml_failwith("Gc.Memprof.stop: not started.");

  /* This call to [caml_memprof_stop] will discard all the previously
     tracked blocks. We try one last time to call the postponed
     callbacks. */
  caml_raise_if_exception(caml_memprof_handle_postponed_exn());

  /* Discard the tracked blocks. */
  for (i = 0; i < trackst.len; i++)
    if (trackst.entries[i].idx_ptr != NULL)
      *trackst.entries[i].idx_ptr = Invalid_index;
  trackst.len = 0;
  trackst.callback = trackst.young = trackst.delete = 0;
  caml_stat_free(trackst.entries);
  trackst.entries = NULL;
  trackst.alloc_len = 0;

  lambda = 0;
  caml_memprof_renew_minor_sample();
  started = 0;
  caml_remove_generational_global_root(&tracker);

  caml_stat_free(callstack_buffer);
  callstack_buffer = NULL;
  callstack_buffer_len = 0;

  return Val_unit;
}

/**** Interface with systhread. ****/

void caml_memprof_init_th_ctx(struct caml_memprof_th_ctx* ctx)
{
  ctx->suspended = 0;
  ctx->callback_running = 0;
}

void caml_memprof_stop_th_ctx(struct caml_memprof_th_ctx* ctx)
{
  /* Make sure that no memprof callback is being executed in this
     thread. If so, memprof data structures may have pointers to the
     thread's stack. */
  if (ctx->callback_running)
    caml_fatal_error("Thread.exit called from a memprof callback.");
}

void caml_memprof_save_th_ctx(struct caml_memprof_th_ctx* ctx)
{
  ctx->suspended = caml_memprof_suspended;
  ctx->callback_running = callback_running;
}

void caml_memprof_restore_th_ctx(const struct caml_memprof_th_ctx* ctx)
{
  caml_memprof_suspended = ctx->suspended;
  callback_running = ctx->callback_running;
  check_action_pending();
}
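/* Illustration (not compiled): [caml_memprof_start] and
   [caml_memprof_stop] are the C side of [Gc.Memprof.start] and
   [Gc.Memprof.stop]. A sketch of a profiling session at the OCaml
   level, assuming the interface of recent OCaml versions (the
   [sampling_rate] is [lambda], and [callstack_size] is the limit used
   by the callstack capture above):

     let hits = ref 0 in
     Gc.Memprof.start ~sampling_rate:1e-4 ~callstack_size:16
       { Gc.Memprof.null_tracker with
         alloc_minor = (fun _ -> incr hits; None);
         alloc_major = (fun _ -> incr hits; None) };
     (* ... run the code being profiled ... *)
     Gc.Memprof.stop ();
     Printf.printf "sampled allocations: %d\n" !hits
*/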