ocaml/runtime/memprof.c

/**************************************************************************/
/*                                                                        */
/*                                 OCaml                                  */
/*                                                                        */
/*             Jacques-Henri Joudan, projet Gallium, INRIA Paris          */
/*                                                                        */
/*   Copyright 2016 Institut National de Recherche en Informatique et     */
/*     en Automatique.                                                    */
/*                                                                        */
/*   All rights reserved.  This file is distributed under the terms of    */
/*   the GNU Lesser General Public License version 2.1, with the          */
/*   special exception on linking described in the file LICENSE.          */
/*                                                                        */
/**************************************************************************/

#define CAML_INTERNALS

#include <math.h>
#include <string.h>
#include "caml/memprof.h"
#include "caml/fail.h"
#include "caml/alloc.h"
#include "caml/callback.h"
#include "caml/signals.h"
#include "caml/memory.h"
#include "caml/minor_gc.h"
#include "caml/backtrace_prim.h"
#include "caml/weak.h"
#include "caml/stack.h"

static uint32_t mt_state[624];
static uint32_t mt_index;

/* [lambda] is the mean number of samples for each allocated word (including
   block headers). */
static double lambda = 0;
int caml_memprof_suspended = 0;
static intnat callstack_size = 0;
static value memprof_callback = Val_unit;

/* Whether memprof has been initialized.  */
static int init = 0;

/**** Statistical sampling ****/

static double mt_generate_uniform(void) {
  int i;
  uint32_t y;

  /* Mersenne twister PRNG */
  if (mt_index == 624) {
    for(i = 0; i < 227; i++) {
      y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
      mt_state[i] = mt_state[i+397] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
    }
    for(i = 227; i < 623; i++) {
      y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
      mt_state[i] = mt_state[i-227] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
    }
    y = (mt_state[623] & 0x80000000) + (mt_state[0] & 0x7fffffff);
    mt_state[623] = mt_state[396] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
    mt_index = 0;
  }

  y = mt_state[mt_index];
  y = y ^ (y >> 11);
  y = y ^ ((y << 7) & 0x9d2c5680);
  y = y ^ ((y << 15) & 0xefc60000);
  y = y ^ (y >> 18);

  mt_index++;
  return y*2.3283064365386962890625e-10 + /* 2^-32 */
          1.16415321826934814453125e-10; /* 2^-33 */
}

/* C99's [lgammaf] function is not available in some compilers
   (including MSVC). Here is our own approximate implementation of the
   log-factorial function.

   Requirement: [n] is a non-negative integer.

   We use Ramanujan's formula. For n < 10^8, the absolute error is
   less than 10^-6, which is way better than what we need.
 */
static double lfact(double n) {
  static double tab[10] = {
    0.0000000000, 0.0000000000, 0.6931471806, 1.7917594692, 3.1780538303,
    4.7874917428, 6.5792512120, 8.5251613611, 10.6046029027, 12.8018274801 };
  if(n < 10)
    return tab[(int)n];
  return n*(log(n) - 1) + (1./6)*log(((8*n + 4)*n + 1)*n + 1./34)
       + 0.5723649429; /* log(pi)/2 */
}

#define MAX_MT_GENERATE_POISSON (1<<29)
static double next_mt_generate_poisson;
/* Simulate a Poisson distribution of parameter [lambda*len].  The
   result is clipped to the interval [0..MAX_MT_GENERATE_POISSON] */
static int32_t mt_generate_poisson(double len) {
  double cur_lambda = lambda * len;
  CAMLassert(cur_lambda >= 0 && cur_lambda < 1e20);

  if(caml_memprof_suspended || cur_lambda == 0)
    return 0;

  if(cur_lambda < 20) {
    /* First algorithm when [cur_lambda] is small: we proceed by
       repeated simulations of exponential distributions. */

    next_mt_generate_poisson -= cur_lambda;
    if(next_mt_generate_poisson > 0) {
      /* Fast path if [cur_lambda] is small: we reuse the same
         exponential sample accross several calls to
         [mt_generate_poisson]. */
      return 0;
    } else {
      /* We use the float versions of exp/log, since these functions
         are significantly faster, and we really don't need much
         precision here. The entropy contained in
         [next_mt_generate_poisson] is anyway bounded by the entropy
         provided by [mt_generate_uniform], which is 32bits. */
      double p = expf(-next_mt_generate_poisson);
      int32_t k = 0;
      do {
        k++;
        p *= mt_generate_uniform();
      } while(p > 1);

      /* [p] is now uniformly distributed in [0, 1] and independent
         from other variables (including [k]). We can therefore reuse
         [p] for reinitializing [next_mt_generate_poisson]. */
      next_mt_generate_poisson = -logf(p);

      return k;
    }

  } else {
    /* Second algorithm when [cur_lambda] is large. Taken from: */
    /* The Computer Generation of Poisson Random Variables
       A. C. Atkinson Journal of the Royal Statistical Society.
       Series C (Applied Statistics) Vol. 28, No. 1 (1979), pp. 29-35
       "Method PA" */

    double c, beta_inverse, k, log_cur_lambda;
    log_cur_lambda = log(cur_lambda);
    c = 0.767 - 3.36/cur_lambda;
    beta_inverse = sqrt(0.30396355092701332623 * cur_lambda);
                        /* ^ = 3./(PI*PI) */
    k = log(c*beta_inverse) - cur_lambda;
    while(1) {
      double u, n, v, y;
      u = mt_generate_uniform();
      y = log(1./u-1);
      n = floor(cur_lambda - y*beta_inverse + 0.5);
      if(n < 0.)
        continue;
      v = mt_generate_uniform();
      /* When [cur_lambda] is large, we expect [n*log_cur_lambda] and
         [lfact(n)] to be close, while both being relatively
         large. Hence, here, we may actually need the double precision
         in the computation of log and lfact. */
      if(y + log(v*u*u) < k + n*log_cur_lambda - lfact(n))
        return n > MAX_MT_GENERATE_POISSON ? MAX_MT_GENERATE_POISSON : n;
    }
  }
}

/**** Interface with the OCaml code. ****/

CAMLprim value caml_memprof_set(value v) {
  CAMLparam1(v);
  double l = Double_val(Field(v, 0));
  intnat sz = Long_val(Field(v, 1));

  if(sz < 0 || !(l >= 0.) || l > 1.)
    caml_failwith("caml_memprof_set");

  if(!init) {
    int i;
    init = 1;

    mt_index = 624;
    mt_state[0] = 42;
    for(i = 1; i < 624; i++)
      mt_state[i] = 0x6c078965 * (mt_state[i-1] ^ (mt_state[i-1] >> 30)) + i;

    caml_register_generational_global_root(&memprof_callback);

    next_mt_generate_poisson = -logf(mt_generate_uniform());
  }

  lambda = l;
  callstack_size = sz;
  caml_modify_generational_global_root(&memprof_callback, Field(v, 2));

  CAMLreturn(Val_unit);
}

/* Cf. Gc.Memprof.alloc_kind */
enum ml_alloc_kind {
  Minor = Val_long(0),
  Major = Val_long(1),
  Serialized = Val_long(2)
};

static value do_callback(tag_t tag, intnat wosize, int32_t occurences,
                         value callstack, enum ml_alloc_kind cb_kind) {
  CAMLparam1(callstack);
  CAMLlocal1(sample_info);
  CAMLassert(occurences > 0);

  sample_info = caml_alloc_small(5, 0);
  Field(sample_info, 0) = Val_long(occurences);
  Field(sample_info, 1) = cb_kind;
  Field(sample_info, 2) = Val_long(tag);
  Field(sample_info, 3) = Val_long(wosize);
  Field(sample_info, 4) = callstack;

  CAMLreturn(caml_callback_exn(memprof_callback, sample_info));
}

void caml_memprof_set_suspended(int new_suspended) {
  caml_memprof_suspended = new_suspended;
}

/**** Sampling procedures ****/

static value capture_callstack() {
  value res;
  intnat size = caml_current_callstack_size(callstack_size);
  /* We do not use [caml_alloc] to make sure the GC will not get called. */
  if(size == 0) return Atom (0);
  res = caml_alloc_shr_no_track(size, 0);
  caml_current_callstack_write(res);
  return res;
}

struct caml_memprof_postponed_block {
  value block;
  value callstack;
  int32_t occurences;
  struct caml_memprof_postponed_block* next;
} static *caml_memprof_postponed_head = NULL;

/* When allocating in the major heap, we cannot call the callback,
   because [caml_alloc_shr] is guaranteed not to call the GC. Hence,
   this function determines if the block needs to be sampled, and if
   so, it registers the block in the todo-list so that the callback
   call is performed when possible. */
void caml_memprof_track_alloc_shr(value block) {
  int32_t occurences = mt_generate_poisson(Whsize_val(block));
  CAMLassert(Is_in_heap(block));
  if(occurences > 0) {
    struct caml_memprof_postponed_block* pb =
      caml_stat_alloc_noexc(sizeof(struct caml_memprof_postponed_block));
    value callstack = capture_callstack();
    if(pb == NULL) return;
    pb->block = block;
    caml_register_generational_global_root(&pb->block);
    pb->callstack = callstack;
    caml_register_generational_global_root(&pb->callstack);
    pb->occurences = occurences;
    pb->next = caml_memprof_postponed_head;
    caml_memprof_postponed_head = pb;
#ifndef NATIVE_CODE
    caml_something_to_do = 1;
#else
    caml_young_limit = caml_young_alloc_end;
#endif
  }
}

void caml_memprof_handle_postponed() {
  struct caml_memprof_postponed_block *p, *q;
  value ephe;

  if(caml_memprof_postponed_head == NULL)
    return;

  // We first reverse the list
  p = caml_memprof_postponed_head;
  q = caml_memprof_postponed_head->next;
  p->next = NULL;
  while(q != NULL) {
    struct caml_memprof_postponed_block* next = q->next;
    q->next = p;
    p = q;
    q = next;
  }
  caml_memprof_postponed_head = NULL;

#define NEXT_P \
  { struct caml_memprof_postponed_block* next = p->next;   \
    caml_remove_generational_global_root(&p->callstack);   \
    caml_remove_generational_global_root(&p->block);       \
    caml_stat_free(p);                                     \
    p = next; }

  caml_memprof_set_suspended(1);
  // We then do the actual iteration on postponed blocks
  while(p != NULL) {
    ephe = do_callback(Tag_val(p->block), Wosize_val(p->block),
                       p->occurences, p->callstack, Major);
    if (Is_exception_result(ephe)) {
      caml_memprof_set_suspended(0);
      // In the case of an exception, we just forget the entire list.
      while(p != NULL) NEXT_P;
      caml_raise(Extract_exception(ephe));
    }
    if(Is_block(ephe))
      caml_ephemeron_set_key(Field(ephe, 0), 0, p->block);
    NEXT_P;
  }
  caml_memprof_set_suspended(0);
}