Memprof: optimize random sampling (#9466)

Instead of calling the stdlib logf function to compute logarithms, we
use a faster polynomial-based approximation.
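
As a rough standalone sketch of the technique (the approximation
function below mirrors the one added to runtime/memprof.c in this
commit; the comparison harness around it is hypothetical):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Bit-trick logarithm: reinterpret the float to split it into
   exponent and mantissa, approximate log of the mantissa (in [1, 2))
   with a degree-3 polynomial, then add exp*log(2) plus constants. */
static float log_approx(uint32_t y)
{
  union { float f; int32_t i; } u;
  float exp, x;
  u.f = y + 0.5f;
  exp = u.i >> 23;
  u.i = (u.i & 0x7FFFFF) | 0x3F800000;
  x = u.f;
  return x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))
         + (-111.701724334061f + 0.6931471805f * exp);
}

int main(void)
{
  uint32_t y;
  /* Spot-check against logf((y+0.5)/2^32) at a few points. */
  for (y = 1; y != 0; y <<= 4)
    printf("y=%10lu approx=%11.6f exact=%11.6f\n", (unsigned long)y,
           log_approx(y), logf((y + 0.5f) / 4294967296.0f));
  return 0;
}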

We use the xoshiro PRNG instead of the Mersenne Twister.  xoshiro is
simpler and faster.
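
The per-lane recurrence added here is that of xoshiro128+ (Blackman
and Vigna), seeded from splitmix64. A minimal scalar sketch for
reference (the patch keeps RAND_BLOCK_SIZE copies of this state and
derives two 32-bit state words from each 64-bit splitmix64 output):

#include <stdint.h>

static uint32_t s[4]; /* generator state; must not be all zero */

static uint32_t rotl(uint32_t x, int k)
{
  return (x << k) | (x >> (32 - k));
}

/* One xoshiro128+ step: an xorshift core plus a final rotation. */
static uint32_t xoshiro128plus_next(void)
{
  uint32_t res = s[0] + s[3];
  uint32_t t = s[1] << 9;
  s[2] ^= s[0];
  s[3] ^= s[1];
  s[1] ^= s[2];
  s[0] ^= s[3];
  s[2] ^= t;
  s[3] = rotl(s[3], 11);
  return res;
}

/* Expand a small seed into a full state with splitmix64 steps. */
static void xoshiro128plus_seed(uint64_t x)
{
  int i;
  for (i = 0; i < 4; i++) {
    uint64_t z = (x += 0x9E3779B97F4A7C15ull);
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
    s[i] = (uint32_t)(z ^ (z >> 31));
  }
}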

We generate samples in batches so that compilers can vectorize the
generation loops using SIMD instructions when possible.
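
A minimal sketch of the pattern (hypothetical names; the real loops
are in rand_batch below): fixed-length loops over small arrays, with
no cross-iteration dependencies, that GCC and Clang can turn into
SIMD code:

#define BLOCK 64

#if defined(__GNUC__) && !defined(__clang__)
/* Enable GCC's loop vectorizer for this function regardless of the
   global optimization flags; Clang vectorizes such loops at -O2. */
__attribute__((optimize("tree-vectorize")))
#endif
static void affine_block(const float *in, float *out, float c)
{
  int i;
  /* Fixed trip count and independent iterations: easy to SIMD,
     same shape as the middle loop of rand_batch. */
  for (i = 0; i < BLOCK; i++)
    out[i] = 1.0f + in[i] * c;
}
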
Jacques-Henri Jourdan 2020-05-25 09:51:15 +02:00 committed by GitHub
parent b2f467abc0
commit 3d63a106b6
6 changed files with 178 additions and 63 deletions

Changes

@@ -14,6 +14,9 @@ Working version
adding support for Musl ppc64le along the way.
(Xavier Leroy and Anil Madhavapeddy, review by Stephen Dolan)
- #9466: Memprof: optimize random samples generation.
(Jacques-Henri Jourdan, review by Xavier Leroy and Stephen Dolan)
- #9508: Remove support for FreeBSD prior to 4.0R, which required explicit
floating-point initialization to behave like the IEEE standard
(Hannes Mehnert, review by David Allsopp)

aclocal.m4

@@ -94,6 +94,22 @@ AC_DEFUN([OCAML_CC_SUPPORTS_ALIGNED], [
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])])
AC_DEFUN([OCAML_CC_SUPPORTS_TREE_VECTORIZE], [
AC_MSG_CHECKING(
[whether the C compiler supports __attribute__((optimize("tree-vectorize")))])
saved_CFLAGS="$CFLAGS"
CFLAGS="-Werror $CFLAGS"
AC_COMPILE_IFELSE(
[AC_LANG_SOURCE([
__attribute__((optimize("tree-vectorize"))) void f(void){}
int main() { f(); return 0; }
])],
[AC_DEFINE([SUPPORTS_TREE_VECTORIZE])
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
CFLAGS="$saved_CFLAGS"
])
AC_DEFUN([OCAML_CC_HAS_DEBUG_PREFIX_MAP], [
AC_MSG_CHECKING([whether the C compiler supports -fdebug-prefix-map])
saved_CFLAGS="$CFLAGS"

configure

@@ -13783,6 +13783,32 @@ $as_echo "no" >&6; }
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
## Check whether __attribute__((optimize("tree-vectorize"))) is supported
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler supports __attribute__((optimize(\"tree-vectorize\")))" >&5
$as_echo_n "checking whether the C compiler supports __attribute__((optimize(\"tree-vectorize\")))... " >&6; }
saved_CFLAGS="$CFLAGS"
CFLAGS="-Werror $CFLAGS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
__attribute__((optimize("tree-vectorize"))) void f(void){}
int main() { f(); return 0; }
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define SUPPORTS_TREE_VECTORIZE 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$saved_CFLAGS"
# Configure the native-code compiler
arch=none

configure.ac

@@ -907,6 +907,9 @@ AS_CASE(["$CC,$host"],
OCAML_CC_SUPPORTS_ALIGNED
## Check whether __attribute__((optimize("tree-vectorize"))) is supported
OCAML_CC_SUPPORTS_TREE_VECTORIZE
# Configure the native-code compiler
arch=none

runtime/caml/s.h.in

@@ -98,3 +98,5 @@
#undef FUNCTION_SECTIONS
#undef SUPPORTS_ALIGNED_ATTRIBUTE
#undef SUPPORTS_TREE_VECTORIZE

runtime/memprof.c

@@ -15,7 +15,6 @@
#define CAML_INTERNALS
#include <math.h>
#include <string.h>
#include "caml/memprof.h"
#include "caml/fail.h"
@@ -32,10 +31,13 @@
#include "caml/printexc.h"
#include "caml/eventlog.h"
#define MT_STATE_SIZE 624
#define RAND_BLOCK_SIZE 64
static uint32_t mt_state[MT_STATE_SIZE];
static uint32_t mt_index;
static uint32_t xoshiro_state[4][RAND_BLOCK_SIZE];
static uintnat rand_geom_buff[RAND_BLOCK_SIZE];
static uint32_t rand_pos;
/* [lambda] is the mean number of samples for each allocated word (including
block headers). */
@@ -43,7 +45,7 @@ static double lambda = 0;
/* Precomputed value of [1/log(1-lambda)], for fast sampling of
geometric distribution.
Dummy if [lambda = 0]. */
static double one_log1m_lambda;
static float one_log1m_lambda;
/* [caml_memprof_suspended] is used for masking memprof callbacks when
a callback is running or when an uncaught exception handler is
@@ -66,7 +68,6 @@ static intnat callstack_size;
static value tracker;
/* Pointer to the word following the next sample in the minor
heap. Equals [Caml_state->young_alloc_start] if no sampling is planned in
the current minor heap.
@@ -86,55 +87,124 @@ static intnat callstack_buffer_len = 0;
/**** Statistical sampling ****/
static double mt_generate_uniform(void)
{
int i;
uint32_t y;
Caml_inline uint64_t splitmix64_next(uint64_t* x) {
uint64_t z = (*x += 0x9E3779B97F4A7C15ull);
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
return z ^ (z >> 31);
}
/* Mersenne twister PRNG */
if (mt_index == MT_STATE_SIZE) {
for (i = 0; i < 227; i++) {
y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
mt_state[i] = mt_state[i+397] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
}
for (i = 227; i < MT_STATE_SIZE - 1; i++) {
y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
mt_state[i] = mt_state[i-227] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
}
y = (mt_state[MT_STATE_SIZE - 1] & 0x80000000) + (mt_state[0] & 0x7fffffff);
mt_state[MT_STATE_SIZE - 1] =
mt_state[396] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
mt_index = 0;
static void xoshiro_init(void) {
int i;
uint64_t splitmix64_state = 42;
rand_pos = RAND_BLOCK_SIZE;
for (i = 0; i < RAND_BLOCK_SIZE; i++) {
uint64_t t = splitmix64_next(&splitmix64_state);
xoshiro_state[0][i] = t & 0xFFFFFFFF;
xoshiro_state[1][i] = t >> 32;
t = splitmix64_next(&splitmix64_state);
xoshiro_state[2][i] = t & 0xFFFFFFFF;
xoshiro_state[3][i] = t >> 32;
}
}
Caml_inline uint32_t xoshiro_next(int i) {
uint32_t res = xoshiro_state[0][i] + xoshiro_state[3][i];
uint32_t t = xoshiro_state[1][i] << 9;
xoshiro_state[2][i] ^= xoshiro_state[0][i];
xoshiro_state[3][i] ^= xoshiro_state[1][i];
xoshiro_state[1][i] ^= xoshiro_state[2][i];
xoshiro_state[0][i] ^= xoshiro_state[3][i];
xoshiro_state[2][i] ^= t;
t = xoshiro_state[3][i];
xoshiro_state[3][i] = (t << 11) | (t >> 21);
return res;
}
/* Computes [log((y+0.5)/2^32)], to relatively good precision,
and guarantees that the result is negative.
The average absolute error is very close to 0. */
Caml_inline float log_approx(uint32_t y) {
union { float f; int32_t i; } u;
float exp, x;
u.f = y + 0.5f; /* We convert y to a float ... */
exp = u.i >> 23; /* ... of which we extract the exponent ... */
u.i = (u.i & 0x7FFFFF) | 0x3F800000;
x = u.f; /* ... and the mantissa. */
return
/* This polynomial computes the logarithm of the mantissa (which
is in [1, 2]), up to an additive constant. It is chosen such that:
- Its degree is 3.
- Its average value is that of log in [1, 2]
(the sampling has the right mean when lambda is small).
- f(1) = f(2) - log(2) = -159*log(2) - 1e-5
(this guarantees that log_approx(y) is always <= -1e-5 < 0).
- The maximum of abs(f(x)-log(x)+159*log(2)) is minimized.
*/
x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))
/* Then, we add the term corresponding to the exponent, and
additive constants. */
+ (-111.701724334061f + 0.6931471805f*exp);
}
/* This function regenerates [RAND_BLOCK_SIZE] geometric random
variables at once. Doing this in batches helps performance: many
compilers (e.g., GCC, Clang, ICC) can then use SIMD instructions
for a significant speed boost.
*/
#ifdef SUPPORTS_TREE_VECTORIZE
__attribute__((optimize("tree-vectorize")))
#endif
static void rand_batch(void) {
int i;
/* Instead of using temporary buffers, we could use one big loop,
but compilers' SIMD optimizations turn out to be more fragile
with larger loops. */
static uint32_t A[RAND_BLOCK_SIZE];
static float B[RAND_BLOCK_SIZE];
CAMLassert(lambda > 0.);
/* Shuffle the xoshiro samplers, and generate uniform variables in A. */
for(i = 0; i < RAND_BLOCK_SIZE; i++)
A[i] = xoshiro_next(i);
/* Generate exponential random variables by computing logarithms. We
do not use math.h library functions, which are slow and prevent the
compiler from using SIMD instructions. */
for(i = 0; i < RAND_BLOCK_SIZE; i++)
B[i] = 1 + log_approx(A[i]) * one_log1m_lambda;
/* We do the final flooring to generate the geometric
variables. Compilers are unlikely to use SIMD instructions for
this loop, because it involves a conditional and variables of
different sizes (32 and 64 bits). */
for(i = 0; i < RAND_BLOCK_SIZE; i++) {
double f = B[i];
CAMLassert (f >= 1);
if(f > Max_long) rand_geom_buff[i] = Max_long;
else rand_geom_buff[i] = (uintnat)f;
}
y = mt_state[mt_index];
y = y ^ (y >> 11);
y = y ^ ((y << 7) & 0x9d2c5680);
y = y ^ ((y << 15) & 0xefc60000);
y = y ^ (y >> 18);
mt_index++;
return y*2.3283064365386962890625e-10 + /* 2^-32 */
1.16415321826934814453125e-10; /* 2^-33 */
rand_pos = 0;
}
/* Simulate a geometric variable of parameter [lambda].
The result is clipped to [1..Max_long]. */
static uintnat mt_generate_geom(void)
static uintnat rand_geom(void)
{
double res;
uintnat res;
CAMLassert(lambda > 0.);
/* We use the float versions of exp/log, since these functions are
significantly faster, and we really don't need much precision
here. The entropy contained in [next_mt_generate_geom] is anyway
bounded by the entropy provided by [mt_generate_uniform], which
is 32 bits. */
res = 1 + logf(mt_generate_uniform()) * one_log1m_lambda;
if (res > Max_long) return Max_long;
return (uintnat)res;
if(rand_pos == RAND_BLOCK_SIZE) rand_batch();
res = rand_geom_buff[rand_pos++];
CAMLassert(1 <= res && res <= Max_long);
return res;
}
static uintnat next_mt_generate_geom;
static uintnat next_rand_geom;
/* Simulate a binomial variable of parameters [len] and [lambda].
This sampling algorithm has running time linear in [len *
lambda]. We could use a more involved algorithm, but this should
@@ -146,13 +216,13 @@ static uintnat next_mt_generate_geom;
Hormann, Wolfgang. "The generation of binomial random variates."
Journal of Statistical Computation and Simulation 46.1-2 (1993), pp. 101-110.
*/
static uintnat mt_generate_binom(uintnat len)
static uintnat rand_binom(uintnat len)
{
uintnat res;
CAMLassert(lambda > 0. && len < Max_long);
for (res = 0; next_mt_generate_geom < len; res++)
next_mt_generate_geom += mt_generate_geom();
next_mt_generate_geom -= len;
for (res = 0; next_rand_geom < len; res++)
next_rand_geom += rand_geom();
next_rand_geom -= len;
return res;
}
@@ -583,7 +653,7 @@ void caml_memprof_track_alloc_shr(value block)
/* This test also makes sure memprof is initialized. */
if (lambda == 0 || caml_memprof_suspended) return;
n_samples = mt_generate_binom(Whsize_val(block));
n_samples = rand_binom(Whsize_val(block));
if (n_samples == 0) return;
callstack = capture_callstack_postponed();
@@ -617,7 +687,7 @@ void caml_memprof_renew_minor_sample(void)
if (lambda == 0) /* No trigger in the current minor heap. */
caml_memprof_young_trigger = Caml_state->young_alloc_start;
else {
uintnat geom = mt_generate_geom();
uintnat geom = rand_geom();
if (Caml_state->young_ptr - Caml_state->young_alloc_start < geom)
/* No trigger in the current minor heap. */
caml_memprof_young_trigger = Caml_state->young_alloc_start;
@@ -656,7 +726,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
if (!from_caml) {
unsigned n_samples = 1 +
mt_generate_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
rand_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
CAMLassert(encoded_alloc_lens == NULL); /* No Comballoc in C! */
caml_memprof_renew_minor_sample();
@@ -697,7 +767,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
alloc_ofs -= Whsize_wosize(alloc_wosz);
while (alloc_ofs < trigger_ofs) {
n_samples++;
trigger_ofs -= mt_generate_geom();
trigger_ofs -= rand_geom();
}
if (n_samples > 0) {
uintnat *idx_ptr, t_idx;
@@ -713,7 +783,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
/* [lambda] changed during the callback. We need to refresh
[trigger_ofs]. */
saved_lambda = lambda;
trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (mt_generate_geom() - 1);
trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (rand_geom() - 1);
}
}
if (Is_exception_result(res)) break;
@@ -834,7 +904,7 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
p = block;
while (1) {
uintnat next_sample = mt_generate_geom();
uintnat next_sample = rand_geom();
header_t *next_sample_p, *next_p;
if (next_sample > blockend - p)
break;
@@ -850,7 +920,7 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
if (callstack == 0) callstack = capture_callstack_postponed();
if (callstack == 0) break; /* OOM */
new_tracked(mt_generate_binom(next_p - next_sample_p) + 1,
new_tracked(rand_binom(next_p - next_sample_p) + 1,
Wosize_hp(p), 1, is_young, Val_hp(p), callstack);
p = next_p;
}
@@ -860,14 +930,8 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
/**** Interface with the OCaml code. ****/
static void caml_memprof_init(void) {
uintnat i;
init = 1;
mt_index = MT_STATE_SIZE;
mt_state[0] = 42;
for (i = 1; i < MT_STATE_SIZE; i++)
mt_state[i] = 0x6c078965 * (mt_state[i-1] ^ (mt_state[i-1] >> 30)) + i;
xoshiro_init();
}
void caml_memprof_shutdown(void) {
@@ -902,7 +966,8 @@ CAMLprim value caml_memprof_start(value lv, value szv, value tracker_param)
lambda = l;
if (l > 0) {
one_log1m_lambda = l == 1 ? 0 : 1/caml_log1p(-l);
next_mt_generate_geom = mt_generate_geom();
rand_pos = RAND_BLOCK_SIZE;
next_rand_geom = rand_geom();
}
caml_memprof_renew_minor_sample();