From 3d63a106b678f54c47698f11db7d810d91d442ec Mon Sep 17 00:00:00 2001
From: Jacques-Henri Jourdan
Date: Mon, 25 May 2020 09:51:15 +0200
Subject: [PATCH] Memprof: optimize random sampling (#9466)

Instead of using the stdlib logf function for computing logarithms, we
use a faster polynomial-based approximation.

We use the xoshiro PRNG instead of the Mersenne Twister. xoshiro is
simpler and faster.

We generate samples in batches so that compilers can vectorize the
generation loops using SIMD instructions when possible.
---
 Changes             |   3 +
 aclocal.m4          |  16 ++++
 configure           |  26 ++++++
 configure.ac        |   3 +
 runtime/caml/m.h.in |   2 +
 runtime/memprof.c   | 191 +++++++++++++++++++++++++++++---------------
 6 files changed, 178 insertions(+), 63 deletions(-)

diff --git a/Changes b/Changes
index a837859d0..f90c43536 100644
--- a/Changes
+++ b/Changes
@@ -14,6 +14,9 @@ Working version
   adding support for Musl ppc64le along the way.
   (Xavier Leroy and Anil Madhavapeddy, review by Stephen Dolan)
 
+- #9466: Memprof: optimize random sample generation.
+  (Jacques-Henri Jourdan, review by Xavier Leroy and Stephen Dolan)
+
 - #9508: Remove support for FreeBSD prior to 4.0R, that required
   explicit floating-point initialization to behave like IEEE standard
   (Hannes Mehnert, review by David Allsopp)
diff --git a/aclocal.m4 b/aclocal.m4
index 5ac1b7293..34290b3e1 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -94,6 +94,22 @@ AC_DEFUN([OCAML_CC_SUPPORTS_ALIGNED], [
      AC_MSG_RESULT([yes])],
     [AC_MSG_RESULT([no])])])
 
+AC_DEFUN([OCAML_CC_SUPPORTS_TREE_VECTORIZE], [
+  AC_MSG_CHECKING(
+    [whether the C compiler supports __attribute__((optimize("tree-vectorize")))])
+  saved_CFLAGS="$CFLAGS"
+  CFLAGS="-Werror $CFLAGS"
+  AC_COMPILE_IFELSE(
+    [AC_LANG_SOURCE([
+      __attribute__((optimize("tree-vectorize"))) void f(void){}
+      int main() { f(); return 0; }
+    ])],
+    [AC_DEFINE([SUPPORTS_TREE_VECTORIZE])
+     AC_MSG_RESULT([yes])],
+    [AC_MSG_RESULT([no])])
+  CFLAGS="$saved_CFLAGS"
+])
+
 AC_DEFUN([OCAML_CC_HAS_DEBUG_PREFIX_MAP], [
   AC_MSG_CHECKING([whether the C compiler supports -fdebug-prefix-map])
   saved_CFLAGS="$CFLAGS"
diff --git a/configure b/configure
index 8b68bf02c..518f3f3f7 100755
--- a/configure
+++ b/configure
@@ -13783,6 +13783,32 @@ $as_echo "no" >&6; }
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
+## Check whether __attribute__((optimize("tree-vectorize"))) is supported
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler supports __attribute__((optimize(\"tree-vectorize\")))" >&5
+$as_echo_n "checking whether the C compiler supports __attribute__((optimize(\"tree-vectorize\")))... " >&6; }
+  saved_CFLAGS="$CFLAGS"
+  CFLAGS="-Werror $CFLAGS"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+      __attribute__((optimize("tree-vectorize"))) void f(void){}
+      int main() { f(); return 0; }
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  $as_echo "#define SUPPORTS_TREE_VECTORIZE 1" >>confdefs.h
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  CFLAGS="$saved_CFLAGS"
+
+
 # Configure the native-code compiler
 arch=none
diff --git a/configure.ac b/configure.ac
index f44e1be7d..e756cb638 100644
--- a/configure.ac
+++ b/configure.ac
@@ -907,6 +907,9 @@ AS_CASE(["$CC,$host"],
 
 OCAML_CC_SUPPORTS_ALIGNED
 
+## Check whether __attribute__((optimize("tree-vectorize"))) is supported
+OCAML_CC_SUPPORTS_TREE_VECTORIZE
+
 # Configure the native-code compiler
 arch=none
 
diff --git a/runtime/caml/m.h.in b/runtime/caml/m.h.in
index b5a7205b2..0e099134c 100644
--- a/runtime/caml/m.h.in
+++ b/runtime/caml/m.h.in
@@ -98,3 +98,5 @@
 #undef FUNCTION_SECTIONS
 
 #undef SUPPORTS_ALIGNED_ATTRIBUTE
+
+#undef SUPPORTS_TREE_VECTORIZE
diff --git a/runtime/memprof.c b/runtime/memprof.c
index aead07a08..1e3c885d6 100644
--- a/runtime/memprof.c
+++ b/runtime/memprof.c
@@ -15,7 +15,6 @@
 
 #define CAML_INTERNALS
 
-#include <math.h>
 #include <string.h>
 #include "caml/memprof.h"
 #include "caml/fail.h"
@@ -32,10 +31,13 @@
 #include "caml/printexc.h"
 #include "caml/eventlog.h"
 
-#define MT_STATE_SIZE 624
+#define RAND_BLOCK_SIZE 64
 
-static uint32_t mt_state[MT_STATE_SIZE];
-static uint32_t mt_index;
+static uint32_t xoshiro_state[4][RAND_BLOCK_SIZE];
+static uintnat rand_geom_buff[RAND_BLOCK_SIZE];
+static uint32_t rand_pos;
 
 /* [lambda] is the mean number of samples for each allocated word
    (including block headers). */
@@ -43,7 +45,7 @@ static double lambda = 0;
 
 /* Precomputed value of [1/log(1-lambda)], for fast sampling of
    geometric distribution. Dummy if [lambda = 0]. */
-static double one_log1m_lambda;
+static float one_log1m_lambda;
 
 /* [caml_memprof_suspended] is used for masking memprof callbacks
    when a callback is running or when an uncaught exception handler is
@@ -66,7 +68,6 @@ static intnat callstack_size;
 
 static value tracker;
 
-
 /* Pointer to the word following the next sample in the minor heap.
    Equals [Caml_state->young_alloc_start] if no sampling is planned
    in the current minor heap. */
@@ -86,55 +87,124 @@ static intnat callstack_buffer_len = 0;
 
 /**** Statistical sampling ****/
 
-static double mt_generate_uniform(void)
-{
-  int i;
-  uint32_t y;
+Caml_inline uint64_t splitmix64_next(uint64_t* x) {
+  uint64_t z = (*x += 0x9E3779B97F4A7C15ull);
+  z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+  z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+  return z ^ (z >> 31);
+}
 
-  /* Mersenne twister PRNG */
-  if (mt_index == MT_STATE_SIZE) {
-    for (i = 0; i < 227; i++) {
-      y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
-      mt_state[i] = mt_state[i+397] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
-    }
-    for (i = 227; i < MT_STATE_SIZE - 1; i++) {
-      y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
-      mt_state[i] = mt_state[i-227] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
-    }
-    y = (mt_state[MT_STATE_SIZE - 1] & 0x80000000) + (mt_state[0] & 0x7fffffff);
-    mt_state[MT_STATE_SIZE - 1] =
-      mt_state[396] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
-    mt_index = 0;
+static void xoshiro_init(void) {
+  int i;
+  uint64_t splitmix64_state = 42;
+  rand_pos = RAND_BLOCK_SIZE;
+  for (i = 0; i < RAND_BLOCK_SIZE; i++) {
+    uint64_t t = splitmix64_next(&splitmix64_state);
+    xoshiro_state[0][i] = t & 0xFFFFFFFF;
+    xoshiro_state[1][i] = t >> 32;
+    t = splitmix64_next(&splitmix64_state);
+    xoshiro_state[2][i] = t & 0xFFFFFFFF;
+    xoshiro_state[3][i] = t >> 32;
+  }
+}
+
+Caml_inline uint32_t xoshiro_next(int i) {
+  uint32_t res = xoshiro_state[0][i] + xoshiro_state[3][i];
+  uint32_t t = xoshiro_state[1][i] << 9;
+  xoshiro_state[2][i] ^= xoshiro_state[0][i];
+  xoshiro_state[3][i] ^= xoshiro_state[1][i];
+  xoshiro_state[1][i] ^= xoshiro_state[2][i];
+  xoshiro_state[0][i] ^= xoshiro_state[3][i];
+  xoshiro_state[2][i] ^= t;
+  t = xoshiro_state[3][i];
+  xoshiro_state[3][i] = (t << 11) | (t >> 21);
+  return res;
+}
+
+/* Computes [log((y+0.5)/2^32)], with relatively good precision, and
+   guarantees that the result is negative.
+   The average error is very close to 0. */
+Caml_inline float log_approx(uint32_t y) {
+  union { float f; int32_t i; } u;
+  float exp, x;
+  u.f = y + 0.5f;    /* We convert y to a float ... */
+  exp = u.i >> 23;   /* ... of which we extract the exponent ... */
+  u.i = (u.i & 0x7FFFFF) | 0x3F800000;
+  x = u.f;           /* ... and the mantissa. */
+
+  return
+    /* This polynomial computes the logarithm of the mantissa (which
+       is in [1, 2]), up to an additive constant. It is chosen such that:
+       - Its degree is 3.
+       - Its average value is that of log in [1, 2]
+         (the sampling has the right mean when lambda is small).
+       - f(1) = f(2) - log(2) = -159*log(2) - 1e-5
+         (this guarantees that log_approx(y) is always <= -1e-5 < 0).
+       - The maximum of abs(f(x)-log(x)+159*log(2)) is minimized. */
+    x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))
+
+    /* Then, we add the term corresponding to the exponent, and
+       additive constants. */
+    + (-111.701724334061f + 0.6931471805f*exp);
+}
+
+/* This function regenerates [RAND_BLOCK_SIZE] geometric random
+   variables at once. Doing this in batches helps performance: many
+   compilers (e.g., GCC, Clang, ICC) are able to use SIMD
+   instructions to get a performance boost.
+*/
+#ifdef SUPPORTS_TREE_VECTORIZE
+__attribute__((optimize("tree-vectorize")))
+#endif
+static void rand_batch(void) {
+  int i;
+
+  /* Instead of using temporary buffers, we could use one big loop,
+     but it turns out the SIMD optimizations of compilers are more
+     fragile when using larger loops. */
+  static uint32_t A[RAND_BLOCK_SIZE];
+  static float B[RAND_BLOCK_SIZE];
+
+  CAMLassert(lambda > 0.);
+
+  /* Shuffle the xoshiro samplers, and generate uniform variables in A. */
+  for(i = 0; i < RAND_BLOCK_SIZE; i++)
+    A[i] = xoshiro_next(i);
+
+  /* Generate exponential random variables by computing logarithms.
+     We do not use math.h library functions, which are slow and
+     prevent the compiler from using SIMD instructions. */
+  for(i = 0; i < RAND_BLOCK_SIZE; i++)
+    B[i] = 1 + log_approx(A[i]) * one_log1m_lambda;
+
+  /* We do the final flooring to generate the geometric
+     variables. Compilers are unlikely to use SIMD instructions for
+     this loop, because it involves a conditional and variables of
+     different sizes (32 and 64 bits). */
+  for(i = 0; i < RAND_BLOCK_SIZE; i++) {
+    double f = B[i];
+    CAMLassert (f >= 1);
+    if(f > Max_long) rand_geom_buff[i] = Max_long;
+    else rand_geom_buff[i] = (uintnat)f;
   }
-  y = mt_state[mt_index];
-  y = y ^ (y >> 11);
-  y = y ^ ((y << 7) & 0x9d2c5680);
-  y = y ^ ((y << 15) & 0xefc60000);
-  y = y ^ (y >> 18);
-
-  mt_index++;
-  return y*2.3283064365386962890625e-10 + /* 2^-32 */
-    1.16415321826934814453125e-10; /* 2^-33 */
+  rand_pos = 0;
 }
 
 /* Simulate a geometric variable of parameter [lambda].
    The result is clipped in [1..Max_long] */
-static uintnat mt_generate_geom(void)
+static uintnat rand_geom(void)
 {
-  double res;
+  uintnat res;
   CAMLassert(lambda > 0.);
-  /* We use the float versions of exp/log, since these functions are
-     significantly faster, and we really don't need much precision
-     here. The entropy contained in [next_mt_generate_geom] is anyway
-     bounded by the entropy provided by [mt_generate_uniform], which
-     is 32bits. */
-  res = 1 + logf(mt_generate_uniform()) * one_log1m_lambda;
-  if (res > Max_long) return Max_long;
-  return (uintnat)res;
+  if(rand_pos == RAND_BLOCK_SIZE) rand_batch();
+  res = rand_geom_buff[rand_pos++];
+  CAMLassert(1 <= res && res <= Max_long);
+  return res;
 }
 
-static uintnat next_mt_generate_geom;
+static uintnat next_rand_geom;
 
 /* Simulate a binomial variable of parameters [len] and [lambda].
    This sampling algorithm has running time linear with [len *
    lambda]. We could use a more involved algorithm, but this should
@@ -146,13 +216,13 @@ static uintnat next_mt_generate_geom;
    Hormann, Wolfgang. "The generation of binomial random variates."
   Journal of statistical computation and simulation 46.1-2 (1993),
   pp. 101-110. */
-static uintnat mt_generate_binom(uintnat len)
+static uintnat rand_binom(uintnat len)
 {
   uintnat res;
   CAMLassert(lambda > 0. && len < Max_long);
-  for (res = 0; next_mt_generate_geom < len; res++)
-    next_mt_generate_geom += mt_generate_geom();
-  next_mt_generate_geom -= len;
+  for (res = 0; next_rand_geom < len; res++)
+    next_rand_geom += rand_geom();
+  next_rand_geom -= len;
   return res;
 }
 
@@ -583,7 +653,7 @@ void caml_memprof_track_alloc_shr(value block)
 
   /* This test also makes sure memprof is initialized. */
   if (lambda == 0 || caml_memprof_suspended) return;
-  n_samples = mt_generate_binom(Whsize_val(block));
+  n_samples = rand_binom(Whsize_val(block));
   if (n_samples == 0) return;
 
   callstack = capture_callstack_postponed();
@@ -617,7 +687,7 @@ void caml_memprof_renew_minor_sample(void)
   if (lambda == 0) /* No trigger in the current minor heap. */
     caml_memprof_young_trigger = Caml_state->young_alloc_start;
   else {
-    uintnat geom = mt_generate_geom();
+    uintnat geom = rand_geom();
     if (Caml_state->young_ptr - Caml_state->young_alloc_start < geom)
       /* No trigger in the current minor heap. */
       caml_memprof_young_trigger = Caml_state->young_alloc_start;
@@ -656,7 +726,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
 
   if (!from_caml) {
     unsigned n_samples = 1 +
-      mt_generate_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
+      rand_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
     CAMLassert(encoded_alloc_lens == NULL);  /* No Comballoc in C! */
 
     caml_memprof_renew_minor_sample();
@@ -697,7 +767,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
     alloc_ofs -= Whsize_wosize(alloc_wosz);
     while (alloc_ofs < trigger_ofs) {
       n_samples++;
-      trigger_ofs -= mt_generate_geom();
+      trigger_ofs -= rand_geom();
     }
     if (n_samples > 0) {
       uintnat *idx_ptr, t_idx;
@@ -713,7 +783,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
         /* [lambda] changed during the callback. We need to refresh
            [trigger_ofs]. */
         saved_lambda = lambda;
-        trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (mt_generate_geom() - 1);
+        trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (rand_geom() - 1);
       }
     }
     if (Is_exception_result(res)) break;
@@ -834,7 +904,7 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
 
   p = block;
   while (1) {
-    uintnat next_sample = mt_generate_geom();
+    uintnat next_sample = rand_geom();
     header_t *next_sample_p, *next_p;
     if (next_sample > blockend - p)
       break;
@@ -850,7 +920,7 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
 
     if (callstack == 0) callstack = capture_callstack_postponed();
     if (callstack == 0) break;  /* OOM */
-    new_tracked(mt_generate_binom(next_p - next_sample_p) + 1,
+    new_tracked(rand_binom(next_p - next_sample_p) + 1,
                 Wosize_hp(p), 1, is_young, Val_hp(p), callstack);
     p = next_p;
   }
@@ -860,14 +930,8 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
 /**** Interface with the OCaml code. ****/
 
 static void caml_memprof_init(void) {
-  uintnat i;
-
   init = 1;
-
-  mt_index = MT_STATE_SIZE;
-  mt_state[0] = 42;
-  for (i = 1; i < MT_STATE_SIZE; i++)
-    mt_state[i] = 0x6c078965 * (mt_state[i-1] ^ (mt_state[i-1] >> 30)) + i;
+  xoshiro_init();
 }
 
 void caml_memprof_shutdown(void) {
@@ -902,7 +966,8 @@ CAMLprim value caml_memprof_start(value lv, value szv, value tracker_param)
   lambda = l;
   if (l > 0) {
     one_log1m_lambda = l == 1 ? 0 : 1/caml_log1p(-l);
-    next_mt_generate_geom = mt_generate_geom();
+    rand_pos = RAND_BLOCK_SIZE;
+    next_rand_geom = rand_geom();
   }
 
   caml_memprof_renew_minor_sample();
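
To sanity-check the polynomial approximation and the geometric sampler
outside the runtime, a small standalone harness like the one below can
be used. This is only an illustrative sketch, not part of the patch:
log_approx is copied from the patch, while the sampling rate lambda,
the Weyl-style input sweep, and the iteration count are arbitrary test
parameters.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Copied from the patch: approximates log((y+0.5)/2^32), always < 0. */
static float log_approx(uint32_t y)
{
  union { float f; int32_t i; } u;
  float exp, x;
  u.f = y + 0.5f;
  exp = u.i >> 23;
  u.i = (u.i & 0x7FFFFF) | 0x3F800000;
  x = u.f;
  return x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))
    + (-111.701724334061f + 0.6931471805f * exp);
}

int main(void)
{
  double lambda = 0.01;   /* arbitrary sampling rate for the test */
  float one_log1m_lambda = 1 / log1p(-lambda);
  double max_err = 0, sum_err = 0, sum_geom = 0;
  int n = 1000000, i;

  for (i = 0; i < n; i++) {
    /* Weyl-style sweep giving a well-spread set of 32-bit inputs. */
    uint32_t y = (uint32_t)i * 2654435761u;
    float a = log_approx(y);
    double err = a - log((y + 0.5) / 4294967296.0);

    if (a >= 0) {
      printf("log_approx(%u) is not negative!\n", (unsigned)y);
      return 1;
    }
    if (fabs(err) > max_err) max_err = fabs(err);
    sum_err += err;

    /* Geometric sampling as in rand_batch (ignoring the Max_long clip). */
    sum_geom += floor(1 + a * one_log1m_lambda);
  }
  printf("max |error| = %g, mean error = %g\n", max_err, sum_err / n);
  printf("mean geometric sample = %g (expect about 1/lambda = %g)\n",
         sum_geom / n, 1 / lambda);
  return 0;
}

If the approximation behaves as the patch's comments claim, the mean
error should be close to 0, every output of log_approx should be at
most -1e-5, and the mean of the geometric samples should be close to
1/lambda when lambda is small.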