diff --git a/Changes b/Changes
index a837859d0..f90c43536 100644
--- a/Changes
+++ b/Changes
@@ -14,6 +14,9 @@ Working version
    adding support for Musl ppc64le along the way.
    (Xavier Leroy and Anil Madhavapeddy, review by Stephen Dolan)
 
+- #9466: Memprof: optimize random samples generation.
+  (Jacques-Henri Jourdan, review by Xavier Leroy and Stephen Dolan)
+
 - #9508: Remove support for FreeBSD prior to 4.0R, that required explicit
   floating-point initialization to behave like IEEE standard
   (Hannes Mehnert, review by David Allsopp)
diff --git a/aclocal.m4 b/aclocal.m4
index 5ac1b7293..34290b3e1 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -94,6 +94,22 @@ AC_DEFUN([OCAML_CC_SUPPORTS_ALIGNED], [
     AC_MSG_RESULT([yes])],
   [AC_MSG_RESULT([no])])])
 
+AC_DEFUN([OCAML_CC_SUPPORTS_TREE_VECTORIZE], [
+  AC_MSG_CHECKING(
+    [whether the C compiler supports __attribute__((optimize("tree-vectorize")))])
+  saved_CFLAGS="$CFLAGS"
+  CFLAGS="-Werror $CFLAGS"
+  AC_COMPILE_IFELSE(
+    [AC_LANG_SOURCE([
+       __attribute__((optimize("tree-vectorize"))) void f(void){}
+       int main() { f(); return 0; }
+    ])],
+    [AC_DEFINE([SUPPORTS_TREE_VECTORIZE])
+     AC_MSG_RESULT([yes])],
+    [AC_MSG_RESULT([no])])
+  CFLAGS="$saved_CFLAGS"
+])
+
 AC_DEFUN([OCAML_CC_HAS_DEBUG_PREFIX_MAP], [
   AC_MSG_CHECKING([whether the C compiler supports -fdebug-prefix-map])
   saved_CFLAGS="$CFLAGS"
diff --git a/configure b/configure
index 8b68bf02c..518f3f3f7 100755
--- a/configure
+++ b/configure
@@ -13783,6 +13783,32 @@ $as_echo "no" >&6; }
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
+## Check whether __attribute__((optimize("tree-vectorize"))) is supported
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler supports __attribute__((optimize(\"tree-vectorize\")))" >&5
+$as_echo_n "checking whether the C compiler supports __attribute__((optimize(\"tree-vectorize\")))... " >&6; }
+  saved_CFLAGS="$CFLAGS"
+  CFLAGS="-Werror $CFLAGS"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+       __attribute__((optimize("tree-vectorize"))) void f(void){}
+       int main() { f(); return 0; }
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  $as_echo "#define SUPPORTS_TREE_VECTORIZE 1" >>confdefs.h
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  CFLAGS="$saved_CFLAGS"
+
+
 # Configure the native-code compiler
 
 arch=none
diff --git a/configure.ac b/configure.ac
index f44e1be7d..e756cb638 100644
--- a/configure.ac
+++ b/configure.ac
@@ -907,6 +907,9 @@ AS_CASE(["$CC,$host"],
 
 OCAML_CC_SUPPORTS_ALIGNED
 
+## Check whether __attribute__((optimize("tree-vectorize"))) is supported
+OCAML_CC_SUPPORTS_TREE_VECTORIZE
+
 # Configure the native-code compiler
 
 arch=none
diff --git a/runtime/caml/m.h.in b/runtime/caml/m.h.in
index b5a7205b2..0e099134c 100644
--- a/runtime/caml/m.h.in
+++ b/runtime/caml/m.h.in
@@ -98,3 +98,5 @@
 #undef FUNCTION_SECTIONS
 
 #undef SUPPORTS_ALIGNED_ATTRIBUTE
+
+#undef SUPPORTS_TREE_VECTORIZE
diff --git a/runtime/memprof.c b/runtime/memprof.c
index aead07a08..1e3c885d6 100644
--- a/runtime/memprof.c
+++ b/runtime/memprof.c
@@ -15,7 +15,6 @@
 
 #define CAML_INTERNALS
 
-#include <math.h>
 #include <string.h>
 #include "caml/memprof.h"
 #include "caml/fail.h"
@@ -32,10 +31,11 @@
 #include "caml/printexc.h"
 #include "caml/eventlog.h"
 
-#define MT_STATE_SIZE 624
+#define RAND_BLOCK_SIZE 64
 
-static uint32_t mt_state[MT_STATE_SIZE];
-static uint32_t mt_index;
+static uint32_t xoshiro_state[4][RAND_BLOCK_SIZE];
+static uintnat rand_geom_buff[RAND_BLOCK_SIZE];
+static uint32_t rand_pos;
 
 /* [lambda] is the mean number of samples for each allocated word
    (including block headers). */
@@ -43,7 +43,7 @@ static double lambda = 0;
 /* Precomputed value of [1/log(1-lambda)], for fast sampling of
    geometric distribution.
    Dummy if [lambda = 0]. */
-static double one_log1m_lambda;
+static float one_log1m_lambda;
 
 /* [caml_memprof_suspended] is used for masking memprof callbacks when
    a callback is running or when an uncaught exception handler is
@@ -66,7 +66,6 @@ static intnat callstack_size;
 
 static value tracker;
 
-
 /* Pointer to the word following the next sample in the minor
    heap. Equals [Caml_state->young_alloc_start] if no sampling is
    planned in the current minor heap. */
@@ -86,55 +85,124 @@ static intnat callstack_buffer_len = 0;
 
 /**** Statistical sampling ****/
 
-static double mt_generate_uniform(void)
-{
-  int i;
-  uint32_t y;
+Caml_inline uint64_t splitmix64_next(uint64_t* x) {
+  uint64_t z = (*x += 0x9E3779B97F4A7C15ull);
+  z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+  z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+  return z ^ (z >> 31);
+}
 
-  /* Mersenne twister PRNG */
-  if (mt_index == MT_STATE_SIZE) {
-    for (i = 0; i < 227; i++) {
-      y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
-      mt_state[i] = mt_state[i+397] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
-    }
-    for (i = 227; i < MT_STATE_SIZE - 1; i++) {
-      y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
-      mt_state[i] = mt_state[i-227] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
-    }
-    y = (mt_state[MT_STATE_SIZE - 1] & 0x80000000) + (mt_state[0] & 0x7fffffff);
-    mt_state[MT_STATE_SIZE - 1] =
-      mt_state[396] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
-    mt_index = 0;
+static void xoshiro_init(void) {
+  int i;
+  uint64_t splitmix64_state = 42;
+  rand_pos = RAND_BLOCK_SIZE;
+  for (i = 0; i < RAND_BLOCK_SIZE; i++) {
+    uint64_t t = splitmix64_next(&splitmix64_state);
+    xoshiro_state[0][i] = t & 0xFFFFFFFF;
+    xoshiro_state[1][i] = t >> 32;
+    t = splitmix64_next(&splitmix64_state);
+    xoshiro_state[2][i] = t & 0xFFFFFFFF;
+    xoshiro_state[3][i] = t >> 32;
+  }
+}
+
+Caml_inline uint32_t xoshiro_next(int i) {
+  uint32_t res = xoshiro_state[0][i] + xoshiro_state[3][i];
+  uint32_t t = xoshiro_state[1][i] << 9;
+  xoshiro_state[2][i] ^= xoshiro_state[0][i];
+  xoshiro_state[3][i] ^= xoshiro_state[1][i];
+  xoshiro_state[1][i] ^= xoshiro_state[2][i];
+  xoshiro_state[0][i] ^= xoshiro_state[3][i];
+  xoshiro_state[2][i] ^= t;
+  t = xoshiro_state[3][i];
+  xoshiro_state[3][i] = (t << 11) | (t >> 21);
+  return res;
+}
+
+/* Computes [log((y+0.5)/2^32)], up to a relatively good precision,
+   and guarantees that the result is negative.
+   The average absolute error is very close to 0. */
+Caml_inline float log_approx(uint32_t y) {
+  union { float f; int32_t i; } u;
+  float exp, x;
+  u.f = y + 0.5f;   /* We convert y to a float ... */
+  exp = u.i >> 23;  /* ... of which we extract the exponent ... */
+  u.i = (u.i & 0x7FFFFF) | 0x3F800000;
+  x = u.f;          /* ... and the mantissa. */
+
+  return
+    /* This polynomial computes the logarithm of the mantissa (which
+       is in [1, 2]), up to an additive constant. It is chosen such that:
+       - Its degree is 3.
+       - Its average value is that of log in [1, 2]
+         (so the sampling has the right mean when lambda is small).
+       - f(1) = f(2) - log(2) = -159*log(2) - 1e-5
+         (this guarantees that log_approx(y) is always <= -1e-5 < 0).
+       - The maximum of abs(f(x)-log(x)+159*log(2)) is minimized.
+    */
+    x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))
+
+    /* Then, we add the term corresponding to the exponent, and the
+       additive constants. */
+    + (-111.701724334061f + 0.6931471805f*exp);
+}
+
+/* This function regenerates [RAND_BLOCK_SIZE] geometric random
+   variables at once. Doing this in batches helps us gain performance:
+   many compilers (e.g., GCC, Clang, ICC) are able to use SIMD
+   instructions to get a performance boost.
+*/
+#ifdef SUPPORTS_TREE_VECTORIZE
+__attribute__((optimize("tree-vectorize")))
+#endif
+static void rand_batch(void) {
+  int i;
+
+  /* Instead of using temporary buffers, we could use one big loop,
+     but it turns out the compilers' SIMD optimizations are more
+     fragile when applied to larger loops. */
+  static uint32_t A[RAND_BLOCK_SIZE];
+  static float B[RAND_BLOCK_SIZE];
+
+  CAMLassert(lambda > 0.);
+
+  /* Shuffle the xoshiro samplers, and generate uniform variables in A. */
+  for(i = 0; i < RAND_BLOCK_SIZE; i++)
+    A[i] = xoshiro_next(i);
+
+  /* Generate exponential random variables by computing logarithms. We
+     do not use math.h library functions, which are slow and prevent
+     the compiler from using SIMD instructions. */
+  for(i = 0; i < RAND_BLOCK_SIZE; i++)
+    B[i] = 1 + log_approx(A[i]) * one_log1m_lambda;
+
+  /* We do the final flooring for generating geometric
+     variables. Compilers are unlikely to use SIMD instructions for
+     this loop, because it involves a conditional and variables of
+     different sizes (32 and 64 bits). */
+  for(i = 0; i < RAND_BLOCK_SIZE; i++) {
+    double f = B[i];
+    CAMLassert(f >= 1);
+    if(f > Max_long) rand_geom_buff[i] = Max_long;
+    else rand_geom_buff[i] = (uintnat)f;
   }
-  y = mt_state[mt_index];
-  y = y ^ (y >> 11);
-  y = y ^ ((y << 7) & 0x9d2c5680);
-  y = y ^ ((y << 15) & 0xefc60000);
-  y = y ^ (y >> 18);
-
-  mt_index++;
-  return y*2.3283064365386962890625e-10 /* 2^-32 */
-         + 1.16415321826934814453125e-10; /* 2^-33 */
+  rand_pos = 0;
 }
 
 /* Simulate a geometric variable of parameter [lambda].
    The result is clipped in [1..Max_long] */
-static uintnat mt_generate_geom(void)
+static uintnat rand_geom(void)
 {
-  double res;
+  uintnat res;
   CAMLassert(lambda > 0.);
-  /* We use the float versions of exp/log, since these functions are
-     significantly faster, and we really don't need much precision
-     here. The entropy contained in [next_mt_generate_geom] is anyway
-     bounded by the entropy provided by [mt_generate_uniform], which
-     is 32bits. */
-  res = 1 + logf(mt_generate_uniform()) * one_log1m_lambda;
-  if (res > Max_long) return Max_long;
-  return (uintnat)res;
+  if(rand_pos == RAND_BLOCK_SIZE) rand_batch();
+  res = rand_geom_buff[rand_pos++];
+  CAMLassert(1 <= res && res <= Max_long);
+  return res;
 }
 
-static uintnat next_mt_generate_geom;
+static uintnat next_rand_geom;
 
 /* Simulate a binomial variable of parameters [len] and [lambda].
    This sampling algorithm has running time linear with [len *
    lambda]. We could use a more involved algorithm, but this should
@@ -146,13 +214,13 @@ static uintnat next_mt_generate_geom;
    Hormann, Wolfgang. "The generation of binomial random variates."
    Journal of statistical computation and simulation 46.1-2 (1993), pp101-110.
 */
-static uintnat mt_generate_binom(uintnat len)
+static uintnat rand_binom(uintnat len)
 {
   uintnat res;
   CAMLassert(lambda > 0. && len < Max_long);
-  for (res = 0; next_mt_generate_geom < len; res++)
-    next_mt_generate_geom += mt_generate_geom();
-  next_mt_generate_geom -= len;
+  for (res = 0; next_rand_geom < len; res++)
+    next_rand_geom += rand_geom();
+  next_rand_geom -= len;
   return res;
 }
 
@@ -583,7 +651,7 @@ void caml_memprof_track_alloc_shr(value block)
 
   /* This test also makes sure memprof is initialized. */
   if (lambda == 0 || caml_memprof_suspended) return;
-  n_samples = mt_generate_binom(Whsize_val(block));
+  n_samples = rand_binom(Whsize_val(block));
   if (n_samples == 0) return;
 
   callstack = capture_callstack_postponed();
@@ -617,7 +685,7 @@ void caml_memprof_renew_minor_sample(void)
   if (lambda == 0) /* No trigger in the current minor heap. */
     caml_memprof_young_trigger = Caml_state->young_alloc_start;
   else {
-    uintnat geom = mt_generate_geom();
+    uintnat geom = rand_geom();
     if (Caml_state->young_ptr - Caml_state->young_alloc_start < geom)
       /* No trigger in the current minor heap. */
      caml_memprof_young_trigger = Caml_state->young_alloc_start;
@@ -656,7 +724,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
 
   if (!from_caml) {
     unsigned n_samples = 1 +
-      mt_generate_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
+      rand_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
     CAMLassert(encoded_alloc_lens == NULL); /* No Comballoc in C! */
 
     caml_memprof_renew_minor_sample();
@@ -697,7 +765,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
     alloc_ofs -= Whsize_wosize(alloc_wosz);
     while (alloc_ofs < trigger_ofs) {
       n_samples++;
-      trigger_ofs -= mt_generate_geom();
+      trigger_ofs -= rand_geom();
     }
     if (n_samples > 0) {
       uintnat *idx_ptr, t_idx;
@@ -713,7 +781,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
           /* [lambda] changed during the callback. We need to refresh
              [trigger_ofs]. */
           saved_lambda = lambda;
-          trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (mt_generate_geom() - 1);
+          trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (rand_geom() - 1);
         }
       }
       if (Is_exception_result(res)) break;
@@ -834,7 +902,7 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
 
   p = block;
   while (1) {
-    uintnat next_sample = mt_generate_geom();
+    uintnat next_sample = rand_geom();
     header_t *next_sample_p, *next_p;
     if (next_sample > blockend - p) break;
 
@@ -850,7 +918,7 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
     if (callstack == 0) callstack = capture_callstack_postponed();
     if (callstack == 0) break; /* OOM */
 
-    new_tracked(mt_generate_binom(next_p - next_sample_p) + 1,
+    new_tracked(rand_binom(next_p - next_sample_p) + 1,
                 Wosize_hp(p), 1, is_young, Val_hp(p), callstack);
     p = next_p;
   }
@@ -860,14 +928,8 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
 /**** Interface with the OCaml code. ****/
 
 static void caml_memprof_init(void) {
-  uintnat i;
-
   init = 1;
-
-  mt_index = MT_STATE_SIZE;
-  mt_state[0] = 42;
-  for (i = 1; i < MT_STATE_SIZE; i++)
-    mt_state[i] = 0x6c078965 * (mt_state[i-1] ^ (mt_state[i-1] >> 30)) + i;
+  xoshiro_init();
 }
 
 void caml_memprof_shutdown(void) {
@@ -902,7 +964,8 @@ CAMLprim value caml_memprof_start(value lv, value szv, value tracker_param)
   lambda = l;
   if (l > 0) {
     one_log1m_lambda = l == 1 ? 0 : 1/caml_log1p(-l);
-    next_mt_generate_geom = mt_generate_geom();
+    rand_pos = RAND_BLOCK_SIZE;
+    next_rand_geom = rand_geom();
  }
 
   caml_memprof_renew_minor_sample();
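
The algorithmic core of this patch can be exercised outside the runtime. The
following standalone C sketch mirrors the batched sampler above: 64 independent
xoshiro128+ lanes feed a cubic-polynomial log approximation, and the results
are floored into a buffer of geometric variables. The harness is illustrative
only: the seed, the choice lambda = 0.01, the empirical-mean check in main(),
and the omission of the Max_long clipping are assumptions of this demo, not
part of the patch. Compile with, e.g., "cc -O2 -o rand_demo rand_demo.c -lm".

/* rand_demo.c -- standalone sketch of the batched geometric sampler.
   Illustrative only; names mirror runtime/memprof.c. */
#include <stdint.h>
#include <stdio.h>
#include <math.h>

#define RAND_BLOCK_SIZE 64

static uint32_t xoshiro_state[4][RAND_BLOCK_SIZE];
static uint64_t rand_geom_buff[RAND_BLOCK_SIZE];
static uint32_t rand_pos = RAND_BLOCK_SIZE;
static float one_log1m_lambda; /* 1/log(1-lambda); negative for 0<lambda<1 */

/* SplitMix64, used only to seed the xoshiro lanes. */
static uint64_t splitmix64_next(uint64_t *x) {
  uint64_t z = (*x += 0x9E3779B97F4A7C15ull);
  z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
  z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
  return z ^ (z >> 31);
}

static void xoshiro_init(uint64_t seed) {
  for (int i = 0; i < RAND_BLOCK_SIZE; i++) {
    uint64_t t = splitmix64_next(&seed);
    xoshiro_state[0][i] = (uint32_t)t;
    xoshiro_state[1][i] = (uint32_t)(t >> 32);
    t = splitmix64_next(&seed);
    xoshiro_state[2][i] = (uint32_t)t;
    xoshiro_state[3][i] = (uint32_t)(t >> 32);
  }
}

/* One step of xoshiro128+ in lane [i]. Keeping 64 independent lanes in
   column-major arrays is what lets the compiler vectorize rand_batch. */
static inline uint32_t xoshiro_next(int i) {
  uint32_t res = xoshiro_state[0][i] + xoshiro_state[3][i];
  uint32_t t = xoshiro_state[1][i] << 9;
  xoshiro_state[2][i] ^= xoshiro_state[0][i];
  xoshiro_state[3][i] ^= xoshiro_state[1][i];
  xoshiro_state[1][i] ^= xoshiro_state[2][i];
  xoshiro_state[0][i] ^= xoshiro_state[3][i];
  xoshiro_state[2][i] ^= t;
  t = xoshiro_state[3][i];
  xoshiro_state[3][i] = (t << 11) | (t >> 21);
  return res;
}

/* Branch-free approximation of log((y+0.5)/2^32): split the float into
   exponent and mantissa, approximate log of the mantissa by a cubic. */
static inline float log_approx(uint32_t y) {
  union { float f; int32_t i; } u;
  float e, x;
  u.f = y + 0.5f;
  e = (float)(u.i >> 23);
  u.i = (u.i & 0x7FFFFF) | 0x3F800000;
  x = u.f;
  return x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))
         + (-111.701724334061f + 0.6931471805f * e);
}

/* Refill the whole block of geometric variables in three tight loops. */
static void rand_batch(void) {
  uint32_t a[RAND_BLOCK_SIZE];
  float b[RAND_BLOCK_SIZE];
  for (int i = 0; i < RAND_BLOCK_SIZE; i++) a[i] = xoshiro_next(i);
  for (int i = 0; i < RAND_BLOCK_SIZE; i++)
    b[i] = 1 + log_approx(a[i]) * one_log1m_lambda; /* >= 1 by construction */
  for (int i = 0; i < RAND_BLOCK_SIZE; i++)
    rand_geom_buff[i] = (uint64_t)b[i]; /* the runtime also clips to Max_long */
  rand_pos = 0;
}

static uint64_t rand_geom(void) {
  if (rand_pos == RAND_BLOCK_SIZE) rand_batch();
  return rand_geom_buff[rand_pos++];
}

int main(void) {
  const double lambda = 0.01; /* one sample per 100 words, on average */
  one_log1m_lambda = (float)(1 / log1p(-lambda));
  xoshiro_init(42);
  double sum = 0;
  int n = 1000000;
  for (int i = 0; i < n; i++) sum += (double)rand_geom();
  /* A geometric(lambda) variable has mean 1/lambda = 100. */
  printf("empirical mean: %.2f (expected ~%.0f)\n", sum / n, 1 / lambda);
  return 0;
}

With optimization enabled (the patch guards the tree-vectorize attribute behind
the new SUPPORTS_TREE_VECTORIZE configure check), the first two loops of
rand_batch should compile to SIMD code; that, rather than any statistical
change, is where the speedup over the Mersenne-twister-plus-logf version comes
from.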