Memprof: optimize random sampling (#9466)

Instead of calling the stdlib logf function to compute logarithms, we
use a faster polynomial-based approximation.
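
As a rough standalone sketch of the technique (the approximation
function below mirrors the one added to runtime/memprof.c in this
commit; the comparison harness around it is hypothetical):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Bit-trick logarithm: reinterpret the float to split it into
   exponent and mantissa, approximate log of the mantissa (in [1, 2))
   with a degree-3 polynomial, then add exp*log(2) plus constants. */
static float log_approx(uint32_t y)
{
  union { float f; int32_t i; } u;
  float exp, x;
  u.f = y + 0.5f;
  exp = u.i >> 23;
  u.i = (u.i & 0x7FFFFF) | 0x3F800000;
  x = u.f;
  return x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))
         + (-111.701724334061f + 0.6931471805f * exp);
}

int main(void)
{
  uint32_t y;
  /* Spot-check against logf((y+0.5)/2^32) at a few points. */
  for (y = 1; y != 0; y <<= 4)
    printf("y=%10lu approx=%11.6f exact=%11.6f\n", (unsigned long)y,
           log_approx(y), logf((y + 0.5f) / 4294967296.0f));
  return 0;
}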

We use the xoshiro PRNG instead of the Mersenne Twister.  xoshiro is
simpler and faster.
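
The per-lane recurrence added here is that of xoshiro128+ (Blackman
and Vigna), seeded from splitmix64. A minimal scalar sketch for
reference (the patch keeps RAND_BLOCK_SIZE copies of this state and
derives two 32-bit state words from each 64-bit splitmix64 output):

#include <stdint.h>

static uint32_t s[4]; /* generator state; must not be all zero */

static uint32_t rotl(uint32_t x, int k)
{
  return (x << k) | (x >> (32 - k));
}

/* One xoshiro128+ step: an xorshift core plus a final rotation. */
static uint32_t xoshiro128plus_next(void)
{
  uint32_t res = s[0] + s[3];
  uint32_t t = s[1] << 9;
  s[2] ^= s[0];
  s[3] ^= s[1];
  s[1] ^= s[2];
  s[0] ^= s[3];
  s[2] ^= t;
  s[3] = rotl(s[3], 11);
  return res;
}

/* Expand a small seed into a full state with splitmix64 steps. */
static void xoshiro128plus_seed(uint64_t x)
{
  int i;
  for (i = 0; i < 4; i++) {
    uint64_t z = (x += 0x9E3779B97F4A7C15ull);
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
    s[i] = (uint32_t)(z ^ (z >> 31));
  }
}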

We generate samples in batches so that compilers can vectorize the
generation loops using SIMD instructions when possible.
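
A minimal sketch of the pattern (hypothetical names; the real loops
are in rand_batch below): fixed-length loops over small arrays, with
no cross-iteration dependencies, that GCC and Clang can turn into
SIMD code:

#define BLOCK 64

#if defined(__GNUC__) && !defined(__clang__)
/* Enable GCC's loop vectorizer for this function regardless of the
   global optimization flags; Clang vectorizes such loops at -O2. */
__attribute__((optimize("tree-vectorize")))
#endif
static void affine_block(const float *in, float *out, float c)
{
  int i;
  /* Fixed trip count and independent iterations: easy to SIMD,
     same shape as the middle loop of rand_batch. */
  for (i = 0; i < BLOCK; i++)
    out[i] = 1.0f + in[i] * c;
}
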
Jacques-Henri Jourdan 2020-05-25 09:51:15 +02:00 committed by GitHub
parent b2f467abc0
commit 3d63a106b6
6 changed files with 178 additions and 63 deletions

Changes

@@ -14,6 +14,9 @@ Working version
adding support for Musl ppc64le along the way.
(Xavier Leroy and Anil Madhavapeddy, review by Stephen Dolan)
- #9466: Memprof: optimize random samples generation.
(Jacques-Henri Jourdan, review by Xavier Leroy and Stephen Dolan)
- #9508: Remove support for FreeBSD prior to 4.0R, which required explicit
floating-point initialization to behave like the IEEE standard
(Hannes Mehnert, review by David Allsopp)

aclocal.m4

@@ -94,6 +94,22 @@ AC_DEFUN([OCAML_CC_SUPPORTS_ALIGNED], [
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])])
AC_DEFUN([OCAML_CC_SUPPORTS_TREE_VECTORIZE], [
AC_MSG_CHECKING(
[whether the C compiler supports __attribute__((optimize("tree-vectorize")))])
saved_CFLAGS="$CFLAGS"
CFLAGS="-Werror $CFLAGS"
AC_COMPILE_IFELSE(
[AC_LANG_SOURCE([
__attribute__((optimize("tree-vectorize"))) void f(void){}
int main() { f(); return 0; }
])],
[AC_DEFINE([SUPPORTS_TREE_VECTORIZE])
AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no])])
CFLAGS="$saved_CFLAGS"
])
AC_DEFUN([OCAML_CC_HAS_DEBUG_PREFIX_MAP], [
AC_MSG_CHECKING([whether the C compiler supports -fdebug-prefix-map])
saved_CFLAGS="$CFLAGS"

configure

@@ -13783,6 +13783,32 @@ $as_echo "no" >&6; }
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
## Check whether __attribute__((optimize("tree-vectorize"))) is supported
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler supports __attribute__((optimize(\"tree-vectorize\")))" >&5
$as_echo_n "checking whether the C compiler supports __attribute__((optimize(\"tree-vectorize\")))... " >&6; }
saved_CFLAGS="$CFLAGS"
CFLAGS="-Werror $CFLAGS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
__attribute__((optimize("tree-vectorize"))) void f(void){}
int main() { f(); return 0; }
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define SUPPORTS_TREE_VECTORIZE 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$saved_CFLAGS"
# Configure the native-code compiler
arch=none

configure.ac

@@ -907,6 +907,9 @@ AS_CASE(["$CC,$host"],
OCAML_CC_SUPPORTS_ALIGNED
## Check whether __attribute__((optimize("tree-vectorize"))) is supported
OCAML_CC_SUPPORTS_TREE_VECTORIZE
# Configure the native-code compiler
arch=none

runtime/caml/s.h.in

@@ -98,3 +98,5 @@
#undef FUNCTION_SECTIONS
#undef SUPPORTS_ALIGNED_ATTRIBUTE
#undef SUPPORTS_TREE_VECTORIZE

runtime/memprof.c

@@ -15,7 +15,6 @@
#define CAML_INTERNALS
#include <math.h>
#include <string.h>
#include "caml/memprof.h"
#include "caml/fail.h"
@@ -32,10 +31,13 @@
#include "caml/printexc.h"
#include "caml/eventlog.h"
#define MT_STATE_SIZE 624
#define RAND_BLOCK_SIZE 64
static uint32_t mt_state[MT_STATE_SIZE];
static uint32_t mt_index;
static uint32_t xoshiro_state[4][RAND_BLOCK_SIZE];
static uintnat rand_geom_buff[RAND_BLOCK_SIZE];
static uint32_t rand_pos;
/* [lambda] is the mean number of samples for each allocated word (including
block headers). */
@@ -43,7 +45,7 @@ static double lambda = 0;
/* Precomputed value of [1/log(1-lambda)], for fast sampling of
geometric distribution.
Dummy if [lambda = 0]. */
static double one_log1m_lambda;
static float one_log1m_lambda;
/* [caml_memprof_suspended] is used for masking memprof callbacks when
a callback is running or when an uncaught exception handler is
@@ -66,7 +68,6 @@ static intnat callstack_size;
static value tracker;
/* Pointer to the word following the next sample in the minor
heap. Equals [Caml_state->young_alloc_start] if no sampling is planned in
the current minor heap.
@@ -86,55 +87,124 @@ static intnat callstack_buffer_len = 0;
/**** Statistical sampling ****/
static double mt_generate_uniform(void)
{
int i;
uint32_t y;
Caml_inline uint64_t splitmix64_next(uint64_t* x) {
uint64_t z = (*x += 0x9E3779B97F4A7C15ull);
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
return z ^ (z >> 31);
}
/* Mersenne twister PRNG */
if (mt_index == MT_STATE_SIZE) {
for (i = 0; i < 227; i++) {
y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
mt_state[i] = mt_state[i+397] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
}
for (i = 227; i < MT_STATE_SIZE - 1; i++) {
y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
mt_state[i] = mt_state[i-227] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
}
y = (mt_state[MT_STATE_SIZE - 1] & 0x80000000) + (mt_state[0] & 0x7fffffff);
mt_state[MT_STATE_SIZE - 1] =
mt_state[396] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
mt_index = 0;
static void xoshiro_init(void) {
int i;
uint64_t splitmix64_state = 42;
rand_pos = RAND_BLOCK_SIZE;
for (i = 0; i < RAND_BLOCK_SIZE; i++) {
uint64_t t = splitmix64_next(&splitmix64_state);
xoshiro_state[0][i] = t & 0xFFFFFFFF;
xoshiro_state[1][i] = t >> 32;
t = splitmix64_next(&splitmix64_state);
xoshiro_state[2][i] = t & 0xFFFFFFFF;
xoshiro_state[3][i] = t >> 32;
}
}
Caml_inline uint32_t xoshiro_next(int i) {
uint32_t res = xoshiro_state[0][i] + xoshiro_state[3][i];
uint32_t t = xoshiro_state[1][i] << 9;
xoshiro_state[2][i] ^= xoshiro_state[0][i];
xoshiro_state[3][i] ^= xoshiro_state[1][i];
xoshiro_state[1][i] ^= xoshiro_state[2][i];
xoshiro_state[0][i] ^= xoshiro_state[3][i];
xoshiro_state[2][i] ^= t;
t = xoshiro_state[3][i];
xoshiro_state[3][i] = (t << 11) | (t >> 21);
return res;
}
/* Computes [log((y+0.5)/2^32)], to relatively good precision,
and guarantees that the result is negative.
The average absolute error is very close to 0. */
Caml_inline float log_approx(uint32_t y) {
union { float f; int32_t i; } u;
float exp, x;
u.f = y + 0.5f; /* We convert y to a float ... */
exp = u.i >> 23; /* ... of which we extract the exponent ... */
u.i = (u.i & 0x7FFFFF) | 0x3F800000;
x = u.f; /* ... and the mantissa. */
return
/* This polynomial computes the logarithm of the mantissa (which
is in [1, 2]), up to an additive constant. It is chosen such that:
- Its degree is 3.
- Its average value is that of log in [1, 2]
(the sampling has the right mean when lambda is small).
- f(1) = f(2) - log(2) = -159*log(2) - 1e-5
(this guarantees that log_approx(y) is always <= -1e-5 < 0).
- The maximum of abs(f(x)-log(x)+159*log(2)) is minimized.
*/
x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))
/* Then, we add the term corresponding to the exponent, and
additive constants. */
+ (-111.701724334061f + 0.6931471805f*exp);
}
/* This function regenerates [RAND_BLOCK_SIZE] geometric random
variables at once. Doing this in batches helps performance: many
compilers (e.g., GCC, Clang, ICC) can then use SIMD instructions
for a significant speed boost.
*/
#ifdef SUPPORTS_TREE_VECTORIZE
__attribute__((optimize("tree-vectorize")))
#endif
static void rand_batch(void) {
int i;
/* Instead of using temporary buffers, we could use one big loop,
but compilers' SIMD optimizations turn out to be more fragile
with larger loops. */
static uint32_t A[RAND_BLOCK_SIZE];
static float B[RAND_BLOCK_SIZE];
CAMLassert(lambda > 0.);
/* Shuffle the xoshiro samplers, and generate uniform variables in A. */
for(i = 0; i < RAND_BLOCK_SIZE; i++)
A[i] = xoshiro_next(i);
/* Generate exponential random variables by computing logarithms. We
do not use math.h library functions, which are slow and prevent the
compiler from using SIMD instructions. */
for(i = 0; i < RAND_BLOCK_SIZE; i++)
B[i] = 1 + log_approx(A[i]) * one_log1m_lambda;
/* We do the final flooring to generate the geometric
variables. Compilers are unlikely to use SIMD instructions for
this loop, because it involves a conditional and variables of
different sizes (32 and 64 bits). */
for(i = 0; i < RAND_BLOCK_SIZE; i++) {
double f = B[i];
CAMLassert (f >= 1);
if(f > Max_long) rand_geom_buff[i] = Max_long;
else rand_geom_buff[i] = (uintnat)f;
}
y = mt_state[mt_index];
y = y ^ (y >> 11);
y = y ^ ((y << 7) & 0x9d2c5680);
y = y ^ ((y << 15) & 0xefc60000);
y = y ^ (y >> 18);
mt_index++;
return y*2.3283064365386962890625e-10 + /* 2^-32 */
1.16415321826934814453125e-10; /* 2^-33 */
rand_pos = 0;
}
/* Simulate a geometric variable of parameter [lambda].
The result is clipped to [1..Max_long]. */
static uintnat mt_generate_geom(void)
static uintnat rand_geom(void)
{
double res;
uintnat res;
CAMLassert(lambda > 0.);
/* We use the float versions of exp/log, since these functions are
significantly faster, and we really don't need much precision
here. The entropy contained in [next_mt_generate_geom] is anyway
bounded by the entropy provided by [mt_generate_uniform], which
is 32 bits. */
res = 1 + logf(mt_generate_uniform()) * one_log1m_lambda;
if (res > Max_long) return Max_long;
return (uintnat)res;
if(rand_pos == RAND_BLOCK_SIZE) rand_batch();
res = rand_geom_buff[rand_pos++];
CAMLassert(1 <= res && res <= Max_long);
return res;
}
static uintnat next_mt_generate_geom;
static uintnat next_rand_geom;
/* Simulate a binomial variable of parameters [len] and [lambda].
This sampling algorithm has running time linear in [len *
lambda]. We could use a more involved algorithm, but this should
@@ -146,13 +216,13 @@ static uintnat next_mt_generate_geom;
Hormann, Wolfgang. "The generation of binomial random variates."
Journal of Statistical Computation and Simulation 46.1-2 (1993), pp. 101-110.
*/
static uintnat mt_generate_binom(uintnat len)
static uintnat rand_binom(uintnat len)
{
uintnat res;
CAMLassert(lambda > 0. && len < Max_long);
for (res = 0; next_mt_generate_geom < len; res++)
next_mt_generate_geom += mt_generate_geom();
next_mt_generate_geom -= len;
for (res = 0; next_rand_geom < len; res++)
next_rand_geom += rand_geom();
next_rand_geom -= len;
return res;
}
@@ -583,7 +653,7 @@ void caml_memprof_track_alloc_shr(value block)
/* This test also makes sure memprof is initialized. */
if (lambda == 0 || caml_memprof_suspended) return;
n_samples = mt_generate_binom(Whsize_val(block));
n_samples = rand_binom(Whsize_val(block));
if (n_samples == 0) return;
callstack = capture_callstack_postponed();
@@ -617,7 +687,7 @@ void caml_memprof_renew_minor_sample(void)
if (lambda == 0) /* No trigger in the current minor heap. */
caml_memprof_young_trigger = Caml_state->young_alloc_start;
else {
uintnat geom = mt_generate_geom();
uintnat geom = rand_geom();
if (Caml_state->young_ptr - Caml_state->young_alloc_start < geom)
/* No trigger in the current minor heap. */
caml_memprof_young_trigger = Caml_state->young_alloc_start;
@@ -656,7 +726,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
if (!from_caml) {
unsigned n_samples = 1 +
mt_generate_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
rand_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
CAMLassert(encoded_alloc_lens == NULL); /* No Comballoc in C! */
caml_memprof_renew_minor_sample();
@@ -697,7 +767,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
alloc_ofs -= Whsize_wosize(alloc_wosz);
while (alloc_ofs < trigger_ofs) {
n_samples++;
trigger_ofs -= mt_generate_geom();
trigger_ofs -= rand_geom();
}
if (n_samples > 0) {
uintnat *idx_ptr, t_idx;
@@ -713,7 +783,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
/* [lambda] changed during the callback. We need to refresh
[trigger_ofs]. */
saved_lambda = lambda;
trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (mt_generate_geom() - 1);
trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (rand_geom() - 1);
}
}
if (Is_exception_result(res)) break;
@@ -834,7 +904,7 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
p = block;
while (1) {
uintnat next_sample = mt_generate_geom();
uintnat next_sample = rand_geom();
header_t *next_sample_p, *next_p;
if (next_sample > blockend - p)
break;
@@ -850,7 +920,7 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
if (callstack == 0) callstack = capture_callstack_postponed();
if (callstack == 0) break; /* OOM */
new_tracked(mt_generate_binom(next_p - next_sample_p) + 1,
new_tracked(rand_binom(next_p - next_sample_p) + 1,
Wosize_hp(p), 1, is_young, Val_hp(p), callstack);
p = next_p;
}
@@ -860,14 +930,8 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
/**** Interface with the OCaml code. ****/
static void caml_memprof_init(void) {
uintnat i;
init = 1;
mt_index = MT_STATE_SIZE;
mt_state[0] = 42;
for (i = 1; i < MT_STATE_SIZE; i++)
mt_state[i] = 0x6c078965 * (mt_state[i-1] ^ (mt_state[i-1] >> 30)) + i;
xoshiro_init();
}
void caml_memprof_shutdown(void) {
@@ -902,7 +966,8 @@ CAMLprim value caml_memprof_start(value lv, value szv, value tracker_param)
lambda = l;
if (l > 0) {
one_log1m_lambda = l == 1 ? 0 : 1/caml_log1p(-l);
next_mt_generate_geom = mt_generate_geom();
rand_pos = RAND_BLOCK_SIZE;
next_rand_geom = rand_geom();
}
caml_memprof_renew_minor_sample();