diff --git a/Changes b/Changes
index a837859d0..f90c43536 100644
--- a/Changes
+++ b/Changes
@@ -14,6 +14,9 @@ Working version
    adding support for Musl ppc64le along the way.
    (Xavier Leroy and Anil Madhavapeddy, review by Stephen Dolan)
 
+- #9466: Memprof: optimize random samples generation.
+  (Jacques-Henri Jourdan, review by Xavier Leroy and Stephen Dolan)
+
 - #9508: Remove support for FreeBSD prior to 4.0R, that required explicit
   floating-point initialization to behave like IEEE standard
   (Hannes Mehnert, review by David Allsopp)
diff --git a/aclocal.m4 b/aclocal.m4
index 5ac1b7293..34290b3e1 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -94,6 +94,22 @@ AC_DEFUN([OCAML_CC_SUPPORTS_ALIGNED], [
     AC_MSG_RESULT([yes])],
   [AC_MSG_RESULT([no])])])
 
+AC_DEFUN([OCAML_CC_SUPPORTS_TREE_VECTORIZE], [
+  AC_MSG_CHECKING(
+    [whether the C compiler supports __attribute__((optimize("tree-vectorize")))])
+  saved_CFLAGS="$CFLAGS"
+  CFLAGS="-Werror $CFLAGS"
+  AC_COMPILE_IFELSE(
+    [AC_LANG_SOURCE([
+       __attribute__((optimize("tree-vectorize"))) void f(void){}
+       int main() { f(); return 0; }
+    ])],
+    [AC_DEFINE([SUPPORTS_TREE_VECTORIZE])
+     AC_MSG_RESULT([yes])],
+    [AC_MSG_RESULT([no])])
+  CFLAGS="$saved_CFLAGS"
+])
+
 AC_DEFUN([OCAML_CC_HAS_DEBUG_PREFIX_MAP], [
   AC_MSG_CHECKING([whether the C compiler supports -fdebug-prefix-map])
   saved_CFLAGS="$CFLAGS"
diff --git a/configure b/configure
index 8b68bf02c..518f3f3f7 100755
--- a/configure
+++ b/configure
@@ -13783,6 +13783,32 @@ $as_echo "no" >&6; }
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
+## Check whether __attribute__((optimize("tree-vectorize"))) is supported
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler supports __attribute__((optimize(\"tree-vectorize\")))" >&5
+$as_echo_n "checking whether the C compiler supports __attribute__((optimize(\"tree-vectorize\")))... " >&6; }
+  saved_CFLAGS="$CFLAGS"
+  CFLAGS="-Werror $CFLAGS"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+       __attribute__((optimize("tree-vectorize"))) void f(void){}
+       int main() { f(); return 0; }
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  $as_echo "#define SUPPORTS_TREE_VECTORIZE 1" >>confdefs.h
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  CFLAGS="$saved_CFLAGS"
+
+
 # Configure the native-code compiler
 
 arch=none
diff --git a/configure.ac b/configure.ac
index f44e1be7d..e756cb638 100644
--- a/configure.ac
+++ b/configure.ac
@@ -907,6 +907,9 @@ AS_CASE(["$CC,$host"],
 
 OCAML_CC_SUPPORTS_ALIGNED
 
+## Check whether __attribute__((optimize("tree-vectorize"))) is supported
+OCAML_CC_SUPPORTS_TREE_VECTORIZE
+
 # Configure the native-code compiler
 
 arch=none
diff --git a/runtime/caml/m.h.in b/runtime/caml/m.h.in
index b5a7205b2..0e099134c 100644
--- a/runtime/caml/m.h.in
+++ b/runtime/caml/m.h.in
@@ -98,3 +98,5 @@
 #undef FUNCTION_SECTIONS
 
 #undef SUPPORTS_ALIGNED_ATTRIBUTE
+
+#undef SUPPORTS_TREE_VECTORIZE
diff --git a/runtime/memprof.c b/runtime/memprof.c
index aead07a08..1e3c885d6 100644
--- a/runtime/memprof.c
+++ b/runtime/memprof.c
@@ -15,7 +15,6 @@
 
 #define CAML_INTERNALS
 
-#include <math.h>
 #include <string.h>
 #include "caml/memprof.h"
 #include "caml/fail.h"
@@ -32,10 +31,11 @@
 #include "caml/printexc.h"
 #include "caml/eventlog.h"
 
-#define MT_STATE_SIZE 624
+#define RAND_BLOCK_SIZE 64
 
-static uint32_t mt_state[MT_STATE_SIZE];
-static uint32_t mt_index;
+static uint32_t xoshiro_state[4][RAND_BLOCK_SIZE];
+static uintnat rand_geom_buff[RAND_BLOCK_SIZE];
+static uint32_t rand_pos;
 
 /* [lambda] is the mean number of samples for each allocated word
    (including block headers). */
@@ -43,7 +43,7 @@ static double lambda = 0;
 /* Precomputed value of [1/log(1-lambda)], for fast sampling of
    geometric distribution.
    Dummy if [lambda = 0]. */
-static double one_log1m_lambda;
+static float one_log1m_lambda;
 
 /* [caml_memprof_suspended] is used for masking memprof callbacks when
    a callback is running or when an uncaught exception handler is
@@ -66,7 +66,6 @@ static intnat callstack_size;
 
 static value tracker;
 
-
 /* Pointer to the word following the next sample in the minor
    heap. Equals [Caml_state->young_alloc_start] if no sampling is
    planned in the current minor heap. */
@@ -86,55 +85,124 @@ static intnat callstack_buffer_len = 0;
 
 /**** Statistical sampling ****/
 
-static double mt_generate_uniform(void)
-{
-  int i;
-  uint32_t y;
+Caml_inline uint64_t splitmix64_next(uint64_t* x) {
+  uint64_t z = (*x += 0x9E3779B97F4A7C15ull);
+  z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+  z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+  return z ^ (z >> 31);
+}
 
-  /* Mersenne twister PRNG */
-  if (mt_index == MT_STATE_SIZE) {
-    for (i = 0; i < 227; i++) {
-      y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
-      mt_state[i] = mt_state[i+397] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
-    }
-    for (i = 227; i < MT_STATE_SIZE - 1; i++) {
-      y = (mt_state[i] & 0x80000000) + (mt_state[i+1] & 0x7fffffff);
-      mt_state[i] = mt_state[i-227] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
-    }
-    y = (mt_state[MT_STATE_SIZE - 1] & 0x80000000) + (mt_state[0] & 0x7fffffff);
-    mt_state[MT_STATE_SIZE - 1] =
-      mt_state[396] ^ (y >> 1) ^ ((-(y&1)) & 0x9908b0df);
-    mt_index = 0;
+static void xoshiro_init(void) {
+  int i;
+  uint64_t splitmix64_state = 42;
+  rand_pos = RAND_BLOCK_SIZE;
+  for (i = 0; i < RAND_BLOCK_SIZE; i++) {
+    uint64_t t = splitmix64_next(&splitmix64_state);
+    xoshiro_state[0][i] = t & 0xFFFFFFFF;
+    xoshiro_state[1][i] = t >> 32;
+    t = splitmix64_next(&splitmix64_state);
+    xoshiro_state[2][i] = t & 0xFFFFFFFF;
+    xoshiro_state[3][i] = t >> 32;
+  }
+}
+
+Caml_inline uint32_t xoshiro_next(int i) {
+  uint32_t res = xoshiro_state[0][i] + xoshiro_state[3][i];
+  uint32_t t = xoshiro_state[1][i] << 9;
+  xoshiro_state[2][i] ^= xoshiro_state[0][i];
+  xoshiro_state[3][i] ^= xoshiro_state[1][i];
+  xoshiro_state[1][i] ^= xoshiro_state[2][i];
+  xoshiro_state[0][i] ^= xoshiro_state[3][i];
+  xoshiro_state[2][i] ^= t;
+  t = xoshiro_state[3][i];
+  xoshiro_state[3][i] = (t << 11) | (t >> 21);
+  return res;
+}
+
+/* Computes [log((y+0.5)/2^32)], up to a relatively good precision,
+   and guarantees that the result is negative.
+   The average absolute error is very close to 0. */
+Caml_inline float log_approx(uint32_t y) {
+  union { float f; int32_t i; } u;
+  float exp, x;
+  u.f = y + 0.5f;   /* We convert y to a float ... */
+  exp = u.i >> 23;  /* ... of which we extract the exponent ... */
+  u.i = (u.i & 0x7FFFFF) | 0x3F800000;
+  x = u.f;          /* ... and the mantissa. */
+
+  return
+    /* This polynomial computes the logarithm of the mantissa (which
+       is in [1, 2]), up to an additive constant. It is chosen such that:
+       - Its degree is 3.
+       - Its average value is that of log in [1, 2]
+         (so the sampling has the right mean when lambda is small).
+       - f(1) = f(2) - log(2) = -159*log(2) - 1e-5
+         (this guarantees that log_approx(y) is always <= -1e-5 < 0).
+       - The maximum of abs(f(x)-log(x)+159*log(2)) is minimized.
+    */
+    x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))
+
+    /* Then, we add the term corresponding to the exponent, and the
+       additive constants. */
+    + (-111.701724334061f + 0.6931471805f*exp);
+}
+
+/* This function regenerates [RAND_BLOCK_SIZE] geometric random
+   variables at once. Doing this in batches helps us gain performance:
+   many compilers (e.g., GCC, Clang, ICC) are able to use SIMD
+   instructions to get a performance boost.
+*/
+#ifdef SUPPORTS_TREE_VECTORIZE
+__attribute__((optimize("tree-vectorize")))
+#endif
+static void rand_batch(void) {
+  int i;
+
+  /* Instead of using temporary buffers, we could use one big loop,
+     but it turns out the compilers' SIMD optimizations are more
+     fragile when applied to larger loops. */
+  static uint32_t A[RAND_BLOCK_SIZE];
+  static float B[RAND_BLOCK_SIZE];
+
+  CAMLassert(lambda > 0.);
+
+  /* Shuffle the xoshiro samplers, and generate uniform variables in A. */
+  for(i = 0; i < RAND_BLOCK_SIZE; i++)
+    A[i] = xoshiro_next(i);
+
+  /* Generate exponential random variables by computing logarithms. We
+     do not use math.h library functions, which are slow and prevent
+     the compiler from using SIMD instructions. */
+  for(i = 0; i < RAND_BLOCK_SIZE; i++)
+    B[i] = 1 + log_approx(A[i]) * one_log1m_lambda;
+
+  /* We do the final flooring for generating geometric
+     variables. Compilers are unlikely to use SIMD instructions for
+     this loop, because it involves a conditional and variables of
+     different sizes (32 and 64 bits). */
+  for(i = 0; i < RAND_BLOCK_SIZE; i++) {
+    double f = B[i];
+    CAMLassert(f >= 1);
+    if(f > Max_long) rand_geom_buff[i] = Max_long;
+    else rand_geom_buff[i] = (uintnat)f;
   }
-  y = mt_state[mt_index];
-  y = y ^ (y >> 11);
-  y = y ^ ((y << 7) & 0x9d2c5680);
-  y = y ^ ((y << 15) & 0xefc60000);
-  y = y ^ (y >> 18);
-
-  mt_index++;
-  return y*2.3283064365386962890625e-10 /* 2^-32 */
-         + 1.16415321826934814453125e-10; /* 2^-33 */
+  rand_pos = 0;
 }
 
 /* Simulate a geometric variable of parameter [lambda].
    The result is clipped in [1..Max_long] */
-static uintnat mt_generate_geom(void)
+static uintnat rand_geom(void)
 {
-  double res;
+  uintnat res;
   CAMLassert(lambda > 0.);
-  /* We use the float versions of exp/log, since these functions are
-     significantly faster, and we really don't need much precision
-     here. The entropy contained in [next_mt_generate_geom] is anyway
-     bounded by the entropy provided by [mt_generate_uniform], which
-     is 32bits. */
-  res = 1 + logf(mt_generate_uniform()) * one_log1m_lambda;
-  if (res > Max_long) return Max_long;
-  return (uintnat)res;
+  if(rand_pos == RAND_BLOCK_SIZE) rand_batch();
+  res = rand_geom_buff[rand_pos++];
+  CAMLassert(1 <= res && res <= Max_long);
+  return res;
 }
 
-static uintnat next_mt_generate_geom;
+static uintnat next_rand_geom;
 
 /* Simulate a binomial variable of parameters [len] and [lambda].
    This sampling algorithm has running time linear with [len *
    lambda]. We could use a more involved algorithm, but this should
@@ -146,13 +214,13 @@ static uintnat next_mt_generate_geom;
    Hormann, Wolfgang. "The generation of binomial random variates."
    Journal of statistical computation and simulation 46.1-2 (1993), pp101-110.
 */
-static uintnat mt_generate_binom(uintnat len)
+static uintnat rand_binom(uintnat len)
 {
   uintnat res;
   CAMLassert(lambda > 0. && len < Max_long);
-  for (res = 0; next_mt_generate_geom < len; res++)
-    next_mt_generate_geom += mt_generate_geom();
-  next_mt_generate_geom -= len;
+  for (res = 0; next_rand_geom < len; res++)
+    next_rand_geom += rand_geom();
+  next_rand_geom -= len;
   return res;
 }
 
@@ -583,7 +651,7 @@ void caml_memprof_track_alloc_shr(value block)
 
   /* This test also makes sure memprof is initialized. */
   if (lambda == 0 || caml_memprof_suspended) return;
-  n_samples = mt_generate_binom(Whsize_val(block));
+  n_samples = rand_binom(Whsize_val(block));
   if (n_samples == 0) return;
 
   callstack = capture_callstack_postponed();
@@ -617,7 +685,7 @@ void caml_memprof_renew_minor_sample(void)
   if (lambda == 0) /* No trigger in the current minor heap. */
     caml_memprof_young_trigger = Caml_state->young_alloc_start;
   else {
-    uintnat geom = mt_generate_geom();
+    uintnat geom = rand_geom();
     if (Caml_state->young_ptr - Caml_state->young_alloc_start < geom)
       /* No trigger in the current minor heap. */
      caml_memprof_young_trigger = Caml_state->young_alloc_start;
@@ -656,7 +724,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
 
   if (!from_caml) {
     unsigned n_samples = 1 +
-      mt_generate_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
+      rand_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
     CAMLassert(encoded_alloc_lens == NULL); /* No Comballoc in C! */
 
     caml_memprof_renew_minor_sample();
@@ -697,7 +765,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
     alloc_ofs -= Whsize_wosize(alloc_wosz);
     while (alloc_ofs < trigger_ofs) {
       n_samples++;
-      trigger_ofs -= mt_generate_geom();
+      trigger_ofs -= rand_geom();
     }
     if (n_samples > 0) {
       uintnat *idx_ptr, t_idx;
@@ -713,7 +781,7 @@ void caml_memprof_track_young(uintnat wosize, int from_caml,
           /* [lambda] changed during the callback. We need to refresh
              [trigger_ofs]. */
           saved_lambda = lambda;
-          trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (mt_generate_geom() - 1);
+          trigger_ofs = lambda == 0. ? 0 : alloc_ofs - (rand_geom() - 1);
         }
       }
       if (Is_exception_result(res)) break;
@@ -834,7 +902,7 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
 
   p = block;
   while (1) {
-    uintnat next_sample = mt_generate_geom();
+    uintnat next_sample = rand_geom();
     header_t *next_sample_p, *next_p;
     if (next_sample > blockend - p) break;
 
@@ -850,7 +918,7 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
     if (callstack == 0) callstack = capture_callstack_postponed();
     if (callstack == 0) break; /* OOM */
 
-    new_tracked(mt_generate_binom(next_p - next_sample_p) + 1,
+    new_tracked(rand_binom(next_p - next_sample_p) + 1,
                 Wosize_hp(p), 1, is_young, Val_hp(p), callstack);
     p = next_p;
   }
@@ -860,14 +928,8 @@ void caml_memprof_track_interned(header_t* block, header_t* blockend) {
 /**** Interface with the OCaml code. ****/
 
 static void caml_memprof_init(void) {
-  uintnat i;
-
   init = 1;
-
-  mt_index = MT_STATE_SIZE;
-  mt_state[0] = 42;
-  for (i = 1; i < MT_STATE_SIZE; i++)
-    mt_state[i] = 0x6c078965 * (mt_state[i-1] ^ (mt_state[i-1] >> 30)) + i;
+  xoshiro_init();
 }
 
 void caml_memprof_shutdown(void) {
@@ -902,7 +964,8 @@ CAMLprim value caml_memprof_start(value lv, value szv, value tracker_param)
   lambda = l;
   if (l > 0) {
     one_log1m_lambda = l == 1 ? 0 : 1/caml_log1p(-l);
-    next_mt_generate_geom = mt_generate_geom();
+    rand_pos = RAND_BLOCK_SIZE;
+    next_rand_geom = rand_geom();
  }
 
   caml_memprof_renew_minor_sample();
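
The algorithmic core of this patch can be exercised outside the runtime. The
following standalone C sketch mirrors the batched sampler above: 64 independent
xoshiro128+ lanes feed a cubic-polynomial log approximation, and the results
are floored into a buffer of geometric variables. The harness is illustrative
only: the seed, the choice lambda = 0.01, the empirical-mean check in main(),
and the omission of the Max_long clipping are assumptions of this demo, not
part of the patch. Compile with, e.g., "cc -O2 -o rand_demo rand_demo.c -lm".

/* rand_demo.c -- standalone sketch of the batched geometric sampler.
   Illustrative only; names mirror runtime/memprof.c. */
#include <stdint.h>
#include <stdio.h>
#include <math.h>

#define RAND_BLOCK_SIZE 64

static uint32_t xoshiro_state[4][RAND_BLOCK_SIZE];
static uint64_t rand_geom_buff[RAND_BLOCK_SIZE];
static uint32_t rand_pos = RAND_BLOCK_SIZE;
static float one_log1m_lambda; /* 1/log(1-lambda); negative for 0<lambda<1 */

/* SplitMix64, used only to seed the xoshiro lanes. */
static uint64_t splitmix64_next(uint64_t *x) {
  uint64_t z = (*x += 0x9E3779B97F4A7C15ull);
  z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
  z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
  return z ^ (z >> 31);
}

static void xoshiro_init(uint64_t seed) {
  for (int i = 0; i < RAND_BLOCK_SIZE; i++) {
    uint64_t t = splitmix64_next(&seed);
    xoshiro_state[0][i] = (uint32_t)t;
    xoshiro_state[1][i] = (uint32_t)(t >> 32);
    t = splitmix64_next(&seed);
    xoshiro_state[2][i] = (uint32_t)t;
    xoshiro_state[3][i] = (uint32_t)(t >> 32);
  }
}

/* One step of xoshiro128+ in lane [i]. Keeping 64 independent lanes in
   column-major arrays is what lets the compiler vectorize rand_batch. */
static inline uint32_t xoshiro_next(int i) {
  uint32_t res = xoshiro_state[0][i] + xoshiro_state[3][i];
  uint32_t t = xoshiro_state[1][i] << 9;
  xoshiro_state[2][i] ^= xoshiro_state[0][i];
  xoshiro_state[3][i] ^= xoshiro_state[1][i];
  xoshiro_state[1][i] ^= xoshiro_state[2][i];
  xoshiro_state[0][i] ^= xoshiro_state[3][i];
  xoshiro_state[2][i] ^= t;
  t = xoshiro_state[3][i];
  xoshiro_state[3][i] = (t << 11) | (t >> 21);
  return res;
}

/* Branch-free approximation of log((y+0.5)/2^32): split the float into
   exponent and mantissa, approximate log of the mantissa by a cubic. */
static inline float log_approx(uint32_t y) {
  union { float f; int32_t i; } u;
  float e, x;
  u.f = y + 0.5f;
  e = (float)(u.i >> 23);
  u.i = (u.i & 0x7FFFFF) | 0x3F800000;
  x = u.f;
  return x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))
         + (-111.701724334061f + 0.6931471805f * e);
}

/* Refill the whole block of geometric variables in three tight loops. */
static void rand_batch(void) {
  uint32_t a[RAND_BLOCK_SIZE];
  float b[RAND_BLOCK_SIZE];
  for (int i = 0; i < RAND_BLOCK_SIZE; i++) a[i] = xoshiro_next(i);
  for (int i = 0; i < RAND_BLOCK_SIZE; i++)
    b[i] = 1 + log_approx(a[i]) * one_log1m_lambda; /* >= 1 by construction */
  for (int i = 0; i < RAND_BLOCK_SIZE; i++)
    rand_geom_buff[i] = (uint64_t)b[i]; /* the runtime also clips to Max_long */
  rand_pos = 0;
}

static uint64_t rand_geom(void) {
  if (rand_pos == RAND_BLOCK_SIZE) rand_batch();
  return rand_geom_buff[rand_pos++];
}

int main(void) {
  const double lambda = 0.01; /* one sample per 100 words, on average */
  one_log1m_lambda = (float)(1 / log1p(-lambda));
  xoshiro_init(42);
  double sum = 0;
  int n = 1000000;
  for (int i = 0; i < n; i++) sum += (double)rand_geom();
  /* A geometric(lambda) variable has mean 1/lambda = 100. */
  printf("empirical mean: %.2f (expected ~%.0f)\n", sum / n, 1 / lambda);
  return 0;
}

With optimization enabled (the patch guards the tree-vectorize attribute behind
the new SUPPORTS_TREE_VECTORIZE configure check), the first two loops of
rand_batch should compile to SIMD code; that, rather than any statistical
change, is where the speedup over the Mersenne-twister-plus-logf version comes
from.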