ocaml/runtime/hash.c

319 lines
9.3 KiB
C

/**************************************************************************/
/* */
/* OCaml */
/* */
/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */
/* */
/* Copyright 1996 Institut National de Recherche en Informatique et */
/* en Automatique. */
/* */
/* All rights reserved. This file is distributed under the terms of */
/* the GNU Lesser General Public License version 2.1, with the */
/* special exception on linking described in the file LICENSE. */
/* */
/**************************************************************************/
#define CAML_INTERNALS
/* The generic hashing primitive */
/* The interface of this file is in "mlvalues.h" (for [caml_hash_variant])
and in "hash.h" (for the other exported functions). */
#include "caml/mlvalues.h"
#include "caml/custom.h"
#include "caml/memory.h"
#include "caml/hash.h"
/* The implementation based on MurmurHash 3,
https://github.com/aappleby/smhasher/ */
#define ROTL32(x,n) ((x) << n | (x) >> (32-n))
#define MIX(h,d) \
d *= 0xcc9e2d51; \
d = ROTL32(d, 15); \
d *= 0x1b873593; \
h ^= d; \
h = ROTL32(h, 13); \
h = h * 5 + 0xe6546b64;
#define FINAL_MIX(h) \
h ^= h >> 16; \
h *= 0x85ebca6b; \
h ^= h >> 13; \
h *= 0xc2b2ae35; \
h ^= h >> 16;
CAMLexport uint32_t caml_hash_mix_uint32(uint32_t h, uint32_t d)
{
MIX(h, d);
return h;
}
/* Mix a platform-native integer. */
CAMLexport uint32_t caml_hash_mix_intnat(uint32_t h, intnat d)
{
uint32_t n;
#ifdef ARCH_SIXTYFOUR
/* Mix the low 32 bits and the high 32 bits, in a way that preserves
32/64 compatibility: we want n = (uint32_t) d
if d is in the range [-2^31, 2^31-1]. */
n = (d >> 32) ^ (d >> 63) ^ d;
/* If 0 <= d < 2^31: d >> 32 = 0 d >> 63 = 0
If -2^31 <= d < 0: d >> 32 = -1 d >> 63 = -1
In both cases, n = (uint32_t) d. */
#else
n = d;
#endif
MIX(h, n);
return h;
}
/* Mix a 64-bit integer. */
CAMLexport uint32_t caml_hash_mix_int64(uint32_t h, int64_t d)
{
uint32_t hi = (uint32_t) (d >> 32), lo = (uint32_t) d;
MIX(h, lo);
MIX(h, hi);
return h;
}
/* Mix a double-precision float.
Treats +0.0 and -0.0 identically.
Treats all NaNs identically.
*/
CAMLexport uint32_t caml_hash_mix_double(uint32_t hash, double d)
{
union {
double d;
#if defined(ARCH_BIG_ENDIAN) || (defined(__arm__) && !defined(__ARM_EABI__))
struct { uint32_t h; uint32_t l; } i;
#else
struct { uint32_t l; uint32_t h; } i;
#endif
} u;
uint32_t h, l;
/* Convert to two 32-bit halves */
u.d = d;
h = u.i.h; l = u.i.l;
/* Normalize NaNs */
if ((h & 0x7FF00000) == 0x7FF00000 && (l | (h & 0xFFFFF)) != 0) {
h = 0x7FF00000;
l = 0x00000001;
}
/* Normalize -0 into +0 */
else if (h == 0x80000000 && l == 0) {
h = 0;
}
MIX(hash, l);
MIX(hash, h);
return hash;
}
/* Mix a single-precision float.
Treats +0.0 and -0.0 identically.
Treats all NaNs identically.
*/
CAMLexport uint32_t caml_hash_mix_float(uint32_t hash, float d)
{
union {
float f;
uint32_t i;
} u;
uint32_t n;
/* Convert to int32_t */
u.f = d; n = u.i;
/* Normalize NaNs */
if ((n & 0x7F800000) == 0x7F800000 && (n & 0x007FFFFF) != 0) {
n = 0x7F800001;
}
/* Normalize -0 into +0 */
else if (n == 0x80000000) {
n = 0;
}
MIX(hash, n);
return hash;
}
/* Mix an OCaml string */
CAMLexport uint32_t caml_hash_mix_string(uint32_t h, value s)
{
mlsize_t len = caml_string_length(s);
mlsize_t i;
uint32_t w;
/* Mix by 32-bit blocks (little-endian) */
for (i = 0; i + 4 <= len; i += 4) {
#ifdef ARCH_BIG_ENDIAN
w = Byte_u(s, i)
| (Byte_u(s, i+1) << 8)
| (Byte_u(s, i+2) << 16)
| (Byte_u(s, i+3) << 24);
#else
w = *((uint32_t *) &Byte_u(s, i));
#endif
MIX(h, w);
}
/* Finish with up to 3 bytes */
w = 0;
switch (len & 3) {
case 3: w = Byte_u(s, i+2) << 16; /* fallthrough */
case 2: w |= Byte_u(s, i+1) << 8; /* fallthrough */
case 1: w |= Byte_u(s, i);
MIX(h, w);
default: /*skip*/; /* len & 3 == 0, no extra bytes, do nothing */
}
/* Finally, mix in the length. Ignore the upper 32 bits, generally 0. */
h ^= (uint32_t) len;
return h;
}
/* Maximal size of the queue used for breadth-first traversal. */
#define HASH_QUEUE_SIZE 256
/* Maximal number of Forward_tag links followed in one step */
#define MAX_FORWARD_DEREFERENCE 1000
/* The generic hash function */
CAMLprim value caml_hash(value count, value limit, value seed, value obj)
{
value queue[HASH_QUEUE_SIZE]; /* Queue of values to examine */
intnat rd; /* Position of first value in queue */
intnat wr; /* One past position of last value in queue */
intnat sz; /* Max number of values to put in queue */
intnat num; /* Max number of meaningful values to see */
uint32_t h; /* Rolling hash */
value v;
mlsize_t i, len;
sz = Long_val(limit);
if (sz < 0 || sz > HASH_QUEUE_SIZE) sz = HASH_QUEUE_SIZE;
num = Long_val(count);
h = Int_val(seed);
queue[0] = obj; rd = 0; wr = 1;
while (rd < wr && num > 0) {
v = queue[rd++];
again:
if (Is_long(v)) {
h = caml_hash_mix_intnat(h, v);
num--;
}
else if (!Is_in_value_area(v)) {
/* v is a pointer outside the heap, probably a code pointer.
Shall we count it? Let's say yes by compatibility with old code. */
h = caml_hash_mix_intnat(h, v);
num--;
}
else {
switch (Tag_val(v)) {
case String_tag:
h = caml_hash_mix_string(h, v);
num--;
break;
case Double_tag:
h = caml_hash_mix_double(h, Double_val(v));
num--;
break;
case Double_array_tag:
for (i = 0, len = Wosize_val(v) / Double_wosize; i < len; i++) {
h = caml_hash_mix_double(h, Double_flat_field(v, i));
num--;
if (num <= 0) break;
}
break;
case Abstract_tag:
/* Block contents unknown. Do nothing. */
break;
case Infix_tag:
/* Mix in the offset to distinguish different functions from
the same mutually-recursive definition */
h = caml_hash_mix_uint32(h, Infix_offset_val(v));
v = v - Infix_offset_val(v);
goto again;
case Forward_tag:
/* PR#6361: we can have a loop here, so limit the number of
Forward_tag links being followed */
for (i = MAX_FORWARD_DEREFERENCE; i > 0; i--) {
v = Forward_val(v);
if (Is_long(v) || !Is_in_value_area(v) || Tag_val(v) != Forward_tag)
goto again;
}
/* Give up on this object and move to the next */
break;
case Object_tag:
h = caml_hash_mix_intnat(h, Oid_val(v));
num--;
break;
case Custom_tag:
/* If no hashing function provided, do nothing. */
/* Only use low 32 bits of custom hash, for 32/64 compatibility */
if (Custom_ops_val(v)->hash != NULL) {
uint32_t n = (uint32_t) Custom_ops_val(v)->hash(v);
h = caml_hash_mix_uint32(h, n);
num--;
}
break;
#ifdef NO_NAKED_POINTERS
case Closure_tag: {
mlsize_t startenv;
len = Wosize_val(v);
startenv = Start_env_closinfo(Closinfo_val(v));
CAMLassert (startenv <= len);
/* Mix in the tag and size, but do not count this towards [num] */
h = caml_hash_mix_uint32(h, Whitehd_hd(Hd_val(v)));
/* Mix the code pointers, closure info fields, and infix headers */
for (i = 0; i < startenv; i++) {
h = caml_hash_mix_intnat(h, Field(v, i));
num--;
}
/* Copy environment fields into queue,
not exceeding the total size [sz] */
for (/*nothing*/; i < len; i++) {
if (wr >= sz) break;
queue[wr++] = Field(v, i);
}
break;
}
#endif
default:
/* Mix in the tag and size, but do not count this towards [num] */
h = caml_hash_mix_uint32(h, Whitehd_hd(Hd_val(v)));
/* Copy fields into queue, not exceeding the total size [sz] */
for (i = 0, len = Wosize_val(v); i < len; i++) {
if (wr >= sz) break;
queue[wr++] = Field(v, i);
}
break;
}
}
}
/* Final mixing of bits */
FINAL_MIX(h);
/* Fold result to the range [0, 2^30-1] so that it is a nonnegative
OCaml integer both on 32 and 64-bit platforms. */
return Val_int(h & 0x3FFFFFFFU);
}
/* Hashing variant tags */
CAMLexport value caml_hash_variant(char const * tag)
{
value accu;
/* Same hashing algorithm as in ../typing/btype.ml, function hash_variant */
for (accu = Val_int(0); *tag != 0; tag++)
accu = Val_int(223 * Int_val(accu) + *((unsigned char *) tag));
#ifdef ARCH_SIXTYFOUR
accu = accu & Val_long(0x7FFFFFFFL);
#endif
/* Force sign extension of bit 31 for compatibility between 32 and 64-bit
platforms */
return (int32_t) accu;
}