Minor clean up

This commit is contained in:
Stella Lau 2017-07-24 10:18:58 -07:00
parent 1a188fe864
commit eb16da647d
2 changed files with 188 additions and 24 deletions

View File

@ -13,21 +13,26 @@
// Defines the size of the hash table.
// Note that this is not the number of buckets.
// Currently this should be less than WINDOW_SIZE_LOG + 4?
#define LDM_MEMORY_USAGE 22
#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now
#define LDM_MEMORY_USAGE 24
#define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now
// Defines the lag in inserting elements into the hash table.
#define LDM_LAG 0
#define LDM_WINDOW_SIZE_LOG 28
#define LDM_WINDOW_SIZE_LOG 28 // Max value is 30
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
//These should be multiples of four (and perhaps set to the same value?).
#define LDM_MIN_MATCH_LENGTH 64
#define LDM_HASH_LENGTH 64
#define TMP_EVICTION
// Experimental.
//:w
//#define TMP_EVICTION
#define TMP_TAG_INSERT
//#define TMP_SIMPLE_LOWER
//#define TMP_FORCE_HASH_ONLY
typedef struct LDM_compressStats LDM_compressStats;
typedef struct LDM_CCtx LDM_CCtx;
typedef struct LDM_DCtx LDM_DCtx;

View File

@ -12,7 +12,11 @@
#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3)
// Insert every (HASH_ONLY_EVERY + 1) into the hash table.
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG)))
#ifdef TMP_FORCE_HASH_ONLY
#define HASH_ONLY_EVERY_LOG 7
#else
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG)))
#endif
#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1)
/* Hash table stuff. */
@ -26,12 +30,15 @@
#define COMPUTE_STATS
#define OUTPUT_CONFIGURATION
#define CHECKSUM_CHAR_OFFSET 10
#define CHECKSUM_CHAR_OFFSET 1
// Take first match only.
//#define ZSTD_SKIP
//#define RUN_CHECKS
//
//
static const U64 prime8bytes = 11400714785074694791ULL;
/* Hash table stuff */
@ -56,6 +63,14 @@ struct LDM_compressStats {
U32 numHashInserts;
U32 offsetHistogram[32];
U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG];
U64 TMP_totalHashCount;
U64 TMP_totalInWindow;
U64 TMP_totalInserts;
U64 TMP_matchCount;
};
typedef struct LDM_hashTable LDM_hashTable;
@ -311,10 +326,80 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
#ifdef TMP_EVICTION
void HASH_insert(LDM_hashTable *table,
const hash_t hash, const LDM_hashEntry entry) {
const hash_t hash, const LDM_hashEntry entry,
LDM_CCtx *cctx) {
// Overwrite based on part of checksum.
/*
LDM_hashEntry *toOverwrite =
getBucket(table, hash) + table->bucketOffsets[hash];
const BYTE *pMatch = toOverwrite->offset + cctx->ibase;
if (toOverwrite->offset != 0 &&
cctx->ip - pMatch <= LDM_WINDOW_SIZE) {
cctx->stats.TMP_totalInWindow++;
}
cctx->stats.TMP_totalInserts++;
*(toOverwrite) = entry;
*/
/*
int i;
LDM_hashEntry *bucket = getBucket(table, hash);
for (i = 0; i < HASH_BUCKET_SIZE; i++) {
if (bucket[i].checksum == entry.checksum) {
bucket[i] = entry;
cctx->stats.TMP_matchCount++;
return;
}
}
*/
// Find entry beyond window size, replace. Else, random.
int i;
LDM_hashEntry *bucket = getBucket(table, hash);
for (i = 0; i < HASH_BUCKET_SIZE; i++) {
if (cctx->ip - cctx->ibase - bucket[i].offset > LDM_WINDOW_SIZE) {
bucket[i] = entry;
return;
}
}
i = rand() & (HASH_BUCKET_SIZE - 1);
*(bucket + i) = entry;
/**
* Sliding buffer style pointer
* Keep old entry as temporary. If the old entry is outside the window,
* overwrite and we are done.
*
* Backwards (insert at x):
* x, a, b b, c c c c, d d d d d d d d
* x, d d d d d d d d, c c c c, b b, a
*
* Else, find something to evict.
* If old entry has more ones, it takes
* the next spot. <-- reversed order?
*
* If window size > LDM_WINDOW_SIZE,
* overwrite,
*
* Insert forwards. If > tag, keep. Else evict.
*
*
*
*
*/
/*
*(getBucket(table, hash) + table->bucketOffsets[hash]) = entry;
table->bucketOffsets[hash]++;
table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1;
*/
// U16 mask = entry.checksum & (HASH_BUCKET_SIZE - 1);
// *(getBucket(table, hash) + mask) = entry;
}
#else
@ -348,8 +433,9 @@ void HASH_outputTableOccupancy(const LDM_hashTable *table) {
}
}
printf("Num buckets, bucket size: %d, %d\n",
table->numBuckets, HASH_BUCKET_SIZE);
// TODO: repeat numBuckets as a check for now.
printf("Num buckets, bucket size: %d (2^%d), %d\n",
table->numBuckets, LDM_HASHLOG, HASH_BUCKET_SIZE);
printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n",
table->numEntries, ctr,
100.0 * (double)(ctr) / table->numEntries);
@ -396,6 +482,24 @@ void LDM_printCompressStats(const LDM_compressStats *stats) {
(double) stats->numMatches);
}
printf("\n");
#ifdef TMP_TAG_INSERT
/*
printf("Lower bit distribution\n");
for (i = 0; i < (1 << HASH_ONLY_EVERY_LOG); i++) {
printf("%5d %5llu %6.3f\n", i, stats->TMP_hashCount[i],
100.0 * (double) stats->TMP_hashCount[i] /
(double) stats->TMP_totalHashCount);
}
*/
#endif
#ifdef TMP_EVICTION
printf("Evicted something in window: %llu %6.3f\n",
stats->TMP_totalInWindow,
100.0 * (double)stats->TMP_totalInWindow /
(double)stats->TMP_totalInserts);
printf("Match count: %llu\n", stats->TMP_matchCount);
#endif
printf("=====================\n");
}
@ -418,7 +522,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
return 1;
}
// Upper LDM_HASH_LOG bits.
// Upper LDM_HASHLOG bits.
static hash_t checksumToHash(U64 sum) {
return sum >> (64 - LDM_HASHLOG);
}
@ -428,9 +532,30 @@ static U32 checksumFromHfHash(U64 hfHash) {
return (hfHash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF;
}
static U64 getChecksum(const BYTE *buf, U32 len) {
static const U64 prime8bytes = 11400714785074694791ULL;
#ifdef TMP_TAG_INSERT
static U32 lowerBitsFromHfHash(U64 hfHash) {
// The number of bits used so far is LDM_HASHLOG + 32.
// So there are 32 - LDM_HASHLOG bits left.
// Occasional hashing requires HASH_ONLY_EVERY_LOG bits.
// So if 32 - LDMHASHLOG < HASH_ONLY_EVERY_LOG, just return lower bits
// allowing for reuse of bits.
#ifdef TMP_SIMPLE_LOWER
return hfHash & HASH_ONLY_EVERY;
#else
if (32 - LDM_HASHLOG < HASH_ONLY_EVERY_LOG) {
return hfHash & HASH_ONLY_EVERY;
} else {
// Otherwise shift by (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG) bits first.
return (hfHash >> (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG)) &
HASH_ONLY_EVERY;
}
#endif
}
#endif
static U64 getChecksum(const BYTE *buf, U32 len) {
U64 ret = 0;
U32 i;
for (i = 0; i < len; i++) {
@ -455,11 +580,8 @@ static U64 ipow(U64 base, U64 exp) {
static U64 updateChecksum(U64 sum, U32 len,
BYTE toRemove, BYTE toAdd) {
// TODO: deduplicate.
static const U64 prime8bytes = 11400714785074694791ULL;
// TODO: relying on compiler optimization here.
// The exponential can be calculated explicitly.
// The exponential can (should?) be calculated explicitly.
sum -= ((toRemove + CHECKSUM_CHAR_OFFSET) *
ipow(prime8bytes, len - 1));
sum *= prime8bytes;
@ -492,6 +614,14 @@ static void setNextHash(LDM_CCtx *cctx) {
cctx->lastPosHashed[LDM_HASH_LENGTH]);
cctx->nextPosHashed = cctx->nextIp;
#ifdef TMP_TAG_INSERT
{
U32 hashEveryMask = lowerBitsFromHfHash(cctx->nextSum);
cctx->stats.TMP_totalHashCount++;
cctx->stats.TMP_hashCount[hashEveryMask]++;
}
#endif
#if LDM_LAG
// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp);
if (cctx->ip - cctx->ibase > LDM_LAG) {
@ -520,31 +650,48 @@ static void setNextHash(LDM_CCtx *cctx) {
static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hfHash) {
// Hash only every HASH_ONLY_EVERY times, based on cctx->ip.
// Note: this works only when cctx->step is 1.
// printf("TMP %u %u %llu\n", hash, sum, hfHash);
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
#if LDM_LAG
// TODO: off by 1, but whatever
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
// TODO: off by 1, but whatever.
if (cctx->lagIp - cctx->ibase > 0) {
U32 hash = checksumToHash(cctx->lagSum);
U32 sum = checksumFromHfHash(cctx->lagSum);
const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, sum };
#ifdef TMP_EVICTION
HASH_insert(cctx->hashTable, hash, entry, cctx);
#else
HASH_insert(cctx->hashTable, hash, entry);
#endif
} else {
U32 hash = checksumToHash(hfHash);
U32 sum = checksumFromHfHash(hfHash);
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
HASH_insert(cctx->hashTable, hash, entry);
}
#ifdef TMP_EVICTION
HASH_insert(cctx->hashTable, hash, entry, cctx);
#else
HASH_insert(cctx->hashTable, hash, entry);
#endif
}
}
#else
#ifdef TMP_TAG_INSERT
U32 hashEveryMask = lowerBitsFromHfHash(hfHash);
// TODO: look at stats.
if (hashEveryMask == HASH_ONLY_EVERY) {
#else
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
#endif
U32 hash = checksumToHash(hfHash);
U32 sum = checksumFromHfHash(hfHash);
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
#ifdef TMP_EVICTION
HASH_insert(cctx->hashTable, hash, entry, cctx);
#else
HASH_insert(cctx->hashTable, hash, entry);
#endif
}
#endif
cctx->lastPosHashed = cctx->ip;
cctx->lastSum = hfHash;
@ -676,10 +823,16 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
hash_t h;
U64 hash;
U32 sum;
#ifdef TMP_TAG_INSERT
U32 hashEveryMask;
#endif
setNextHash(cctx);
hash = cctx->nextSum;
h = checksumToHash(hash);
sum = checksumFromHfHash(hash);
#ifdef TMP_TAG_INSERT
hashEveryMask = lowerBitsFromHfHash(hash);
#endif
cctx->ip = cctx->nextIp;
cctx->nextIp += cctx->step;
@ -687,9 +840,15 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
if (cctx->ip > cctx->imatchLimit) {
return 1;
}
#ifdef TMP_TAG_INSERT
if (hashEveryMask == HASH_ONLY_EVERY) {
entry = HASH_getBestEntry(cctx, h, sum,
forwardMatchLength, backwardMatchLength);
}
#else
entry = HASH_getBestEntry(cctx, h, sum,
forwardMatchLength, backwardMatchLength);
#endif
if (entry != NULL) {
*match = entry->offset + cctx->ibase;