Minor clean up
This commit is contained in:
parent
1a188fe864
commit
eb16da647d
@ -13,21 +13,26 @@
|
|||||||
// Defines the size of the hash table.
|
// Defines the size of the hash table.
|
||||||
// Note that this is not the number of buckets.
|
// Note that this is not the number of buckets.
|
||||||
// Currently this should be less than WINDOW_SIZE_LOG + 4?
|
// Currently this should be less than WINDOW_SIZE_LOG + 4?
|
||||||
#define LDM_MEMORY_USAGE 22
|
#define LDM_MEMORY_USAGE 24
|
||||||
#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now
|
#define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now
|
||||||
|
|
||||||
// Defines the lag in inserting elements into the hash table.
|
// Defines the lag in inserting elements into the hash table.
|
||||||
#define LDM_LAG 0
|
#define LDM_LAG 0
|
||||||
|
|
||||||
#define LDM_WINDOW_SIZE_LOG 28
|
#define LDM_WINDOW_SIZE_LOG 28 // Max value is 30
|
||||||
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
|
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
|
||||||
|
|
||||||
//These should be multiples of four (and perhaps set to the same value?).
|
//These should be multiples of four (and perhaps set to the same value?).
|
||||||
#define LDM_MIN_MATCH_LENGTH 64
|
#define LDM_MIN_MATCH_LENGTH 64
|
||||||
#define LDM_HASH_LENGTH 64
|
#define LDM_HASH_LENGTH 64
|
||||||
|
|
||||||
#define TMP_EVICTION
|
// Experimental.
|
||||||
|
//:w
|
||||||
|
//#define TMP_EVICTION
|
||||||
#define TMP_TAG_INSERT
|
#define TMP_TAG_INSERT
|
||||||
|
//#define TMP_SIMPLE_LOWER
|
||||||
|
//#define TMP_FORCE_HASH_ONLY
|
||||||
|
|
||||||
typedef struct LDM_compressStats LDM_compressStats;
|
typedef struct LDM_compressStats LDM_compressStats;
|
||||||
typedef struct LDM_CCtx LDM_CCtx;
|
typedef struct LDM_CCtx LDM_CCtx;
|
||||||
typedef struct LDM_DCtx LDM_DCtx;
|
typedef struct LDM_DCtx LDM_DCtx;
|
||||||
|
@ -12,7 +12,11 @@
|
|||||||
#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3)
|
#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3)
|
||||||
|
|
||||||
// Insert every (HASH_ONLY_EVERY + 1) into the hash table.
|
// Insert every (HASH_ONLY_EVERY + 1) into the hash table.
|
||||||
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG)))
|
#ifdef TMP_FORCE_HASH_ONLY
|
||||||
|
#define HASH_ONLY_EVERY_LOG 7
|
||||||
|
#else
|
||||||
|
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG)))
|
||||||
|
#endif
|
||||||
#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1)
|
#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1)
|
||||||
|
|
||||||
/* Hash table stuff. */
|
/* Hash table stuff. */
|
||||||
@ -26,12 +30,15 @@
|
|||||||
|
|
||||||
#define COMPUTE_STATS
|
#define COMPUTE_STATS
|
||||||
#define OUTPUT_CONFIGURATION
|
#define OUTPUT_CONFIGURATION
|
||||||
#define CHECKSUM_CHAR_OFFSET 10
|
#define CHECKSUM_CHAR_OFFSET 1
|
||||||
|
|
||||||
// Take first match only.
|
// Take first match only.
|
||||||
//#define ZSTD_SKIP
|
//#define ZSTD_SKIP
|
||||||
|
|
||||||
//#define RUN_CHECKS
|
//#define RUN_CHECKS
|
||||||
|
//
|
||||||
|
//
|
||||||
|
static const U64 prime8bytes = 11400714785074694791ULL;
|
||||||
|
|
||||||
/* Hash table stuff */
|
/* Hash table stuff */
|
||||||
|
|
||||||
@ -56,6 +63,14 @@ struct LDM_compressStats {
|
|||||||
U32 numHashInserts;
|
U32 numHashInserts;
|
||||||
|
|
||||||
U32 offsetHistogram[32];
|
U32 offsetHistogram[32];
|
||||||
|
|
||||||
|
U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG];
|
||||||
|
U64 TMP_totalHashCount;
|
||||||
|
|
||||||
|
U64 TMP_totalInWindow;
|
||||||
|
U64 TMP_totalInserts;
|
||||||
|
|
||||||
|
U64 TMP_matchCount;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct LDM_hashTable LDM_hashTable;
|
typedef struct LDM_hashTable LDM_hashTable;
|
||||||
@ -311,10 +326,80 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
|
|||||||
#ifdef TMP_EVICTION
|
#ifdef TMP_EVICTION
|
||||||
|
|
||||||
void HASH_insert(LDM_hashTable *table,
|
void HASH_insert(LDM_hashTable *table,
|
||||||
const hash_t hash, const LDM_hashEntry entry) {
|
const hash_t hash, const LDM_hashEntry entry,
|
||||||
|
LDM_CCtx *cctx) {
|
||||||
|
// Overwrite based on part of checksum.
|
||||||
|
/*
|
||||||
|
LDM_hashEntry *toOverwrite =
|
||||||
|
getBucket(table, hash) + table->bucketOffsets[hash];
|
||||||
|
const BYTE *pMatch = toOverwrite->offset + cctx->ibase;
|
||||||
|
if (toOverwrite->offset != 0 &&
|
||||||
|
cctx->ip - pMatch <= LDM_WINDOW_SIZE) {
|
||||||
|
cctx->stats.TMP_totalInWindow++;
|
||||||
|
}
|
||||||
|
|
||||||
|
cctx->stats.TMP_totalInserts++;
|
||||||
|
*(toOverwrite) = entry;
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
int i;
|
||||||
|
LDM_hashEntry *bucket = getBucket(table, hash);
|
||||||
|
for (i = 0; i < HASH_BUCKET_SIZE; i++) {
|
||||||
|
if (bucket[i].checksum == entry.checksum) {
|
||||||
|
bucket[i] = entry;
|
||||||
|
cctx->stats.TMP_matchCount++;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Find entry beyond window size, replace. Else, random.
|
||||||
|
int i;
|
||||||
|
LDM_hashEntry *bucket = getBucket(table, hash);
|
||||||
|
for (i = 0; i < HASH_BUCKET_SIZE; i++) {
|
||||||
|
if (cctx->ip - cctx->ibase - bucket[i].offset > LDM_WINDOW_SIZE) {
|
||||||
|
bucket[i] = entry;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
i = rand() & (HASH_BUCKET_SIZE - 1);
|
||||||
|
*(bucket + i) = entry;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sliding buffer style pointer
|
||||||
|
* Keep old entry as temporary. If the old entry is outside the window,
|
||||||
|
* overwrite and we are done.
|
||||||
|
*
|
||||||
|
* Backwards (insert at x):
|
||||||
|
* x, a, b b, c c c c, d d d d d d d d
|
||||||
|
* x, d d d d d d d d, c c c c, b b, a
|
||||||
|
*
|
||||||
|
* Else, find something to evict.
|
||||||
|
* If old entry has more ones, it takes
|
||||||
|
* the next spot. <-- reversed order?
|
||||||
|
*
|
||||||
|
* If window size > LDM_WINDOW_SIZE,
|
||||||
|
* overwrite,
|
||||||
|
*
|
||||||
|
* Insert forwards. If > tag, keep. Else evict.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
*(getBucket(table, hash) + table->bucketOffsets[hash]) = entry;
|
*(getBucket(table, hash) + table->bucketOffsets[hash]) = entry;
|
||||||
table->bucketOffsets[hash]++;
|
table->bucketOffsets[hash]++;
|
||||||
table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1;
|
table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1;
|
||||||
|
*/
|
||||||
|
|
||||||
|
// U16 mask = entry.checksum & (HASH_BUCKET_SIZE - 1);
|
||||||
|
// *(getBucket(table, hash) + mask) = entry;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
@ -348,8 +433,9 @@ void HASH_outputTableOccupancy(const LDM_hashTable *table) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("Num buckets, bucket size: %d, %d\n",
|
// TODO: repeat numBuckets as a check for now.
|
||||||
table->numBuckets, HASH_BUCKET_SIZE);
|
printf("Num buckets, bucket size: %d (2^%d), %d\n",
|
||||||
|
table->numBuckets, LDM_HASHLOG, HASH_BUCKET_SIZE);
|
||||||
printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n",
|
printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n",
|
||||||
table->numEntries, ctr,
|
table->numEntries, ctr,
|
||||||
100.0 * (double)(ctr) / table->numEntries);
|
100.0 * (double)(ctr) / table->numEntries);
|
||||||
@ -396,6 +482,24 @@ void LDM_printCompressStats(const LDM_compressStats *stats) {
|
|||||||
(double) stats->numMatches);
|
(double) stats->numMatches);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
#ifdef TMP_TAG_INSERT
|
||||||
|
/*
|
||||||
|
printf("Lower bit distribution\n");
|
||||||
|
for (i = 0; i < (1 << HASH_ONLY_EVERY_LOG); i++) {
|
||||||
|
printf("%5d %5llu %6.3f\n", i, stats->TMP_hashCount[i],
|
||||||
|
100.0 * (double) stats->TMP_hashCount[i] /
|
||||||
|
(double) stats->TMP_totalHashCount);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef TMP_EVICTION
|
||||||
|
printf("Evicted something in window: %llu %6.3f\n",
|
||||||
|
stats->TMP_totalInWindow,
|
||||||
|
100.0 * (double)stats->TMP_totalInWindow /
|
||||||
|
(double)stats->TMP_totalInserts);
|
||||||
|
printf("Match count: %llu\n", stats->TMP_matchCount);
|
||||||
|
#endif
|
||||||
printf("=====================\n");
|
printf("=====================\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -418,7 +522,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Upper LDM_HASH_LOG bits.
|
// Upper LDM_HASHLOG bits.
|
||||||
static hash_t checksumToHash(U64 sum) {
|
static hash_t checksumToHash(U64 sum) {
|
||||||
return sum >> (64 - LDM_HASHLOG);
|
return sum >> (64 - LDM_HASHLOG);
|
||||||
}
|
}
|
||||||
@ -428,9 +532,30 @@ static U32 checksumFromHfHash(U64 hfHash) {
|
|||||||
return (hfHash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF;
|
return (hfHash >> (64 - 32 - LDM_HASHLOG)) & 0xFFFFFFFF;
|
||||||
}
|
}
|
||||||
|
|
||||||
static U64 getChecksum(const BYTE *buf, U32 len) {
|
#ifdef TMP_TAG_INSERT
|
||||||
static const U64 prime8bytes = 11400714785074694791ULL;
|
static U32 lowerBitsFromHfHash(U64 hfHash) {
|
||||||
|
// The number of bits used so far is LDM_HASHLOG + 32.
|
||||||
|
// So there are 32 - LDM_HASHLOG bits left.
|
||||||
|
// Occasional hashing requires HASH_ONLY_EVERY_LOG bits.
|
||||||
|
// So if 32 - LDMHASHLOG < HASH_ONLY_EVERY_LOG, just return lower bits
|
||||||
|
// allowing for reuse of bits.
|
||||||
|
#ifdef TMP_SIMPLE_LOWER
|
||||||
|
return hfHash & HASH_ONLY_EVERY;
|
||||||
|
#else
|
||||||
|
if (32 - LDM_HASHLOG < HASH_ONLY_EVERY_LOG) {
|
||||||
|
return hfHash & HASH_ONLY_EVERY;
|
||||||
|
} else {
|
||||||
|
// Otherwise shift by (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG) bits first.
|
||||||
|
return (hfHash >> (32 - LDM_HASHLOG - HASH_ONLY_EVERY_LOG)) &
|
||||||
|
HASH_ONLY_EVERY;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
static U64 getChecksum(const BYTE *buf, U32 len) {
|
||||||
U64 ret = 0;
|
U64 ret = 0;
|
||||||
U32 i;
|
U32 i;
|
||||||
for (i = 0; i < len; i++) {
|
for (i = 0; i < len; i++) {
|
||||||
@ -455,11 +580,8 @@ static U64 ipow(U64 base, U64 exp) {
|
|||||||
|
|
||||||
static U64 updateChecksum(U64 sum, U32 len,
|
static U64 updateChecksum(U64 sum, U32 len,
|
||||||
BYTE toRemove, BYTE toAdd) {
|
BYTE toRemove, BYTE toAdd) {
|
||||||
// TODO: deduplicate.
|
|
||||||
static const U64 prime8bytes = 11400714785074694791ULL;
|
|
||||||
|
|
||||||
// TODO: relying on compiler optimization here.
|
// TODO: relying on compiler optimization here.
|
||||||
// The exponential can be calculated explicitly.
|
// The exponential can (should?) be calculated explicitly.
|
||||||
sum -= ((toRemove + CHECKSUM_CHAR_OFFSET) *
|
sum -= ((toRemove + CHECKSUM_CHAR_OFFSET) *
|
||||||
ipow(prime8bytes, len - 1));
|
ipow(prime8bytes, len - 1));
|
||||||
sum *= prime8bytes;
|
sum *= prime8bytes;
|
||||||
@ -492,6 +614,14 @@ static void setNextHash(LDM_CCtx *cctx) {
|
|||||||
cctx->lastPosHashed[LDM_HASH_LENGTH]);
|
cctx->lastPosHashed[LDM_HASH_LENGTH]);
|
||||||
cctx->nextPosHashed = cctx->nextIp;
|
cctx->nextPosHashed = cctx->nextIp;
|
||||||
|
|
||||||
|
#ifdef TMP_TAG_INSERT
|
||||||
|
{
|
||||||
|
U32 hashEveryMask = lowerBitsFromHfHash(cctx->nextSum);
|
||||||
|
cctx->stats.TMP_totalHashCount++;
|
||||||
|
cctx->stats.TMP_hashCount[hashEveryMask]++;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if LDM_LAG
|
#if LDM_LAG
|
||||||
// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp);
|
// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp);
|
||||||
if (cctx->ip - cctx->ibase > LDM_LAG) {
|
if (cctx->ip - cctx->ibase > LDM_LAG) {
|
||||||
@ -520,31 +650,48 @@ static void setNextHash(LDM_CCtx *cctx) {
|
|||||||
static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hfHash) {
|
static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hfHash) {
|
||||||
// Hash only every HASH_ONLY_EVERY times, based on cctx->ip.
|
// Hash only every HASH_ONLY_EVERY times, based on cctx->ip.
|
||||||
// Note: this works only when cctx->step is 1.
|
// Note: this works only when cctx->step is 1.
|
||||||
// printf("TMP %u %u %llu\n", hash, sum, hfHash);
|
|
||||||
|
|
||||||
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
|
|
||||||
|
|
||||||
#if LDM_LAG
|
#if LDM_LAG
|
||||||
// TODO: off by 1, but whatever
|
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
|
||||||
|
// TODO: off by 1, but whatever.
|
||||||
if (cctx->lagIp - cctx->ibase > 0) {
|
if (cctx->lagIp - cctx->ibase > 0) {
|
||||||
U32 hash = checksumToHash(cctx->lagSum);
|
U32 hash = checksumToHash(cctx->lagSum);
|
||||||
U32 sum = checksumFromHfHash(cctx->lagSum);
|
U32 sum = checksumFromHfHash(cctx->lagSum);
|
||||||
const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, sum };
|
const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, sum };
|
||||||
|
#ifdef TMP_EVICTION
|
||||||
|
HASH_insert(cctx->hashTable, hash, entry, cctx);
|
||||||
|
#else
|
||||||
HASH_insert(cctx->hashTable, hash, entry);
|
HASH_insert(cctx->hashTable, hash, entry);
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
U32 hash = checksumToHash(hfHash);
|
U32 hash = checksumToHash(hfHash);
|
||||||
U32 sum = checksumFromHfHash(hfHash);
|
U32 sum = checksumFromHfHash(hfHash);
|
||||||
|
|
||||||
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
|
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
|
||||||
HASH_insert(cctx->hashTable, hash, entry);
|
#ifdef TMP_EVICTION
|
||||||
}
|
HASH_insert(cctx->hashTable, hash, entry, cctx);
|
||||||
#else
|
#else
|
||||||
|
HASH_insert(cctx->hashTable, hash, entry);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
#ifdef TMP_TAG_INSERT
|
||||||
|
U32 hashEveryMask = lowerBitsFromHfHash(hfHash);
|
||||||
|
// TODO: look at stats.
|
||||||
|
if (hashEveryMask == HASH_ONLY_EVERY) {
|
||||||
|
#else
|
||||||
|
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
|
||||||
|
#endif
|
||||||
U32 hash = checksumToHash(hfHash);
|
U32 hash = checksumToHash(hfHash);
|
||||||
U32 sum = checksumFromHfHash(hfHash);
|
U32 sum = checksumFromHfHash(hfHash);
|
||||||
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
|
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
|
||||||
|
#ifdef TMP_EVICTION
|
||||||
|
HASH_insert(cctx->hashTable, hash, entry, cctx);
|
||||||
|
#else
|
||||||
HASH_insert(cctx->hashTable, hash, entry);
|
HASH_insert(cctx->hashTable, hash, entry);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
cctx->lastPosHashed = cctx->ip;
|
cctx->lastPosHashed = cctx->ip;
|
||||||
cctx->lastSum = hfHash;
|
cctx->lastSum = hfHash;
|
||||||
@ -676,10 +823,16 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
|
|||||||
hash_t h;
|
hash_t h;
|
||||||
U64 hash;
|
U64 hash;
|
||||||
U32 sum;
|
U32 sum;
|
||||||
|
#ifdef TMP_TAG_INSERT
|
||||||
|
U32 hashEveryMask;
|
||||||
|
#endif
|
||||||
setNextHash(cctx);
|
setNextHash(cctx);
|
||||||
hash = cctx->nextSum;
|
hash = cctx->nextSum;
|
||||||
h = checksumToHash(hash);
|
h = checksumToHash(hash);
|
||||||
sum = checksumFromHfHash(hash);
|
sum = checksumFromHfHash(hash);
|
||||||
|
#ifdef TMP_TAG_INSERT
|
||||||
|
hashEveryMask = lowerBitsFromHfHash(hash);
|
||||||
|
#endif
|
||||||
|
|
||||||
cctx->ip = cctx->nextIp;
|
cctx->ip = cctx->nextIp;
|
||||||
cctx->nextIp += cctx->step;
|
cctx->nextIp += cctx->step;
|
||||||
@ -687,9 +840,15 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
|
|||||||
if (cctx->ip > cctx->imatchLimit) {
|
if (cctx->ip > cctx->imatchLimit) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
#ifdef TMP_TAG_INSERT
|
||||||
|
if (hashEveryMask == HASH_ONLY_EVERY) {
|
||||||
|
entry = HASH_getBestEntry(cctx, h, sum,
|
||||||
|
forwardMatchLength, backwardMatchLength);
|
||||||
|
}
|
||||||
|
#else
|
||||||
entry = HASH_getBestEntry(cctx, h, sum,
|
entry = HASH_getBestEntry(cctx, h, sum,
|
||||||
forwardMatchLength, backwardMatchLength);
|
forwardMatchLength, backwardMatchLength);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (entry != NULL) {
|
if (entry != NULL) {
|
||||||
*match = entry->offset + cctx->ibase;
|
*match = entry->offset + cctx->ibase;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user