Experiment with not using a checksum

dev
Stella Lau 2017-07-24 15:26:44 -07:00
parent 08a6e9a141
commit 0295a27133
5 changed files with 151 additions and 46 deletions

View File

@ -10,6 +10,14 @@
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3)
// log2 of the size of one hash table entry (presumably in bytes, i.e. an
// 8-byte entry -- confirm against the LDM_hashEntry definition in this file).
#define LDM_HASH_ENTRY_SIZE_LOG 3
// Fixed alternative to the derived value below, left disabled.
//#define HASH_ONLY_EVERY_LOG 7
// Derived as log2(window size) - log2(table size / entry size), i.e. the log2
// of the ratio between the window size and the number of table entries.
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)))
// Mask with the low HASH_ONLY_EVERY_LOG bits set (2^HASH_ONLY_EVERY_LOG - 1).
// Presumably used to pick which positions get inserted into the hash table --
// verify against the insertion code.
#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1)
#define COMPUTE_STATS #define COMPUTE_STATS
#define OUTPUT_CONFIGURATION #define OUTPUT_CONFIGURATION
#define CHECKSUM_CHAR_OFFSET 10 #define CHECKSUM_CHAR_OFFSET 10
@ -510,6 +518,21 @@ size_t LDM_compress(const void *src, size_t srcSize,
} }
} }
/**
 * Prints the compile-time LDM configuration macros to stdout, one setting per
 * line, framed above and below by a separator line.
 */
void LDM_outputConfiguration(void) {
  static const char separator[] = "=====================\n";

  fputs(separator, stdout);
  fputs("Configuration\n", stdout);

  printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG);
  printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n",
         LDM_MIN_MATCH_LENGTH,
         LDM_HASH_LENGTH);
  printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE);
  printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG);
  printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG);
  printf("LDM_LAG %d\n", LDM_LAG);

  fputs(separator, stdout);
}
void LDM_test(const BYTE *src) { void LDM_test(const BYTE *src) {
(void)src; (void)src;
} }

View File

@ -32,23 +32,15 @@
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
//These should be multiples of four (and perhaps set to the same value?). //These should be multiples of four (and perhaps set to the same value?).
#define LDM_MIN_MATCH_LENGTH 16 #define LDM_MIN_MATCH_LENGTH 64
#define LDM_HASH_LENGTH 16 #define LDM_HASH_LENGTH 64
// Experimental. // Experimental.
//#define TMP_EVICTION //#define TMP_EVICTION // Experiment with eviction policies.
#define TMP_TAG_INSERT #define TMP_TAG_INSERT // Insertion policy based on hash.
//#define TMP_FORCE_HASH_ONLY
#define LDM_HASH_ENTRY_SIZE_LOG 3 #define USE_CHECKSUM 1
//#define USE_CHECKSUM (HASH_BUCKET_SIZE_LOG)
// Insert every (HASH_ONLY_EVERY + 1) into the hash table.
#ifdef TMP_FORCE_HASH_ONLY
#define HASH_ONLY_EVERY_LOG 7
#else
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)))
#endif
#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1)
typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_compressStats LDM_compressStats;
typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_CCtx LDM_CCtx;

View File

@ -7,9 +7,20 @@
#include "ldm.h" #include "ldm.h"
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3)
/* Hash table stuff. */ #if USE_CHECKSUM
// With a checksum, an entry holds two U32 fields (offset and checksum);
// the value 3 presumably corresponds to an 8-byte entry.
#define LDM_HASH_ENTRY_SIZE_LOG 3
#else
// Without a checksum, an entry holds only the U32 offset; the value 2
// presumably corresponds to a 4-byte entry.
#define LDM_HASH_ENTRY_SIZE_LOG 2
#endif
// Fixed alternative to the derived value below, left disabled.
//#define HASH_ONLY_EVERY_LOG 7
// Derived as log2(window size) - log2(table size / entry size), so it changes
// with the entry size selected above.
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)))
// Mask with the low HASH_ONLY_EVERY_LOG bits set. A position is inserted only
// when its offset from ibase has all of these bits set.
#define HASH_ONLY_EVERY ((1 << (HASH_ONLY_EVERY_LOG)) - 1)
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG))
@ -27,10 +38,16 @@ static const U64 prime8bytes = 11400714785074694791ULL;
// Type of the small hash used to index into the hash table. // Type of the small hash used to index into the hash table.
typedef U32 hash_t; typedef U32 hash_t;
#if USE_CHECKSUM
typedef struct LDM_hashEntry { typedef struct LDM_hashEntry {
U32 offset; U32 offset;
U32 checksum; U32 checksum;
} LDM_hashEntry; } LDM_hashEntry;
#else
// Hash table entry for builds without a checksum: only the position is kept.
typedef struct LDM_hashEntry {
// Position of the hashed data, stored relative to cctx->ibase.
U32 offset;
} LDM_hashEntry;
#endif
struct LDM_compressStats { struct LDM_compressStats {
U32 windowSizeLog, hashTableSizeLog; U32 windowSizeLog, hashTableSizeLog;
@ -39,6 +56,8 @@ struct LDM_compressStats {
U64 totalLiteralLength; U64 totalLiteralLength;
U64 totalOffset; U64 totalOffset;
U32 matchLengthHistogram[32];
U32 minOffset, maxOffset; U32 minOffset, maxOffset;
U32 offsetHistogram[32]; U32 offsetHistogram[32];
@ -262,12 +281,19 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
LDM_hashEntry *cur = bucket; LDM_hashEntry *cur = bucket;
LDM_hashEntry *bestEntry = NULL; LDM_hashEntry *bestEntry = NULL;
U64 bestMatchLength = 0; U64 bestMatchLength = 0;
#if !(USE_CHECKSUM)
(void)checksum;
#endif
for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) {
const BYTE *pMatch = cur->offset + cctx->ibase; const BYTE *pMatch = cur->offset + cctx->ibase;
// Check checksum for faster check. // Check checksum for faster check.
#if USE_CHECKSUM
if (cur->checksum == checksum && if (cur->checksum == checksum &&
cctx->ip - pMatch <= LDM_WINDOW_SIZE) { cctx->ip - pMatch <= LDM_WINDOW_SIZE) {
#else
if (cctx->ip - pMatch <= LDM_WINDOW_SIZE) {
#endif
U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend);
U64 backwardMatchLength, totalMatchLength; U64 backwardMatchLength, totalMatchLength;
@ -448,12 +474,18 @@ void LDM_printCompressStats(const LDM_compressStats *stats) {
stats->minOffset, stats->maxOffset); stats->minOffset, stats->maxOffset);
printf("\n"); printf("\n");
printf("offset histogram: offset, num matches, %% of matches\n"); printf("offset histogram | match length histogram\n");
printf("offset/ML, num matches, %% of matches | num matches, %% of matches\n");
for (; i <= intLog2(stats->maxOffset); i++) { for (; i <= intLog2(stats->maxOffset); i++) {
printf("2^%*d: %10u %6.3f%%\n", 2, i, printf("2^%*d: %10u %6.3f%% |2^%*d: %10u %6.3f \n",
2, i,
stats->offsetHistogram[i], stats->offsetHistogram[i],
100.0 * (double) stats->offsetHistogram[i] / 100.0 * (double) stats->offsetHistogram[i] /
(double) stats->numMatches,
2, i,
stats->matchLengthHistogram[i],
100.0 * (double) stats->matchLengthHistogram[i] /
(double) stats->numMatches); (double) stats->numMatches);
} }
printf("\n"); printf("\n");
@ -619,23 +651,32 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) {
// TODO: Off by one, but not important. // TODO: Off by one, but not important.
if (cctx->lagIp - cctx->ibase > 0) { if (cctx->lagIp - cctx->ibase > 0) {
U32 smallHash = getSmallHash(cctx->lagHash); U32 smallHash = getSmallHash(cctx->lagHash);
# if USE_CHECKSUM
U32 checksum = getChecksum(cctx->lagHash); U32 checksum = getChecksum(cctx->lagHash);
const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, checksum }; const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, checksum };
#ifdef TMP_EVICTION # else
HASH_insert(cctx->hashTable, smallHash, entry, cctx); const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase };
#else # endif
HASH_insert(cctx->hashTable, smallHash, entry);
#endif
} else {
U32 smallHash = getSmallHash(hash);
U32 checksum = getChecksum(hash);
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; # ifdef TMP_EVICTION
#ifdef TMP_EVICTION
HASH_insert(cctx->hashTable, smallHash, entry, cctx); HASH_insert(cctx->hashTable, smallHash, entry, cctx);
#else # else
HASH_insert(cctx->hashTable, smallHash, entry); HASH_insert(cctx->hashTable, smallHash, entry);
#endif # endif
} else {
    // lagIp has not advanced past ibase, so insert the current position
    // (cctx->ip) under the hash passed in by the caller. smallHash has to be
    // declared here: the one in the if-branch is local to that branch.
    U32 smallHash = getSmallHash(hash);
# if USE_CHECKSUM
    U32 checksum = getChecksum(hash);
    const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum };
# else
    const LDM_hashEntry entry = { cctx->ip - cctx->ibase };
# endif
# ifdef TMP_EVICTION
    HASH_insert(cctx->hashTable, smallHash, entry, cctx);
# else
    HASH_insert(cctx->hashTable, smallHash, entry);
# endif
} }
} }
#else #else
@ -646,8 +687,12 @@ static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) {
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
#endif #endif
U32 smallHash = getSmallHash(hash); U32 smallHash = getSmallHash(hash);
#if USE_CHECKSUM
U32 checksum = getChecksum(hash); U32 checksum = getChecksum(hash);
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum }; const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum };
#else
const LDM_hashEntry entry = { cctx->ip - cctx->ibase };
#endif
#ifdef TMP_EVICTION #ifdef TMP_EVICTION
HASH_insert(cctx->hashTable, smallHash, entry, cctx); HASH_insert(cctx->hashTable, smallHash, entry, cctx);
#else #else
@ -711,8 +756,11 @@ void LDM_initializeCCtx(LDM_CCtx *cctx,
cctx->anchor = cctx->ibase; cctx->anchor = cctx->ibase;
memset(&(cctx->stats), 0, sizeof(cctx->stats)); memset(&(cctx->stats), 0, sizeof(cctx->stats));
#if USE_CHECKSUM
cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64); cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64);
#else
cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32);
#endif
cctx->stats.minOffset = UINT_MAX; cctx->stats.minOffset = UINT_MAX;
cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG;
cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE;
@ -755,6 +803,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
U32 hashEveryMask; U32 hashEveryMask;
#endif #endif
setNextHash(cctx); setNextHash(cctx);
hash = cctx->nextHash; hash = cctx->nextHash;
smallHash = getSmallHash(hash); smallHash = getSmallHash(hash);
checksum = getChecksum(hash); checksum = getChecksum(hash);
@ -770,6 +819,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
} }
#ifdef TMP_TAG_INSERT #ifdef TMP_TAG_INSERT
if (hashEveryMask == HASH_ONLY_EVERY) { if (hashEveryMask == HASH_ONLY_EVERY) {
entry = HASH_getBestEntry(cctx, smallHash, checksum, entry = HASH_getBestEntry(cctx, smallHash, checksum,
forwardMatchLength, backwardMatchLength); forwardMatchLength, backwardMatchLength);
} }
@ -781,7 +831,9 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
if (entry != NULL) { if (entry != NULL) {
*match = entry->offset + cctx->ibase; *match = entry->offset + cctx->ibase;
} }
putHashOfCurrentPositionFromHash(cctx, hash); putHashOfCurrentPositionFromHash(cctx, hash);
} }
setNextHash(cctx); setNextHash(cctx);
return 0; return 0;
@ -850,6 +902,7 @@ size_t LDM_compress(const void *src, size_t srcSize,
U64 backwardsMatchLength = 0; U64 backwardsMatchLength = 0;
LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize);
#ifdef OUTPUT_CONFIGURATION #ifdef OUTPUT_CONFIGURATION
LDM_outputConfiguration(); LDM_outputConfiguration();
#endif #endif
@ -869,6 +922,7 @@ size_t LDM_compress(const void *src, size_t srcSize,
*/ */
while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength, while (LDM_findBestMatch(&cctx, &match, &forwardMatchLength,
&backwardsMatchLength) == 0) { &backwardsMatchLength) == 0) {
#ifdef COMPUTE_STATS #ifdef COMPUTE_STATS
cctx.stats.numMatches++; cctx.stats.numMatches++;
#endif #endif
@ -898,6 +952,8 @@ size_t LDM_compress(const void *src, size_t srcSize,
cctx.stats.maxOffset = cctx.stats.maxOffset =
offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset;
cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; cctx.stats.offsetHistogram[(U32)intLog2(offset)]++;
cctx.stats.matchLengthHistogram[
(U32)intLog2(matchLength + LDM_MIN_MATCH_LENGTH)]++;
#endif #endif
// Move ip to end of block, inserting hashes at each position. // Move ip to end of block, inserting hashes at each position.
@ -938,6 +994,22 @@ size_t LDM_compress(const void *src, size_t srcSize,
} }
} }
/**
 * Prints the compile-time LDM configuration macros, including USE_CHECKSUM,
 * to stdout, one setting per line, framed above and below by a separator line.
 */
void LDM_outputConfiguration(void) {
  static const char separator[] = "=====================\n";

  fputs(separator, stdout);
  fputs("Configuration\n", stdout);

  printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG);
  printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n",
         LDM_MIN_MATCH_LENGTH,
         LDM_HASH_LENGTH);
  printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE);
  printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG);
  printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG);
  printf("LDM_LAG %d\n", LDM_LAG);
  printf("USE_CHECKSUM %d\n", USE_CHECKSUM);

  fputs(separator, stdout);
}
// TODO: implement and test hash function // TODO: implement and test hash function
void LDM_test(const BYTE *src) { void LDM_test(const BYTE *src) {
const U32 diff = 100; const U32 diff = 100;

View File

@ -2,19 +2,6 @@
#include "ldm.h" #include "ldm.h"
/**
 * Prints the compile-time LDM configuration macros to stdout, one setting per
 * line, framed above and below by a separator line.
 */
void LDM_outputConfiguration(void) {
  static const char separator[] = "=====================\n";

  fputs(separator, stdout);
  fputs("Configuration\n", stdout);

  printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG);
  printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n",
         LDM_MIN_MATCH_LENGTH,
         LDM_HASH_LENGTH);
  printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE);
  printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG);
  printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG);
  printf("LDM_LAG %d\n", LDM_LAG);

  fputs(separator, stdout);
}
void LDM_readHeader(const void *src, U64 *compressedSize, void LDM_readHeader(const void *src, U64 *compressedSize,
U64 *decompressedSize) { U64 *decompressedSize) {
const BYTE *ip = (const BYTE *)src; const BYTE *ip = (const BYTE *)src;

View File

@ -7,10 +7,16 @@
#include "ldm.h" #include "ldm.h"
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
#define LDM_HASH_ENTRY_SIZE_LOG 3
#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3)
// log2 of the size of one hash table entry (presumably in bytes, i.e. an
// 8-byte entry -- confirm against the LDM_hashEntry definition in this file).
#define LDM_HASH_ENTRY_SIZE_LOG 3
// Fixed alternative to the derived value below, left disabled.
//#define HASH_ONLY_EVERY_LOG 7
// Derived as log2(window size) - log2(table size / entry size), i.e. the log2
// of the ratio between the window size and the number of table entries.
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)))
// Mask with the low HASH_ONLY_EVERY_LOG bits set (2^HASH_ONLY_EVERY_LOG - 1).
// Presumably used to pick which positions get inserted into the hash table --
// verify against the insertion code.
#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1)
/* Hash table stuff. */ /* Hash table stuff. */
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG))
@ -38,6 +44,8 @@ struct LDM_compressStats {
U64 totalLiteralLength; U64 totalLiteralLength;
U64 totalOffset; U64 totalOffset;
U32 matchLengthHistogram[32];
U32 minOffset, maxOffset; U32 minOffset, maxOffset;
U32 offsetHistogram[32]; U32 offsetHistogram[32];
@ -358,12 +366,18 @@ void LDM_printCompressStats(const LDM_compressStats *stats) {
stats->minOffset, stats->maxOffset); stats->minOffset, stats->maxOffset);
printf("\n"); printf("\n");
printf("offset histogram: offset, num matches, %% of matches\n"); printf("offset histogram | match length histogram\n");
printf("offset/ML, num matches, %% of matches | num matches, %% of matches\n");
for (; i <= intLog2(stats->maxOffset); i++) { for (; i <= intLog2(stats->maxOffset); i++) {
printf("2^%*d: %10u %6.3f%%\n", 2, i, printf("2^%*d: %10u %6.3f%% |2^%*d: %10u %6.3f \n",
2, i,
stats->offsetHistogram[i], stats->offsetHistogram[i],
100.0 * (double) stats->offsetHistogram[i] / 100.0 * (double) stats->offsetHistogram[i] /
(double) stats->numMatches,
2, i,
stats->matchLengthHistogram[i],
100.0 * (double) stats->matchLengthHistogram[i] /
(double) stats->numMatches); (double) stats->numMatches);
} }
printf("\n"); printf("\n");
@ -742,6 +756,8 @@ size_t LDM_compress(const void *src, size_t srcSize,
cctx.stats.maxOffset = cctx.stats.maxOffset =
offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset; offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset;
cctx.stats.offsetHistogram[(U32)intLog2(offset)]++; cctx.stats.offsetHistogram[(U32)intLog2(offset)]++;
cctx.stats.matchLengthHistogram[
(U32)intLog2(matchLength + LDM_MIN_MATCH_LENGTH)]++;
#endif #endif
// Move ip to end of block, inserting hashes at each position. // Move ip to end of block, inserting hashes at each position.
@ -784,6 +800,21 @@ size_t LDM_compress(const void *src, size_t srcSize,
} }
} }
/**
 * Prints the compile-time LDM configuration macros to stdout, one setting per
 * line, framed above and below by a separator line.
 */
void LDM_outputConfiguration(void) {
  static const char separator[] = "=====================\n";

  fputs(separator, stdout);
  fputs("Configuration\n", stdout);

  printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG);
  printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n",
         LDM_MIN_MATCH_LENGTH,
         LDM_HASH_LENGTH);
  printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE);
  printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG);
  printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG);
  printf("LDM_LAG %d\n", LDM_LAG);

  fputs(separator, stdout);
}
// TODO: implement and test hash function // TODO: implement and test hash function
void LDM_test(const BYTE *src) { void LDM_test(const BYTE *src) {
(void)src; (void)src;