Minor refactoring
parent
030264ca51
commit
2427a154cb
|
@ -1,109 +0,0 @@
|
|||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "ldm.h"
|
||||
#include "ldm_hashtable.h"
|
||||
#include "mem.h"
|
||||
|
||||
#define LDM_HASHLOG ((LDM_MEMORY_USAGE) - 4)
|
||||
|
||||
struct LDM_hashTable {
|
||||
U32 size;
|
||||
LDM_hashEntry *entries;
|
||||
const BYTE *offsetBase;
|
||||
};
|
||||
|
||||
/**
 * Allocates a hash table holding `size` zero-initialized entries.
 *
 * offsetBase is stored in the table; HASH_getValidEntry adds an entry's
 * offset to it to compute the match position.
 *
 * Returns the new table, or NULL if either allocation fails (nothing is
 * leaked in that case). Release a successfully created table with
 * HASH_destroyTable.
 */
LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase) {
  LDM_hashTable *table = malloc(sizeof *table);
  if (table == NULL) {
    return NULL;
  }

  table->entries = calloc(size, sizeof *table->entries);
  /* calloc may legitimately return NULL for a zero-sized request. */
  if (table->entries == NULL && size != 0) {
    free(table);
    return NULL;
  }

  table->size = size;
  table->offsetBase = offsetBase;
  return table;
}
|
||||
|
||||
/**
 * Gives an existing table a fresh zero-initialized array of `size` entries
 * and records `size` as its size.
 *
 * The previous value of table->entries is overwritten without being freed,
 * and offsetBase is left untouched.
 */
void HASH_initializeTable(LDM_hashTable *table, U32 size) {
  LDM_hashEntry *const freshEntries = calloc(size, sizeof(LDM_hashEntry));

  table->size = size;
  table->entries = freshEntries;
}
|
||||
|
||||
LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) {
|
||||
return table->entries + hash;
|
||||
}
|
||||
|
||||
LDM_hashEntry *HASH_getEntryFromHash(
|
||||
const LDM_hashTable *table, const hash_t hash, const U32 checksum) {
|
||||
(void)checksum;
|
||||
return getBucket(table, hash);
|
||||
}
|
||||
|
||||
/**
 * Returns 1 if pMatch is an acceptable match for pIn and 0 otherwise.
 *
 * The match is rejected when pIn - pMatch exceeds maxWindowSize, or when any
 * of the first minMatchLength / 4 four-byte words differ. Bytes beyond the
 * last whole word (minMatchLength % 4) are not compared.
 */
static int isValidMatch(const BYTE *pIn, const BYTE *pMatch,
                        U32 minMatchLength, U32 maxWindowSize) {
  const U32 numWords = minMatchLength / 4;
  U32 word;

  if (pIn - pMatch > maxWindowSize) {
    return 0;
  }

  for (word = 0; word < numWords; word++) {
    const U32 byteOffset = 4 * word;
    if (MEM_read32(pIn + byteOffset) != MEM_read32(pMatch + byteOffset)) {
      return 0;
    }
  }
  return 1;
}
|
||||
|
||||
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
||||
const hash_t hash,
|
||||
const U32 checksum,
|
||||
const BYTE *pIn,
|
||||
const BYTE *pEnd,
|
||||
U32 minMatchLength,
|
||||
U32 maxWindowSize,
|
||||
U32 *matchLength) {
|
||||
LDM_hashEntry *entry = getBucket(table, hash);
|
||||
(void)checksum;
|
||||
(void)pEnd;
|
||||
(void)matchLength;
|
||||
// TODO: Count the entire forward match length rather than check if valid.
|
||||
if (isValidMatch(pIn, entry->offset + table->offsetBase,
|
||||
minMatchLength, maxWindowSize)) {
|
||||
|
||||
return entry;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
 * Multiplicative hash: multiplies `value` by the constant 2654435761 and
 * keeps the top LDM_HASHLOG bits of the 32-bit product.
 */
hash_t HASH_hashU32(U32 value) {
  const int discardedBits = 32 - LDM_HASHLOG;

  return (value * 2654435761U) >> discardedBits;
}
|
||||
|
||||
void HASH_insert(LDM_hashTable *table,
|
||||
const hash_t hash, const LDM_hashEntry entry) {
|
||||
*getBucket(table, hash) = entry;
|
||||
}
|
||||
|
||||
U32 HASH_getSize(const LDM_hashTable *table) {
|
||||
return table->size;
|
||||
}
|
||||
|
||||
/**
 * Frees the entry array and the table itself.
 *
 * Passing NULL is a no-op, mirroring free().
 */
void HASH_destroyTable(LDM_hashTable *table) {
  if (table == NULL) {
    return;
  }
  free(table->entries);
  free(table);
}
|
||||
|
||||
void HASH_outputTableOccupancy(const LDM_hashTable *hashTable) {
|
||||
U32 i = 0;
|
||||
U32 ctr = 0;
|
||||
for (; i < HASH_getSize(hashTable); i++) {
|
||||
if (getBucket(hashTable, i)->offset == 0) {
|
||||
ctr++;
|
||||
}
|
||||
}
|
||||
printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n",
|
||||
HASH_getSize(hashTable), ctr,
|
||||
100.0 * (double)(ctr) / (double)HASH_getSize(hashTable));
|
||||
}
|
|
@ -5,22 +5,19 @@
|
|||
#include "ldm_hashtable.h"
|
||||
#include "mem.h"
|
||||
|
||||
//TODO: move def somewhere else.
|
||||
|
||||
// Number of elements per hash bucket.
|
||||
// HASH_BUCKET_SIZE_LOG defined in ldm.h
|
||||
#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now
|
||||
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
|
||||
|
||||
// TODO: rename. Number of hash buckets.
|
||||
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG)
|
||||
#define ZSTD_SKIP
|
||||
//#define TMP_TST
|
||||
//#define ZSTD_SKIP
|
||||
|
||||
struct LDM_hashTable {
|
||||
U32 size; // Number of buckets
|
||||
U32 maxEntries; // Rename...
|
||||
LDM_hashEntry *entries; // 1-D array for now.
|
||||
U32 numBuckets;
|
||||
U32 numEntries;
|
||||
LDM_hashEntry *entries;
|
||||
BYTE *bucketOffsets; // Pointer to current insert position.
|
||||
|
||||
// Position corresponding to offset=0 in LDM_hashEntry.
|
||||
|
@ -32,8 +29,8 @@ struct LDM_hashTable {
|
|||
LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase,
|
||||
U32 minMatchLength, U32 maxWindowSize) {
|
||||
LDM_hashTable *table = malloc(sizeof(LDM_hashTable));
|
||||
table->size = size >> HASH_BUCKET_SIZE_LOG;
|
||||
table->maxEntries = size;
|
||||
table->numBuckets = size >> HASH_BUCKET_SIZE_LOG;
|
||||
table->numEntries = size;
|
||||
table->entries = calloc(size, sizeof(LDM_hashEntry));
|
||||
table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE));
|
||||
table->offsetBase = offsetBase;
|
||||
|
@ -46,7 +43,6 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) {
|
|||
return table->entries + (hash << HASH_BUCKET_SIZE_LOG);
|
||||
}
|
||||
|
||||
#if TMP_ZSTDTOGGLE
|
||||
static unsigned ZSTD_NbCommonBytes (register size_t val)
|
||||
{
|
||||
if (MEM_isLittleEndian()) {
|
||||
|
@ -159,26 +155,22 @@ U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor,
|
|||
return matchLength;
|
||||
}
|
||||
|
||||
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
||||
const hash_t hash,
|
||||
const U32 checksum,
|
||||
const BYTE *pIn,
|
||||
const BYTE *pEnd,
|
||||
U32 *matchLength,
|
||||
U32 *backwardsMatchLength,
|
||||
const BYTE *pAnchor) {
|
||||
LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table,
|
||||
const hash_t hash,
|
||||
const U32 checksum,
|
||||
const BYTE *pIn,
|
||||
const BYTE *pEnd,
|
||||
const BYTE *pAnchor,
|
||||
U32 *pForwardMatchLength,
|
||||
U32 *pBackwardMatchLength) {
|
||||
LDM_hashEntry *bucket = getBucket(table, hash);
|
||||
LDM_hashEntry *cur = bucket;
|
||||
LDM_hashEntry *bestEntry = NULL;
|
||||
U32 bestMatchLength = 0;
|
||||
U32 forwardMatch = 0;
|
||||
U32 backwardMatch = 0;
|
||||
#ifdef TMP_TST
|
||||
U32 numBetter = 0;
|
||||
#endif
|
||||
for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) {
|
||||
// Check checksum for faster check.
|
||||
const BYTE *pMatch = cur->offset + table->offsetBase;
|
||||
|
||||
// Check checksum for faster check.
|
||||
if (cur->checksum == checksum && pIn - pMatch <= table->maxWindowSize) {
|
||||
U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd);
|
||||
U32 backwardMatchLength, totalMatchLength;
|
||||
|
@ -193,105 +185,27 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
|||
|
||||
if (totalMatchLength >= bestMatchLength) {
|
||||
bestMatchLength = totalMatchLength;
|
||||
forwardMatch = forwardMatchLength;
|
||||
backwardMatch = backwardMatchLength;
|
||||
*pForwardMatchLength = forwardMatchLength;
|
||||
*pBackwardMatchLength = backwardMatchLength;
|
||||
|
||||
bestEntry = cur;
|
||||
#ifdef TMP_TST
|
||||
numBetter++;
|
||||
#endif
|
||||
|
||||
#ifdef ZSTD_SKIP
|
||||
*matchLength = forwardMatchLength;
|
||||
*backwardsMatchLength = backwardMatchLength;
|
||||
|
||||
return cur;
|
||||
#endif
|
||||
// *matchLength = forwardMatchLength;
|
||||
// return cur;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (bestEntry != NULL && bestMatchLength > table->minMatchLength) {
|
||||
#ifdef TMP_TST
|
||||
printf("Num better %u\n", numBetter - 1);
|
||||
#endif
|
||||
*matchLength = forwardMatch;
|
||||
*backwardsMatchLength = backwardMatch;
|
||||
if (bestEntry != NULL) {
|
||||
return bestEntry;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static int isValidMatch(const BYTE *pIn, const BYTE *pMatch,
|
||||
U32 minMatchLength, U32 maxWindowSize) {
|
||||
printf("HERE\n");
|
||||
U32 lengthLeft = minMatchLength;
|
||||
const BYTE *curIn = pIn;
|
||||
const BYTE *curMatch = pMatch;
|
||||
|
||||
if (pIn - pMatch > maxWindowSize) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (; lengthLeft >= 4; lengthLeft -= 4) {
|
||||
if (MEM_read32(curIn) != MEM_read32(curMatch)) {
|
||||
return 0;
|
||||
}
|
||||
curIn += 4;
|
||||
curMatch += 4;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
//TODO: clean up function call. This is not at all decoupled from LDM.
|
||||
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
||||
const hash_t hash,
|
||||
const U32 checksum,
|
||||
const BYTE *pIn,
|
||||
const BYTE *pEnd,
|
||||
U32 *matchLength,
|
||||
U32 *backwardsMatchLength,
|
||||
const BYTE *pAnchor) {
|
||||
LDM_hashEntry *bucket = getBucket(table, hash);
|
||||
LDM_hashEntry *cur = bucket;
|
||||
(void)matchLength;
|
||||
(void)backwardsMatchLength;
|
||||
(void)pAnchor; for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) {
|
||||
// Check checksum for faster check.
|
||||
const BYTE *pMatch = cur->offset + table->offsetBase;
|
||||
(void)pEnd;
|
||||
|
||||
if (cur->checksum == checksum &&
|
||||
isValidMatch(pIn, pMatch, table->minMatchLength, table->maxWindowSize)) {
|
||||
return cur;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#endif
|
||||
hash_t HASH_hashU32(U32 value) {
|
||||
return ((value * 2654435761U) >> (32 - LDM_HASHLOG));
|
||||
}
|
||||
|
||||
|
||||
LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table,
|
||||
const hash_t hash,
|
||||
const U32 checksum) {
|
||||
// Loop through bucket.
|
||||
// TODO: in order of recency???
|
||||
LDM_hashEntry *bucket = getBucket(table, hash);
|
||||
LDM_hashEntry *cur = bucket;
|
||||
for(; cur < bucket + HASH_BUCKET_SIZE; ++cur) {
|
||||
if (cur->checksum == checksum) {
|
||||
return cur;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void HASH_insert(LDM_hashTable *table,
|
||||
const hash_t hash, const LDM_hashEntry entry) {
|
||||
*(getBucket(table, hash) + table->bucketOffsets[hash]) = entry;
|
||||
|
@ -300,7 +214,7 @@ void HASH_insert(LDM_hashTable *table,
|
|||
}
|
||||
|
||||
U32 HASH_getSize(const LDM_hashTable *table) {
|
||||
return table->size;
|
||||
return table->numBuckets;
|
||||
}
|
||||
|
||||
void HASH_destroyTable(LDM_hashTable *table) {
|
||||
|
@ -312,15 +226,16 @@ void HASH_destroyTable(LDM_hashTable *table) {
|
|||
void HASH_outputTableOccupancy(const LDM_hashTable *table) {
|
||||
U32 ctr = 0;
|
||||
LDM_hashEntry *cur = table->entries;
|
||||
LDM_hashEntry *end = table->entries + (table->size * HASH_BUCKET_SIZE);
|
||||
LDM_hashEntry *end = table->entries + (table->numBuckets * HASH_BUCKET_SIZE);
|
||||
for (; cur < end; ++cur) {
|
||||
if (cur->offset == 0) {
|
||||
ctr++;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Num buckets, bucket size: %d, %d\n", table->size, HASH_BUCKET_SIZE);
|
||||
printf("Num buckets, bucket size: %d, %d\n",
|
||||
table->numBuckets, HASH_BUCKET_SIZE);
|
||||
printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n",
|
||||
table->maxEntries, ctr,
|
||||
100.0 * (double)(ctr) / table->maxEntries);
|
||||
table->numEntries, ctr,
|
||||
100.0 * (double)(ctr) / table->numEntries);
|
||||
}
|
||||
|
|
|
@ -14,7 +14,6 @@
|
|||
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - 4))
|
||||
#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1)
|
||||
|
||||
|
||||
#define ML_BITS 4
|
||||
#define ML_MASK ((1U<<ML_BITS)-1)
|
||||
#define RUN_BITS (8-ML_BITS)
|
||||
|
@ -24,7 +23,6 @@
|
|||
#define OUTPUT_CONFIGURATION
|
||||
#define CHECKSUM_CHAR_OFFSET 10
|
||||
|
||||
//#define HASH_CHECK
|
||||
//#define RUN_CHECKS
|
||||
//#define TMP_RECOMPUTE_LENGTHS
|
||||
|
||||
|
@ -135,7 +133,6 @@ void LDM_printCompressStats(const LDM_compressStats *stats) {
|
|||
int i = 0;
|
||||
printf("=====================\n");
|
||||
printf("Compression statistics\n");
|
||||
//TODO: compute percentage matched?
|
||||
printf("Window size, hash table size (bytes): 2^%u, 2^%u\n",
|
||||
stats->windowSizeLog, stats->hashTableSizeLog);
|
||||
printf("num matches, total match length, %% matched: %u, %llu, %.3f\n",
|
||||
|
@ -191,7 +188,6 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
|
|||
*/
|
||||
static hash_t checksumToHash(U32 sum) {
|
||||
return HASH_hashU32(sum);
|
||||
// return ((sum * 2654435761U) >> (32 - LDM_HASHLOG));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -455,22 +451,14 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
|
|||
if (cctx->ip > cctx->imatchLimit) {
|
||||
return 1;
|
||||
}
|
||||
#ifdef HASH_CHECK
|
||||
entry = HASH_getEntryFromHash(cctx->hashTable, h, sum);
|
||||
#else
|
||||
entry = HASH_getValidEntry(cctx->hashTable, h, sum,
|
||||
cctx->ip, cctx->iend,
|
||||
matchLength, backwardMatchLength,
|
||||
cctx->anchor);
|
||||
#endif
|
||||
|
||||
entry = HASH_getBestEntry(cctx->hashTable, h, sum,
|
||||
cctx->ip, cctx->iend,
|
||||
cctx->anchor,
|
||||
matchLength, backwardMatchLength);
|
||||
|
||||
if (entry != NULL) {
|
||||
*match = entry->offset + cctx->ibase;
|
||||
#ifdef HASH_CHECK
|
||||
if (!LDM_isValidMatch(cctx->ip, *match)) {
|
||||
entry = NULL;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
putHashOfCurrentPositionFromHash(cctx, h, sum);
|
||||
}
|
||||
|
@ -563,21 +551,8 @@ size_t LDM_compress(const void *src, size_t srcSize,
|
|||
cctx.stats.numMatches++;
|
||||
#endif
|
||||
|
||||
#if TMP_RECOMPUTE_LENGTHS
|
||||
backwardsMatchLength = 0;
|
||||
/**
|
||||
* Catch up: look back to extend the match backwards from the found match.
|
||||
*/
|
||||
while (cctx.ip > cctx.anchor && match > cctx.ibase &&
|
||||
cctx.ip[-1] == match[-1]) {
|
||||
cctx.ip--;
|
||||
match--;
|
||||
backwardsMatchLength++;
|
||||
}
|
||||
#else
|
||||
cctx.ip -= backwardsMatchLength;
|
||||
match -= backwardsMatchLength;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Write current block (literals, literal length, match offset, match
|
||||
|
@ -586,16 +561,9 @@ size_t LDM_compress(const void *src, size_t srcSize,
|
|||
{
|
||||
const U32 literalLength = cctx.ip - cctx.anchor;
|
||||
const U32 offset = cctx.ip - match;
|
||||
#if TMP_RECOMPUTE_LENGTHS
|
||||
const U32 matchLength = LDM_countMatchLength(
|
||||
cctx.ip + LDM_MIN_MATCH_LENGTH + backwardsMatchLength,
|
||||
match + LDM_MIN_MATCH_LENGTH + backwardsMatchLength,
|
||||
cctx.ihashLimit) + backwardsMatchLength;
|
||||
#else
|
||||
const U32 matchLength = forwardMatchLength +
|
||||
backwardsMatchLength -
|
||||
LDM_MIN_MATCH_LENGTH;
|
||||
#endif
|
||||
|
||||
LDM_outputBlock(&cctx, literalLength, offset, matchLength);
|
||||
|
||||
|
|
|
@ -11,21 +11,21 @@
|
|||
#define LDM_OFFSET_SIZE 4
|
||||
|
||||
// Defines the size of the hash table.
|
||||
// Note that this is not the number of buckets.
|
||||
// Currently this should be less than WINDOW_SIZE_LOG + 4?
|
||||
#define LDM_MEMORY_USAGE 23
|
||||
#define HASH_BUCKET_SIZE_LOG 3 // MAX is 4 for now
|
||||
|
||||
//#define LDM_LAG (1 << 20)
|
||||
#define LDM_LAG (0)
|
||||
// Defines the lag in inserting elements into the hash table.
|
||||
#define LDM_LAG 0
|
||||
|
||||
#define LDM_WINDOW_SIZE_LOG 28
|
||||
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
|
||||
|
||||
//These should be multiples of four (and perhaps set to the same value?).
|
||||
#define LDM_MIN_MATCH_LENGTH 1024
|
||||
#define LDM_HASH_LENGTH 1024
|
||||
#define LDM_MIN_MATCH_LENGTH 64
|
||||
#define LDM_HASH_LENGTH 64
|
||||
|
||||
#define TMP_ZSTDTOGGLE 1
|
||||
#define TMP_RECOMPUTE_LENGTHS (!(TMP_ZSTDTOGGLE))
|
||||
|
||||
typedef struct LDM_compressStats LDM_compressStats;
|
||||
typedef struct LDM_CCtx LDM_CCtx;
|
||||
|
|
|
@ -14,37 +14,17 @@ typedef struct LDM_hashEntry {
|
|||
|
||||
typedef struct LDM_hashTable LDM_hashTable;
|
||||
|
||||
/**
|
||||
* Create a hash table with size hash buckets.
|
||||
* LDM_hashEntry.offset is added to offsetBase to calculate pMatch in
|
||||
* HASH_getValidEntry.
|
||||
*/
|
||||
LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase,
|
||||
U32 minMatchLength, U32 maxWindowSize);
|
||||
|
||||
/**
|
||||
* Returns an LDM_hashEntry from the table that matches the checksum.
|
||||
* Returns NULL if one does not exist.
|
||||
*/
|
||||
LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table,
|
||||
const hash_t hash,
|
||||
const U32 checksum);
|
||||
|
||||
/**
|
||||
* Gets a valid entry that matches the checksum. A valid entry is defined by
|
||||
* *isValid.
|
||||
*
|
||||
* The function finds an entry matching the checksum, computes pMatch as
|
||||
* offset + table.offsetBase, and calls isValid.
|
||||
*/
|
||||
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
|
||||
const hash_t hash,
|
||||
const U32 checksum,
|
||||
const BYTE *pIn,
|
||||
const BYTE *pEnd,
|
||||
U32 *matchLength,
|
||||
U32 *backwardsMatchLength,
|
||||
const BYTE *pAnchor);
|
||||
LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table,
|
||||
const hash_t hash,
|
||||
const U32 checksum,
|
||||
const BYTE *pIn,
|
||||
const BYTE *pEnd,
|
||||
const BYTE *pAnchor,
|
||||
U32 *matchLength,
|
||||
U32 *backwardsMatchLength);
|
||||
|
||||
hash_t HASH_hashU32(U32 value);
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ldm.h"
|
||||
|
||||
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
|
||||
//#define LDM_HASH_ENTRY_SIZE 4
|
||||
#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
|
||||
|
@ -14,7 +16,6 @@
|
|||
#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1)
|
||||
|
||||
/* Hash table stuff. */
|
||||
#define HASH_BUCKET_SIZE_LOG 3 // MAX is 4 for now
|
||||
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
|
||||
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG)
|
||||
|
||||
|
@ -32,18 +33,15 @@
|
|||
|
||||
//#define RUN_CHECKS
|
||||
|
||||
#include "ldm.h"
|
||||
|
||||
/* Hash table stuff */
|
||||
|
||||
typedef U32 hash_t;
|
||||
|
||||
typedef struct LDM_hashEntry {
|
||||
U32 offset; // TODO: Replace with pointer?
|
||||
U32 offset;
|
||||
U32 checksum;
|
||||
} LDM_hashEntry;
|
||||
|
||||
// TODO: Scanning speed
|
||||
// TODO: Memory usage
|
||||
struct LDM_compressStats {
|
||||
U32 windowSizeLog, hashTableSizeLog;
|
||||
|
@ -110,18 +108,22 @@ struct LDM_CCtx {
|
|||
};
|
||||
|
||||
struct LDM_hashTable {
|
||||
U32 size; // Number of buckets
|
||||
U32 maxEntries; // Rename...
|
||||
LDM_hashEntry *entries; // 1-D array for now.
|
||||
U32 numBuckets; // Number of buckets
|
||||
U32 numEntries; // Rename...
|
||||
LDM_hashEntry *entries;
|
||||
|
||||
BYTE *bucketOffsets;
|
||||
// Position corresponding to offset=0 in LDM_hashEntry.
|
||||
};
|
||||
|
||||
/**
|
||||
* Create a hash table that can contain size elements.
|
||||
* The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG.
|
||||
*/
|
||||
LDM_hashTable *HASH_createTable(U32 size) {
|
||||
LDM_hashTable *table = malloc(sizeof(LDM_hashTable));
|
||||
table->size = size >> HASH_BUCKET_SIZE_LOG;
|
||||
table->maxEntries = size;
|
||||
table->numBuckets = size >> HASH_BUCKET_SIZE_LOG;
|
||||
table->numEntries = size;
|
||||
table->entries = calloc(size, sizeof(LDM_hashEntry));
|
||||
table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE));
|
||||
return table;
|
||||
|
@ -131,10 +133,7 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) {
|
|||
return table->entries + (hash << HASH_BUCKET_SIZE_LOG);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static unsigned ZSTD_NbCommonBytes (register size_t val)
|
||||
{
|
||||
static unsigned ZSTD_NbCommonBytes (register size_t val) {
|
||||
if (MEM_isLittleEndian()) {
|
||||
if (MEM_64bits()) {
|
||||
# if defined(_MSC_VER) && defined(_WIN64)
|
||||
|
@ -234,6 +233,11 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch,
|
|||
return (size_t)(pIn - pStart);
|
||||
}
|
||||
|
||||
/**
|
||||
* Count number of bytes that match backwards before pIn and pMatch.
|
||||
*
|
||||
* We count only bytes where pMatch > pBase and pIn > pAnchor.
|
||||
*/
|
||||
U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor,
|
||||
const BYTE *pMatch, const BYTE *pBase) {
|
||||
U32 matchLength = 0;
|
||||
|
@ -245,20 +249,32 @@ U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor,
|
|||
return matchLength;
|
||||
}
|
||||
|
||||
LDM_hashEntry *HASH_getValidEntry(const LDM_CCtx *cctx,
|
||||
const hash_t hash,
|
||||
const U32 checksum,
|
||||
U32 *matchLength,
|
||||
U32 *backwardsMatchLength) {
|
||||
/**
|
||||
* Returns a pointer to the entry in the hash table matching the hash and
|
||||
* checksum with the "longest match length" as defined below. The forward and
|
||||
* backward match lengths are written to *pForwardMatchLength and
|
||||
* *pBackwardMatchLength.
|
||||
*
|
||||
* The match length is defined based on cctx->ip and the entry's offset.
|
||||
* The forward match is computed from cctx->ip and entry->offset + cctx->ibase.
|
||||
* The backward match is computed backwards from cctx->ip and
|
||||
* cctx->ibase only if the forward match is longer than LDM_MIN_MATCH_LENGTH.
|
||||
*
|
||||
*/
|
||||
LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
|
||||
const hash_t hash,
|
||||
const U32 checksum,
|
||||
U32 *pForwardMatchLength,
|
||||
U32 *pBackwardMatchLength) {
|
||||
LDM_hashTable *table = cctx->hashTable;
|
||||
LDM_hashEntry *bucket = getBucket(table, hash);
|
||||
LDM_hashEntry *cur = bucket;
|
||||
LDM_hashEntry *bestEntry = NULL;
|
||||
U32 bestMatchLength = 0;
|
||||
for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) {
|
||||
// Check checksum for faster check.
|
||||
const BYTE *pMatch = cur->offset + cctx->ibase;
|
||||
|
||||
// Check checksum for faster check.
|
||||
if (cur->checksum == checksum &&
|
||||
cctx->ip - pMatch <= LDM_WINDOW_SIZE) {
|
||||
U32 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend);
|
||||
|
@ -279,8 +295,8 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_CCtx *cctx,
|
|||
if (totalMatchLength >= bestMatchLength &&
|
||||
totalMatchLength >= LDM_MIN_MATCH_LENGTH) {
|
||||
bestMatchLength = totalMatchLength;
|
||||
*matchLength = forwardMatchLength;
|
||||
*backwardsMatchLength = backwardMatchLength;
|
||||
*pForwardMatchLength = forwardMatchLength;
|
||||
*pBackwardMatchLength = backwardMatchLength;
|
||||
|
||||
bestEntry = cur;
|
||||
#ifdef ZSTD_SKIP
|
||||
|
@ -303,7 +319,7 @@ void HASH_insert(LDM_hashTable *table,
|
|||
}
|
||||
|
||||
U32 HASH_getSize(const LDM_hashTable *table) {
|
||||
return table->size;
|
||||
return table->numBuckets;
|
||||
}
|
||||
|
||||
void HASH_destroyTable(LDM_hashTable *table) {
|
||||
|
@ -315,20 +331,20 @@ void HASH_destroyTable(LDM_hashTable *table) {
|
|||
void HASH_outputTableOccupancy(const LDM_hashTable *table) {
|
||||
U32 ctr = 0;
|
||||
LDM_hashEntry *cur = table->entries;
|
||||
LDM_hashEntry *end = table->entries + (table->size * HASH_BUCKET_SIZE);
|
||||
LDM_hashEntry *end = table->entries + (table->numBuckets * HASH_BUCKET_SIZE);
|
||||
for (; cur < end; ++cur) {
|
||||
if (cur->offset == 0) {
|
||||
ctr++;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Num buckets, bucket size: %d, %d\n", table->size, HASH_BUCKET_SIZE);
|
||||
printf("Num buckets, bucket size: %d, %d\n",
|
||||
table->numBuckets, HASH_BUCKET_SIZE);
|
||||
printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n",
|
||||
table->maxEntries, ctr,
|
||||
100.0 * (double)(ctr) / table->maxEntries);
|
||||
table->numEntries, ctr,
|
||||
100.0 * (double)(ctr) / table->numEntries);
|
||||
}
|
||||
|
||||
|
||||
// TODO: This can be done more efficiently (but it is not that important as it
|
||||
// is only used for computing stats).
|
||||
static int intLog2(U32 x) {
|
||||
|
@ -339,7 +355,7 @@ static int intLog2(U32 x) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
// TODO: Maybe we would eventually prefer to have linear rather than
|
||||
// Maybe we would eventually prefer to have linear rather than
|
||||
// exponential buckets.
|
||||
/**
|
||||
void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) {
|
||||
|
@ -369,7 +385,6 @@ void LDM_printCompressStats(const LDM_compressStats *stats) {
|
|||
int i = 0;
|
||||
printf("=====================\n");
|
||||
printf("Compression statistics\n");
|
||||
//TODO: compute percentage matched?
|
||||
printf("Window size, hash table size (bytes): 2^%u, 2^%u\n",
|
||||
stats->windowSizeLog, stats->hashTableSizeLog);
|
||||
printf("num matches, total match length, %% matched: %u, %llu, %.3f\n",
|
||||
|
@ -429,7 +444,6 @@ hash_t HASH_hashU32(U32 value) {
|
|||
*/
|
||||
static hash_t checksumToHash(U32 sum) {
|
||||
return HASH_hashU32(sum);
|
||||
// return ((sum * 2654435761U) >> (32 - LDM_HASHLOG));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -672,10 +686,10 @@ void LDM_destroyCCtx(LDM_CCtx *cctx) {
|
|||
* Returns 0 if successful and 1 otherwise (i.e. no match can be found
|
||||
* in the remaining input that is long enough).
|
||||
*
|
||||
* matchLength contains the forward length of the match.
|
||||
* forwardMatchLength contains the forward length of the match.
|
||||
*/
|
||||
static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
|
||||
U32 *matchLength, U32 *backwardMatchLength) {
|
||||
U32 *forwardMatchLength, U32 *backwardMatchLength) {
|
||||
|
||||
LDM_hashEntry *entry = NULL;
|
||||
cctx->nextIp = cctx->ip + cctx->step;
|
||||
|
@ -693,8 +707,8 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
|
|||
return 1;
|
||||
}
|
||||
|
||||
entry = HASH_getValidEntry(cctx, h, sum,
|
||||
matchLength, backwardMatchLength);
|
||||
entry = HASH_getBestEntry(cctx, h, sum,
|
||||
forwardMatchLength, backwardMatchLength);
|
||||
|
||||
if (entry != NULL) {
|
||||
*match = entry->offset + cctx->ibase;
|
||||
|
|
|
@ -29,6 +29,7 @@ static int compress(const char *fname, const char *oname) {
|
|||
size_t maxCompressedSize, compressedSize;
|
||||
|
||||
struct timeval tv1, tv2;
|
||||
double timeTaken;
|
||||
|
||||
/* Open the input file. */
|
||||
if ((fdin = open(fname, O_RDONLY)) < 0) {
|
||||
|
@ -53,18 +54,7 @@ static int compress(const char *fname, const char *oname) {
|
|||
// The compress function should check before writing or buffer writes.
|
||||
maxCompressedSize += statbuf.st_size / 255;
|
||||
|
||||
/* Go to the location corresponding to the last byte. */
|
||||
/* TODO: fallocate? */
|
||||
if (lseek(fdout, maxCompressedSize - 1, SEEK_SET) == -1) {
|
||||
perror("lseek error");
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Write a dummy byte at the last location. */
|
||||
if (write(fdout, "", 1) != 1) {
|
||||
perror("write error");
|
||||
return 1;
|
||||
}
|
||||
ftruncate(fdout, maxCompressedSize);
|
||||
|
||||
/* mmap the input file. */
|
||||
if ((src = mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, fdin, 0))
|
||||
|
@ -103,12 +93,12 @@ static int compress(const char *fname, const char *oname) {
|
|||
(unsigned)statbuf.st_size, (unsigned)compressedSize, oname,
|
||||
(double)compressedSize / (statbuf.st_size) * 100);
|
||||
|
||||
timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 +
|
||||
(double) (tv2.tv_sec - tv1.tv_sec),
|
||||
|
||||
printf("Total compress time = %.3f seconds, Average compression speed: %.3f MB/s\n",
|
||||
(double) (tv2.tv_usec - tv1.tv_usec) / 1000000 +
|
||||
(double) (tv2.tv_sec - tv1.tv_sec),
|
||||
((double)statbuf.st_size / (double) (1 << 20)) /
|
||||
((double) (tv2.tv_usec - tv1.tv_usec) / 1000000 +
|
||||
(double) (tv2.tv_sec - tv1.tv_sec)));
|
||||
timeTaken,
|
||||
((double)statbuf.st_size / (double) (1 << 20)) / timeTaken);
|
||||
|
||||
|
||||
// Close files.
|
||||
|
@ -156,17 +146,7 @@ static int decompress(const char *fname, const char *oname) {
|
|||
/* Read the header. */
|
||||
LDM_readHeader(src, &compressedSize, &decompressedSize);
|
||||
|
||||
/* Go to the location corresponding to the last byte. */
|
||||
if (lseek(fdout, decompressedSize - 1, SEEK_SET) == -1) {
|
||||
perror("lseek error");
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* write a dummy byte at the last location */
|
||||
if (write(fdout, "", 1) != 1) {
|
||||
perror("write error");
|
||||
return 1;
|
||||
}
|
||||
ftruncate(fdout, decompressedSize);
|
||||
|
||||
/* mmap the output file */
|
||||
if ((dst = mmap(0, decompressedSize, PROT_READ | PROT_WRITE,
|
||||
|
|
Loading…
Reference in New Issue