Experiment with eviction policies and minor code cleanup

This commit is contained in:
Stella Lau 2017-07-24 12:05:43 -07:00
parent eb16da647d
commit 8ed9220102
8 changed files with 139 additions and 169 deletions

View File

@ -27,20 +27,17 @@ default: all
all: main-circular-buffer main-integrated main-64 all: main-circular-buffer main-integrated main-64
#main-basic : basic_table.c ldm.c main-ldm.c
# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
main-64: ldm_64_hash.c main-ldm.c main-64: ldm_64_hash.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
main-integrated: ldm_with_table.c main-ldm.c main-integrated: ldm_integrated.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
clean: clean:
@rm -f core *.o tmp* result* *.ldm *.ldm.dec \ @rm -f core *.o tmp* result* *.ldm *.ldm.dec \
main-circular-buffer main-integrated main-64 main-circular-buffer main-64 main-integrated
@echo Cleaning completed @echo Cleaning completed

View File

@ -5,26 +5,24 @@
#include "ldm_hashtable.h" #include "ldm_hashtable.h"
#include "mem.h" #include "mem.h"
// Number of elements per hash bucket. // The number of elements per hash bucket.
// HASH_BUCKET_SIZE_LOG defined in ldm.h // HASH_BUCKET_SIZE_LOG is defined in ldm.h.
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
// The number of hash buckets.
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG))
// If ZSTD_SKIP is defined, then the first entry is returned in HASH_getBestEntry
// (without looking at other entries in the bucket).
// TODO: rename. Number of hash buckets.
// TODO: Link to HASH_ENTRY_SIZE_LOG
//#define ZSTD_SKIP //#define ZSTD_SKIP
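For reference, the table geometry these macros produce can be worked out directly; a minimal standalone sketch, using the constant values set elsewhere in this commit (LDM_MEMORY_USAGE = 23, LDM_HASH_ENTRY_SIZE_LOG = 3, HASH_BUCKET_SIZE_LOG = 0):

#include <stdio.h>

/* Constant values as set in ldm.h and ldm_hashtable.h in this commit. */
#define LDM_MEMORY_USAGE        23  /* log2 of the table size in bytes */
#define LDM_HASH_ENTRY_SIZE_LOG 3   /* each entry is 8 bytes: offset + checksum */
#define HASH_BUCKET_SIZE_LOG    0   /* one entry per bucket */

#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG))

int main(void) {
  printf("table bytes: %u\n", 1u << LDM_MEMORY_USAGE);                 /* 8388608 */
  printf("buckets    : %u\n", 1u << LDM_HASHLOG);                      /* 1048576 */
  printf("entries    : %u\n", (1u << LDM_HASHLOG) * HASH_BUCKET_SIZE); /* 1048576 */
  return 0;
}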
struct LDM_hashTable { struct LDM_hashTable {
U32 numBuckets; U32 numBuckets; // The number of buckets.
U32 numEntries; U32 numEntries; // numBuckets * HASH_BUCKET_SIZE.
LDM_hashEntry *entries; LDM_hashEntry *entries;
BYTE *bucketOffsets; // Pointer to current insert position. BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position.
// Position corresponding to offset=0 in LDM_hashEntry. const BYTE *offsetBase; // Corresponds to offset=0 in LDM_hashEntry.
const BYTE *offsetBase;
U32 minMatchLength; U32 minMatchLength;
U32 maxWindowSize; U32 maxWindowSize;
}; };
@ -46,6 +44,7 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) {
return table->entries + (hash << HASH_BUCKET_SIZE_LOG); return table->entries + (hash << HASH_BUCKET_SIZE_LOG);
} }
// From lib/compress/zstd_compress.c
static unsigned ZSTD_NbCommonBytes (register size_t val) static unsigned ZSTD_NbCommonBytes (register size_t val)
{ {
if (MEM_isLittleEndian()) { if (MEM_isLittleEndian()) {
@ -114,7 +113,11 @@ static unsigned ZSTD_NbCommonBytes (register size_t val)
} } } }
} }
// From lib/compress/zstd_compress.c /**
* From lib/compress/zstd_compress.c
* Returns the number of bytes (consecutively) in common between pIn and pMatch
* up to pInLimit.
*/
static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch,
const BYTE *const pInLimit) { const BYTE *const pInLimit) {
const BYTE * const pStart = pIn; const BYTE * const pStart = pIn;
@ -147,9 +150,14 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch,
return (size_t)(pIn - pStart); return (size_t)(pIn - pStart);
} }
U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, /**
* Returns the number of bytes in common between pIn and pMatch,
* counting backwards, with pIn having a lower limit of pAnchor and
* pMatch having a lower limit of pBase.
*/
static size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor,
const BYTE *pMatch, const BYTE *pBase) { const BYTE *pMatch, const BYTE *pBase) {
U32 matchLength = 0; size_t matchLength = 0;
while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) {
pIn--; pIn--;
pMatch--; pMatch--;
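Together, ZSTD_count and countBackwardsMatch measure how far a candidate match extends in each direction around the current position. A small standalone sketch of the same idea (byte-by-byte only, without the word-at-a-time fast path), using the "aaabbbb" example from the ldm_hashtable.h comments:

#include <stdio.h>

typedef unsigned char BYTE;

/* Count matching bytes forward from pIn/pMatch, stopping at pInLimit. */
static size_t countForward(const BYTE *pIn, const BYTE *pMatch,
                           const BYTE *pInLimit) {
  size_t n = 0;
  while (pIn + n < pInLimit && pIn[n] == pMatch[n]) n++;
  return n;
}

/* Count matching bytes backwards, bounded below by pAnchor and pBase. */
static size_t countBackwards(const BYTE *pIn, const BYTE *pAnchor,
                             const BYTE *pMatch, const BYTE *pBase) {
  size_t n = 0;
  while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) {
    pIn--; pMatch--; n++;
  }
  return n;
}

int main(void) {
  const BYTE in[]    = "aaabbbb";
  const BYTE match[] = "aaabbbb";
  /* Both positions start at the first 'b' (index 3). */
  printf("forward : %zu\n", countForward(in + 3, match + 3, in + 7));      /* 4 */
  printf("backward: %zu\n", countBackwards(in + 3, in, match + 3, match)); /* 3 */
  return 0;
}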
@ -178,6 +186,8 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table,
U64 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); U64 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd);
U64 backwardMatchLength, totalMatchLength; U64 backwardMatchLength, totalMatchLength;
// Only take matches where the forwardMatchLength is large enough
// for speed.
if (forwardMatchLength < table->minMatchLength) { if (forwardMatchLength < table->minMatchLength) {
continue; continue;
} }
@ -212,6 +222,7 @@ hash_t HASH_hashU32(U32 value) {
void HASH_insert(LDM_hashTable *table, void HASH_insert(LDM_hashTable *table,
const hash_t hash, const LDM_hashEntry entry) { const hash_t hash, const LDM_hashEntry entry) {
// Circular buffer.
*(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry;
table->bucketOffsets[hash]++; table->bucketOffsets[hash]++;
table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1;
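This is where the eviction policy lives: each bucket is treated as a small circular buffer, with bucketOffsets[hash] as the insert cursor, so once a bucket is full every insert silently overwrites the oldest entry. A minimal standalone sketch of that behavior (bucket size of 4 and the names here are placeholders for illustration):

#include <stdio.h>

#define BUCKET_SIZE 4              /* stand-in for HASH_BUCKET_SIZE (power of two) */

typedef struct { unsigned offset, checksum; } Entry;

static Entry bucket[BUCKET_SIZE];
static unsigned char cursor = 0;   /* stand-in for bucketOffsets[hash] */

static void insert(Entry e) {
  bucket[cursor] = e;                         /* overwrite whatever was there */
  cursor = (cursor + 1) & (BUCKET_SIZE - 1);  /* wrap via mask */
}

int main(void) {
  for (unsigned i = 1; i <= 6; i++) {
    Entry e = { i * 100, i };
    insert(e);                     /* entries 1 and 2 get evicted by 5 and 6 */
  }
  for (unsigned i = 0; i < BUCKET_SIZE; i++)
    printf("slot %u: offset=%u\n", i, bucket[i].offset);
  return 0;
}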

View File

@ -29,7 +29,6 @@
typedef U32 checksum_t; typedef U32 checksum_t;
// TODO: Scanning speed
// TODO: Memory usage // TODO: Memory usage
struct LDM_compressStats { struct LDM_compressStats {
U32 windowSizeLog, hashTableSizeLog; U32 windowSizeLog, hashTableSizeLog;
@ -40,9 +39,6 @@ struct LDM_compressStats {
U32 minOffset, maxOffset; U32 minOffset, maxOffset;
U32 numCollisions;
U32 numHashInserts;
U32 offsetHistogram[32]; U32 offsetHistogram[32];
}; };
@ -80,15 +76,12 @@ struct LDM_CCtx {
hash_t nextHash; /* Hash corresponding to nextPosHashed */ hash_t nextHash; /* Hash corresponding to nextPosHashed */
checksum_t nextSum; checksum_t nextSum;
unsigned step; // ip step, should be 1. unsigned step; // ip step, should be 1.
const BYTE *lagIp; const BYTE *lagIp;
hash_t lagHash; hash_t lagHash;
checksum_t lagSum; checksum_t lagSum;
U64 numHashInserts;
// DEBUG // DEBUG
const BYTE *DEBUG_setNextHash; const BYTE *DEBUG_setNextHash;
}; };
@ -103,32 +96,6 @@ static int intLog2(U32 x) {
return ret; return ret;
} }
// TODO: Maybe we would eventually prefer to have linear rather than
// exponential buckets.
/**
void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) {
U32 i = 0;
int buckets[32] = { 0 };
printf("\n");
printf("Hash table histogram\n");
for (; i < HASH_getSize(cctx->hashTable); i++) {
int offset = (cctx->ip - cctx->ibase) -
HASH_getEntryFromHash(cctx->hashTable, i)->offset;
buckets[intLog2(offset)]++;
}
i = 0;
for (; i < 32; i++) {
printf("2^%*d: %10u %6.3f%%\n", 2, i,
buckets[i],
100.0 * (double) buckets[i] /
(double) HASH_getSize(cctx->hashTable));
}
printf("\n");
}
*/
void LDM_printCompressStats(const LDM_compressStats *stats) { void LDM_printCompressStats(const LDM_compressStats *stats) {
int i = 0; int i = 0;
printf("=====================\n"); printf("=====================\n");
@ -163,7 +130,8 @@ void LDM_printCompressStats(const LDM_compressStats *stats) {
printf("=====================\n"); printf("=====================\n");
} }
int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { /*
static int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
U32 lengthLeft = LDM_MIN_MATCH_LENGTH; U32 lengthLeft = LDM_MIN_MATCH_LENGTH;
const BYTE *curIn = pIn; const BYTE *curIn = pIn;
const BYTE *curMatch = pMatch; const BYTE *curMatch = pMatch;
@ -181,6 +149,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
} }
return 1; return 1;
} }
*/
/** /**
* Convert a sum computed from getChecksum to a hash value in the range * Convert a sum computed from getChecksum to a hash value in the range
@ -253,7 +222,6 @@ static void setNextHash(LDM_CCtx *cctx) {
cctx->DEBUG_setNextHash = cctx->nextIp; cctx->DEBUG_setNextHash = cctx->nextIp;
#endif #endif
// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH);
cctx->nextSum = updateChecksum( cctx->nextSum = updateChecksum(
cctx->lastSum, LDM_HASH_LENGTH, cctx->lastSum, LDM_HASH_LENGTH,
cctx->lastPosHashed[0], cctx->lastPosHashed[0],
@ -292,7 +260,7 @@ static void putHashOfCurrentPositionFromHash(
// Note: this works only when cctx->step is 1. // Note: this works only when cctx->step is 1.
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
#if LDM_LAG #if LDM_LAG
// TODO: off by 1, but whatever // Off by 1, but whatever
if (cctx->lagIp - cctx->ibase > 0) { if (cctx->lagIp - cctx->ibase > 0) {
const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum };
HASH_insert(cctx->hashTable, cctx->lagHash, entry); HASH_insert(cctx->hashTable, cctx->lagHash, entry);
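The gating test above, ((ip - ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY, only selects every Nth position if HASH_ONLY_EVERY is one less than a power of two, which the mask comparison implies (the macro definition itself is not shown in this diff). A small illustration of that selection, with an assumed HASH_ONLY_EVERY_LOG of 2:

#include <stdio.h>

/* Assumed definition (implied by the mask test above, not shown in the diff). */
#define HASH_ONLY_EVERY_LOG 2
#define HASH_ONLY_EVERY ((1 << (HASH_ONLY_EVERY_LOG)) - 1)

int main(void) {
  /* Positions whose low bits are all ones get hashed: every 4th position here. */
  for (unsigned pos = 0; pos < 12; pos++) {
    if ((pos & HASH_ONLY_EVERY) == HASH_ONLY_EVERY)
      printf("hash position %u\n", pos);   /* prints 3, 7, 11 */
  }
  return 0;
}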
@ -344,6 +312,7 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) {
putHashOfCurrentPositionFromHash(cctx, hash, sum); putHashOfCurrentPositionFromHash(cctx, hash, sum);
} }
/*
U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch,
const BYTE *pInLimit) { const BYTE *pInLimit) {
const BYTE * const pStart = pIn; const BYTE * const pStart = pIn;
@ -358,6 +327,7 @@ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch,
} }
return (U32)(pIn - pStart); return (U32)(pIn - pStart);
} }
*/
void LDM_outputConfiguration(void) { void LDM_outputConfiguration(void) {
printf("=====================\n"); printf("=====================\n");
@ -380,6 +350,12 @@ void LDM_readHeader(const void *src, U64 *compressedSize,
// ip += sizeof(U64); // ip += sizeof(U64);
} }
void LDM_writeHeader(void *memPtr, U64 compressedSize,
U64 decompressedSize) {
MEM_write64(memPtr, compressedSize);
MEM_write64((BYTE *)memPtr + 8, decompressedSize);
}
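LDM_readHeader, whose body is truncated by the hunk above, presumably mirrors this with MEM_read64; a sketch of that read side under that assumption (the _sketch suffix is illustrative, not part of the source):

#include "mem.h"   /* BYTE, U64, MEM_read64 */

/* Assumed mirror of LDM_writeHeader; the real LDM_readHeader body is not
 * fully visible in this diff. */
static void LDM_readHeader_sketch(const void *src, U64 *compressedSize,
                                  U64 *decompressedSize) {
  const BYTE *ip = (const BYTE *)src;
  *compressedSize   = MEM_read64(ip);       /* bytes 0..7 of the header  */
  *decompressedSize = MEM_read64(ip + 8);   /* bytes 8..15 of the header */
}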
void LDM_initializeCCtx(LDM_CCtx *cctx, void LDM_initializeCCtx(LDM_CCtx *cctx,
const void *src, size_t srcSize, const void *src, size_t srcSize,
void *dst, size_t maxDstSize) { void *dst, size_t maxDstSize) {
@ -592,8 +568,6 @@ size_t LDM_compress(const void *src, size_t srcSize,
LDM_updateLastHashFromNextHash(&cctx); LDM_updateLastHashFromNextHash(&cctx);
} }
// HASH_outputTableOffsetHistogram(&cctx);
/* Encode the last literals (no more matches). */ /* Encode the last literals (no more matches). */
{ {
const U32 lastRun = cctx.iend - cctx.anchor; const U32 lastRun = cctx.iend - cctx.anchor;
@ -692,7 +666,6 @@ size_t LDM_decompress(const void *src, size_t compressedSize,
return dctx.op - (BYTE *)dst; return dctx.op - (BYTE *)dst;
} }
// TODO: implement and test hash function
void LDM_test(const BYTE *src) { void LDM_test(const BYTE *src) {
(void)src; (void)src;
} }

View File

@ -1,20 +1,24 @@
#ifndef LDM_H #ifndef LDM_H
#define LDM_H #define LDM_H
#include <stddef.h> /* size_t */
#include "mem.h" // from /lib/common/mem.h #include "mem.h" // from /lib/common/mem.h
#define LDM_COMPRESS_SIZE 8 // The number of bytes storing the compressed and decompressed size
#define LDM_DECOMPRESS_SIZE 8 // in the header.
#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) #define LDM_COMPRESSED_SIZE 8
#define LDM_DECOMPRESSED_SIZE 8
#define LDM_HEADER_SIZE ((LDM_COMPRESSED_SIZE)+(LDM_DECOMPRESSED_SIZE))
// The number of bytes storing the offset.
#define LDM_OFFSET_SIZE 4 #define LDM_OFFSET_SIZE 4
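Taken together, these constants describe the output frame written by main-ldm.c: a 16-byte size header followed by the compressed stream. A comment-style sketch of that layout:

/*
 * Output frame implied by these constants (sketch):
 *
 *   bytes 0..7    compressedSize   (U64, written by LDM_writeHeader)
 *   bytes 8..15   decompressedSize (U64)
 *   bytes 16..    LDM-compressed data; match offsets occupy
 *                 LDM_OFFSET_SIZE = 4 bytes each
 */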
// Defines the size of the hash table. // Defines the size of the hash table.
// Note that this is not the number of buckets. // Note that this is not the number of buckets.
// Currently this should be less than WINDOW_SIZE_LOG + 4? // Currently this should be less than WINDOW_SIZE_LOG + 4?
#define LDM_MEMORY_USAGE 24 #define LDM_MEMORY_USAGE 23
#define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now
// The number of entries in a hash bucket.
#define HASH_BUCKET_SIZE_LOG 0 // The maximum is 4 for now.
// Defines the lag in inserting elements into the hash table. // Defines the lag in inserting elements into the hash table.
#define LDM_LAG 0 #define LDM_LAG 0
@ -23,11 +27,10 @@
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
//These should be multiples of four (and perhaps set to the same value?). //These should be multiples of four (and perhaps set to the same value?).
#define LDM_MIN_MATCH_LENGTH 64 #define LDM_MIN_MATCH_LENGTH 16
#define LDM_HASH_LENGTH 64 #define LDM_HASH_LENGTH 16
// Experimental. // Experimental.
//:w
//#define TMP_EVICTION //#define TMP_EVICTION
#define TMP_TAG_INSERT #define TMP_TAG_INSERT
//#define TMP_SIMPLE_LOWER //#define TMP_SIMPLE_LOWER
@ -37,7 +40,6 @@ typedef struct LDM_compressStats LDM_compressStats;
typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_CCtx LDM_CCtx;
typedef struct LDM_DCtx LDM_DCtx; typedef struct LDM_DCtx LDM_DCtx;
/** /**
* Compresses src into dst. * Compresses src into dst.
* *
@ -94,17 +96,6 @@ void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx);
* Outputs compression statistics to stdout. * Outputs compression statistics to stdout.
*/ */
void LDM_printCompressStats(const LDM_compressStats *stats); void LDM_printCompressStats(const LDM_compressStats *stats);
/**
* Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the
* LDM_MIN_MATCH_LENGTH bytes from match and also if
* pIn - pMatch <= LDM_WINDOW_SIZE.
*
* This assumes LDM_MIN_MATCH_LENGTH is a multiple of four.
*
* Return 1 if valid, 0 otherwise.
*/
int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch);
/** /**
* Encode the literal length followed by the literals. * Encode the literal length followed by the literals.
@ -150,6 +141,15 @@ void LDM_initializeDCtx(LDM_DCtx *dctx,
void LDM_readHeader(const void *src, U64 *compressedSize, void LDM_readHeader(const void *src, U64 *compressedSize,
U64 *decompressedSize); U64 *decompressedSize);
/**
* Write the compressed and decompressed size.
*/
void LDM_writeHeader(void *memPtr, U64 compressedSize,
U64 decompressedSize);
/**
* Output the configuration used.
*/
void LDM_outputConfiguration(void); void LDM_outputConfiguration(void);
void LDM_test(const BYTE *src); void LDM_test(const BYTE *src);
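For orientation, a minimal sketch of how a caller strings these declarations together, modeled on the compress() path in main-ldm.c further down (error handling and worst-case output sizing omitted; compressWithHeader is an illustrative name, not part of the source):

#include "ldm.h"

static size_t compressWithHeader(const void *src, size_t srcSize,
                                 void *dst, size_t dstCapacity) {
  /* Reserve the first LDM_HEADER_SIZE bytes for the header and
   * compress the payload after it. */
  size_t const bodySize = LDM_compress(src, srcSize,
                                       (BYTE *)dst + LDM_HEADER_SIZE,
                                       dstCapacity - LDM_HEADER_SIZE);
  size_t const total = LDM_HEADER_SIZE + bodySize;
  /* Record both sizes so the decompressing side can size its buffers. */
  LDM_writeHeader(dst, total, srcSize);
  return total;
}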

View File

@ -36,8 +36,7 @@
//#define ZSTD_SKIP //#define ZSTD_SKIP
//#define RUN_CHECKS //#define RUN_CHECKS
//
//
static const U64 prime8bytes = 11400714785074694791ULL; static const U64 prime8bytes = 11400714785074694791ULL;
/* Hash table stuff */ /* Hash table stuff */
@ -49,7 +48,6 @@ typedef struct LDM_hashEntry {
U32 checksum; U32 checksum;
} LDM_hashEntry; } LDM_hashEntry;
// TODO: Memory usage
struct LDM_compressStats { struct LDM_compressStats {
U32 windowSizeLog, hashTableSizeLog; U32 windowSizeLog, hashTableSizeLog;
U32 numMatches; U32 numMatches;
@ -59,9 +57,6 @@ struct LDM_compressStats {
U32 minOffset, maxOffset; U32 minOffset, maxOffset;
U32 numCollisions;
U32 numHashInserts;
U32 offsetHistogram[32]; U32 offsetHistogram[32];
U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG]; U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG];
@ -115,20 +110,19 @@ struct LDM_CCtx {
const BYTE *lagIp; const BYTE *lagIp;
U64 lagSum; U64 lagSum;
U64 numHashInserts;
// DEBUG // DEBUG
const BYTE *DEBUG_setNextHash; const BYTE *DEBUG_setNextHash;
}; };
struct LDM_hashTable { struct LDM_hashTable {
U32 numBuckets; // Number of buckets U32 numBuckets; // The number of buckets.
U32 numEntries; // Rename... U32 numEntries; // numBuckets * HASH_BUCKET_SIZE.
LDM_hashEntry *entries;
BYTE *bucketOffsets; LDM_hashEntry *entries;
// Position corresponding to offset=0 in LDM_hashEntry. BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position.
}; };
/** /**
* Create a hash table that can contain size elements. * Create a hash table that can contain size elements.
* The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG. * The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG.
@ -251,9 +245,9 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch,
* *
* We count only bytes where pMatch > pBase and pIn > pAnchor. * We count only bytes where pMatch > pBase and pIn > pAnchor.
*/ */
U64 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor,
const BYTE *pMatch, const BYTE *pBase) { const BYTE *pMatch, const BYTE *pBase) {
U64 matchLength = 0; size_t matchLength = 0;
while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) {
pIn--; pIn--;
pMatch--; pMatch--;
@ -293,7 +287,8 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend);
U64 backwardMatchLength, totalMatchLength; U64 backwardMatchLength, totalMatchLength;
// For speed. // Only take matches where the forward match length is large enough
// for speed.
if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) { if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) {
continue; continue;
} }
@ -766,6 +761,13 @@ void LDM_readHeader(const void *src, U64 *compressedSize,
// ip += sizeof(U64); // ip += sizeof(U64);
} }
void LDM_writeHeader(void *memPtr, U64 compressedSize,
U64 decompressedSize) {
MEM_write64(memPtr, compressedSize);
MEM_write64((BYTE *)memPtr + 8, decompressedSize);
}
void LDM_initializeCCtx(LDM_CCtx *cctx, void LDM_initializeCCtx(LDM_CCtx *cctx,
const void *src, size_t srcSize, const void *src, size_t srcSize,
void *dst, size_t maxDstSize) { void *dst, size_t maxDstSize) {

View File

@ -1,37 +1,73 @@
/**
* A "hash" table used in LDM compression.
*
* This is not exactly a hash table in the sense that inserted entries
* are not guaranteed to remain in the hash table.
*/
#ifndef LDM_HASHTABLE_H #ifndef LDM_HASHTABLE_H
#define LDM_HASHTABLE_H #define LDM_HASHTABLE_H
#include "mem.h" #include "mem.h"
// The log size of LDM_hashEntry in bytes.
#define LDM_HASH_ENTRY_SIZE_LOG 3 #define LDM_HASH_ENTRY_SIZE_LOG 3
// TODO: clean up comments
typedef U32 hash_t; typedef U32 hash_t;
typedef struct LDM_hashEntry { typedef struct LDM_hashEntry {
U32 offset; // TODO: Replace with pointer? U32 offset; // Represents the offset of the entry from offsetBase.
U32 checksum; U32 checksum; // A checksum to select entries with the same hash value.
} LDM_hashEntry; } LDM_hashEntry;
typedef struct LDM_hashTable LDM_hashTable; typedef struct LDM_hashTable LDM_hashTable;
/**
* Create a table that can contain size elements. This does not necessarily
* correspond to the number of hash buckets. The number of hash buckets
* is size / (1 << HASH_BUCKET_SIZE_LOG).
*
* minMatchLength is the minimum match length required in HASH_getBestEntry.
*
* maxWindowSize is the maximum distance from pIn in HASH_getBestEntry.
* The match distance is computed as (pIn - offsetBase) - offset.
*/
LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase, LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase,
U32 minMatchLength, U32 maxWindowSize); U32 minMatchLength, U32 maxWindowSize);
/**
* Return the "best" entry from the table with the same hash and checksum.
*
* pIn: a pointer to the current input position.
* pEnd: a pointer to the maximum input position.
* pAnchor: a pointer to the minimum input position.
*
* This function computes the forward and backward match length from pIn
* and writes them to forwardMatchLength and backwardsMatchLength.
*
* E.g. for the two strings "aaabbbb" "aaabbbb" with pIn and the
* entry pointing at the first "b", the forward match length would be
* four (representing the "b" matches) and the backward match length would
* be three (representing the "a" matches before the pointer).
*/
LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table,
const hash_t hash, const hash_t hash,
const U32 checksum, const U32 checksum,
const BYTE *pIn, const BYTE *pIn,
const BYTE *pEnd, const BYTE *pEnd,
const BYTE *pAnchor, const BYTE *pAnchor,
U64 *matchLength, U64 *forwardMatchLength,
U64 *backwardsMatchLength); U64 *backwardsMatchLength);
/**
* Return a hash of the value.
*/
hash_t HASH_hashU32(U32 value); hash_t HASH_hashU32(U32 value);
/** /**
* Insert an LDM_hashEntry into the bucket corresponding to hash. * Insert an LDM_hashEntry into the bucket corresponding to hash.
*
* An entry may be evicted in the process.
*/ */
void HASH_insert(LDM_hashTable *table, const hash_t hash, void HASH_insert(LDM_hashTable *table, const hash_t hash,
const LDM_hashEntry entry); const LDM_hashEntry entry);
@ -41,6 +77,9 @@ void HASH_insert(LDM_hashTable *table, const hash_t hash,
*/ */
U32 HASH_getSize(const LDM_hashTable *table); U32 HASH_getSize(const LDM_hashTable *table);
/**
* Destroy the table.
*/
void HASH_destroyTable(LDM_hashTable *table); void HASH_destroyTable(LDM_hashTable *table);
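A sketch of the intended calling pattern for this interface (the checksum here is a placeholder read with MEM_read32; in ldm.c it comes from the rolling checksum, and the table sizes below are illustrative values, not the source's):

#include "ldm_hashtable.h"

static void hashTableExample(const BYTE *src, size_t srcSize) {
  const BYTE *const srcEnd = src + srcSize;  /* assumes srcSize > 100 */
  LDM_hashTable *table = HASH_createTable(1 << 20, src,
                                          /* minMatchLength */ 16,
                                          /* maxWindowSize  */ 1 << 28);

  /* Insert an entry for position 0 (placeholder checksum). */
  U32 const checksum = MEM_read32(src);
  hash_t const hash = HASH_hashU32(checksum);
  LDM_hashEntry const entry = { 0, checksum };
  HASH_insert(table, hash, entry);             /* may evict an older entry */

  /* Later, query the best match for the current position. */
  U64 forwardLength, backwardLength;
  LDM_hashEntry *match = HASH_getBestEntry(table, hash, checksum,
                                           src + 100, srcEnd, src,
                                           &forwardLength, &backwardLength);
  (void)match;  /* result unused in this sketch */
  HASH_destroyTable(table);
}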
/** /**

View File

@ -33,8 +33,6 @@
//#define RUN_CHECKS //#define RUN_CHECKS
/* Hash table stuff */
typedef U32 hash_t; typedef U32 hash_t;
typedef struct LDM_hashEntry { typedef struct LDM_hashEntry {
@ -42,7 +40,6 @@ typedef struct LDM_hashEntry {
U32 checksum; U32 checksum;
} LDM_hashEntry; } LDM_hashEntry;
// TODO: Memory usage
struct LDM_compressStats { struct LDM_compressStats {
U32 windowSizeLog, hashTableSizeLog; U32 windowSizeLog, hashTableSizeLog;
U32 numMatches; U32 numMatches;
@ -52,9 +49,6 @@ struct LDM_compressStats {
U32 minOffset, maxOffset; U32 minOffset, maxOffset;
U32 numCollisions;
U32 numHashInserts;
U32 offsetHistogram[32]; U32 offsetHistogram[32];
}; };
@ -85,8 +79,6 @@ struct LDM_CCtx {
LDM_hashTable *hashTable; LDM_hashTable *hashTable;
// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
const BYTE *lastPosHashed; /* Last position hashed */ const BYTE *lastPosHashed; /* Last position hashed */
hash_t lastHash; /* Hash corresponding to lastPosHashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */
U32 lastSum; U32 lastSum;
@ -109,11 +101,10 @@ struct LDM_CCtx {
struct LDM_hashTable { struct LDM_hashTable {
U32 numBuckets; // Number of buckets U32 numBuckets; // Number of buckets
U32 numEntries; // Rename... U32 numEntries;
LDM_hashEntry *entries; LDM_hashEntry *entries;
BYTE *bucketOffsets; BYTE *bucketOffsets;
// Position corresponding to offset=0 in LDM_hashEntry.
}; };
/** /**
@ -354,32 +345,6 @@ static int intLog2(U32 x) {
return ret; return ret;
} }
// Maybe we would eventually prefer to have linear rather than
// exponential buckets.
/**
void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) {
U32 i = 0;
int buckets[32] = { 0 };
printf("\n");
printf("Hash table histogram\n");
for (; i < HASH_getSize(cctx->hashTable); i++) {
int offset = (cctx->ip - cctx->ibase) -
HASH_getEntryFromHash(cctx->hashTable, i)->offset;
buckets[intLog2(offset)]++;
}
i = 0;
for (; i < 32; i++) {
printf("2^%*d: %10u %6.3f%%\n", 2, i,
buckets[i],
100.0 * (double) buckets[i] /
(double) HASH_getSize(cctx->hashTable));
}
printf("\n");
}
*/
void LDM_printCompressStats(const LDM_compressStats *stats) { void LDM_printCompressStats(const LDM_compressStats *stats) {
int i = 0; int i = 0;
printf("=====================\n"); printf("=====================\n");
@ -508,7 +473,6 @@ static void setNextHash(LDM_CCtx *cctx) {
cctx->DEBUG_setNextHash = cctx->nextIp; cctx->DEBUG_setNextHash = cctx->nextIp;
#endif #endif
// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH);
cctx->nextSum = updateChecksum( cctx->nextSum = updateChecksum(
cctx->lastSum, LDM_HASH_LENGTH, cctx->lastSum, LDM_HASH_LENGTH,
cctx->lastPosHashed[0], cctx->lastPosHashed[0],
@ -517,7 +481,6 @@ static void setNextHash(LDM_CCtx *cctx) {
cctx->nextHash = checksumToHash(cctx->nextSum); cctx->nextHash = checksumToHash(cctx->nextSum);
#if LDM_LAG #if LDM_LAG
// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp);
if (cctx->ip - cctx->ibase > LDM_LAG) { if (cctx->ip - cctx->ibase > LDM_LAG) {
cctx->lagSum = updateChecksum( cctx->lagSum = updateChecksum(
cctx->lagSum, LDM_HASH_LENGTH, cctx->lagSum, LDM_HASH_LENGTH,
@ -547,10 +510,6 @@ static void putHashOfCurrentPositionFromHash(
// Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Hash only every HASH_ONLY_EVERY times, based on cctx->ip.
// Note: this works only when cctx->step is 1. // Note: this works only when cctx->step is 1.
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
/**
const LDM_hashEntry entry = { cctx->ip - cctx->ibase ,
MEM_read32(cctx->ip) };
*/
#if LDM_LAG #if LDM_LAG
// TODO: off by 1, but whatever // TODO: off by 1, but whatever
if (cctx->lagIp - cctx->ibase > 0) { if (cctx->lagIp - cctx->ibase > 0) {
@ -604,21 +563,6 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) {
putHashOfCurrentPositionFromHash(cctx, hash, sum); putHashOfCurrentPositionFromHash(cctx, hash, sum);
} }
U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch,
const BYTE *pInLimit) {
const BYTE * const pStart = pIn;
while (pIn < pInLimit - 1) {
BYTE const diff = (*pMatch) ^ *(pIn);
if (!diff) {
pIn++;
pMatch++;
continue;
}
return (U32)(pIn - pStart);
}
return (U32)(pIn - pStart);
}
void LDM_outputConfiguration(void) { void LDM_outputConfiguration(void) {
printf("=====================\n"); printf("=====================\n");
printf("Configuration\n"); printf("Configuration\n");
@ -640,6 +584,13 @@ void LDM_readHeader(const void *src, U64 *compressedSize,
// ip += sizeof(U64); // ip += sizeof(U64);
} }
void LDM_writeHeader(void *memPtr, U64 compressedSize,
U64 decompressedSize) {
MEM_write64(memPtr, compressedSize);
MEM_write64((BYTE *)memPtr + 8, decompressedSize);
}
void LDM_initializeCCtx(LDM_CCtx *cctx, void LDM_initializeCCtx(LDM_CCtx *cctx,
const void *src, size_t srcSize, const void *src, size_t srcSize,
void *dst, size_t maxDstSize) { void *dst, size_t maxDstSize) {

View File

@ -12,13 +12,12 @@
#include "ldm.h" #include "ldm.h"
#include "zstd.h" #include "zstd.h"
#define DEBUG
//#define TEST //#define TEST
/* Compress file given by fname and output to oname. /* Compress file given by fname and output to oname.
* Returns 0 if successful, error code otherwise. * Returns 0 if successful, error code otherwise.
* *
* TODO: This might seg fault if the compressed size is > the decompress * This might seg fault if the compressed size is > the decompress
* size due to the mmapping and output file size allocated to be the input size * size due to the mmapping and output file size allocated to be the input size
* The compress function should check before writing or buffer writes. * The compress function should check before writing or buffer writes.
*/ */
@ -31,6 +30,7 @@ static int compress(const char *fname, const char *oname) {
struct timeval tv1, tv2; struct timeval tv1, tv2;
double timeTaken; double timeTaken;
/* Open the input file. */ /* Open the input file. */
if ((fdin = open(fname, O_RDONLY)) < 0) { if ((fdin = open(fname, O_RDONLY)) < 0) {
perror("Error in file opening"); perror("Error in file opening");
@ -50,6 +50,7 @@ static int compress(const char *fname, const char *oname) {
} }
maxCompressedSize = (statbuf.st_size + LDM_HEADER_SIZE); maxCompressedSize = (statbuf.st_size + LDM_HEADER_SIZE);
// Handle case where compressed size is > decompressed size. // Handle case where compressed size is > decompressed size.
// The compress function should check before writing or buffer writes. // The compress function should check before writing or buffer writes.
maxCompressedSize += statbuf.st_size / 255; maxCompressedSize += statbuf.st_size / 255;
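The two lines above size the output mapping: the input size plus the 16-byte header, plus a 1/255 expansion margin for data that does not compress. As a sketch, that bound could be factored out like this (maxCompressedBound is an illustrative name, not part of the source):

#include "ldm.h"   /* LDM_HEADER_SIZE */

/* Worst-case output size as computed by compress() above. */
static size_t maxCompressedBound(size_t srcSize) {
  return LDM_HEADER_SIZE + srcSize + srcSize / 255;
}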
@ -79,21 +80,17 @@ static int compress(const char *fname, const char *oname) {
compressedSize = LDM_HEADER_SIZE + compressedSize = LDM_HEADER_SIZE +
LDM_compress(src, statbuf.st_size, LDM_compress(src, statbuf.st_size,
dst + LDM_HEADER_SIZE, maxCompressedSize); dst + LDM_HEADER_SIZE, maxCompressedSize);
gettimeofday(&tv2, NULL); gettimeofday(&tv2, NULL);
// Write compress and decompress size to header // Write compress and decompress size to header
// TODO: should depend on LDM_DECOMPRESS_SIZE write32 // TODO: should depend on LDM_DECOMPRESS_SIZE write32
memcpy(dst, &compressedSize, 8); LDM_writeHeader(dst, compressedSize, statbuf.st_size);
memcpy(dst + 8, &(statbuf.st_size), 8);
#ifdef DEBUG
printf("Compressed size: %zu\n", compressedSize);
printf("Decompressed size: %zu\n", (size_t)statbuf.st_size);
#endif
// Truncate file to compressedSize. // Truncate file to compressedSize.
ftruncate(fdout, compressedSize); ftruncate(fdout, compressedSize);
printf("%25s : %10lu -> %10lu - %s (%.2fx --- %.1f%%)\n", fname, printf("%25s : %10lu -> %10lu - %s (%.2fx --- %.1f%%)\n", fname,
(size_t)statbuf.st_size, (size_t)compressedSize, oname, (size_t)statbuf.st_size, (size_t)compressedSize, oname,
(statbuf.st_size) / (double)compressedSize, (statbuf.st_size) / (double)compressedSize,
@ -102,7 +99,7 @@ static int compress(const char *fname, const char *oname) {
timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 +
(double) (tv2.tv_sec - tv1.tv_sec), (double) (tv2.tv_sec - tv1.tv_sec),
printf("Total compress time = %.3f seconds, Average compression speed: %.3f MB/s\n", printf("Total compress time = %.3f seconds, Average scanning speed: %.3f MB/s\n",
timeTaken, timeTaken,
((double)statbuf.st_size / (double) (1 << 20)) / timeTaken); ((double)statbuf.st_size / (double) (1 << 20)) / timeTaken);