From 8ed92201024e889333a9ac89dd10b188d28d8647 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 24 Jul 2017 12:05:43 -0700 Subject: [PATCH] Experiment with eviction policies and minor code cleanup --- contrib/long_distance_matching/Makefile | 7 +- .../circular_buffer_table.c | 43 +++++++----- contrib/long_distance_matching/ldm.c | 51 ++++----------- contrib/long_distance_matching/ldm.h | 44 ++++++------- contrib/long_distance_matching/ldm_64_hash.c | 32 ++++----- .../long_distance_matching/ldm_hashtable.h | 49 ++++++++++++-- .../{ldm_with_table.c => ldm_integrated.c} | 65 +++---------------- contrib/long_distance_matching/main-ldm.c | 17 ++--- 8 files changed, 139 insertions(+), 169 deletions(-) rename contrib/long_distance_matching/{ldm_with_table.c => ldm_integrated.c} (94%) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 16844297..c8129f67 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -27,20 +27,17 @@ default: all all: main-circular-buffer main-integrated main-64 -#main-basic : basic_table.c ldm.c main-ldm.c -# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ main-64: ldm_64_hash.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-integrated: ldm_with_table.c main-ldm.c +main-integrated: ldm_integrated.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-circular-buffer main-integrated main-64 + main-circular-buffer main-64 main-integrated @echo Cleaning completed diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index fb6c19d2..92ffc55b 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -5,26 +5,24 @@ 
#include "ldm_hashtable.h" #include "mem.h" -// Number of elements per hash bucket. -// HASH_BUCKET_SIZE_LOG defined in ldm.h +// The number of elements per hash bucket. +// HASH_BUCKET_SIZE_LOG is defined in ldm.h. #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) + +// The log2 of the number of hash buckets. #define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG)) - - -// TODO: rename. Number of hash buckets. -// TODO: Link to HASH_ENTRY_SIZE_LOG - +// If ZSTD_SKIP is defined, then the first entry is returned in HASH_getBestEntry +// (without looking at other entries in the bucket). //#define ZSTD_SKIP struct LDM_hashTable { - U32 numBuckets; - U32 numEntries; + U32 numBuckets; // The number of buckets. + U32 numEntries; // numBuckets * HASH_BUCKET_SIZE. LDM_hashEntry *entries; - BYTE *bucketOffsets; // Pointer to current insert position. + BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position. - // Position corresponding to offset=0 in LDM_hashEntry. - const BYTE *offsetBase; + const BYTE *offsetBase; // Corresponds to offset=0 in LDM_hashEntry. U32 minMatchLength; U32 maxWindowSize; }; @@ -46,6 +44,7 @@ static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + (hash << HASH_BUCKET_SIZE_LOG); } +// From lib/compress/zstd_compress.c static unsigned ZSTD_NbCommonBytes (register size_t val) { if (MEM_isLittleEndian()) { @@ -114,7 +113,11 @@ static unsigned ZSTD_NbCommonBytes (register size_t val) } } } -// From lib/compress/zstd_compress.c +/** + * From lib/compress/zstd_compress.c + * Returns the number of bytes (consecutively) in common between pIn and pMatch + * up to pInLimit. 
+ */ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, const BYTE *const pInLimit) { const BYTE * const pStart = pIn; @@ -147,9 +150,14 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, return (size_t)(pIn - pStart); } -U32 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, - const BYTE *pMatch, const BYTE *pBase) { - U32 matchLength = 0; +/** + * Returns the number of bytes in common between pIn and pMatch, + * counting backwards, with pIn having a lower limit of pAnchor and + * pMatch having a lower limit of pBase. + */ +static size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, + const BYTE *pMatch, const BYTE *pBase) { + size_t matchLength = 0; while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { pIn--; pMatch--; @@ -178,6 +186,8 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, U64 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); U64 backwardMatchLength, totalMatchLength; + // Only take matches where the forwardMatchLength is large enough + // for speed. if (forwardMatchLength < table->minMatchLength) { continue; } @@ -212,6 +222,7 @@ hash_t HASH_hashU32(U32 value) { void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { + // Circular buffer. 
*(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; table->bucketOffsets[hash]++; table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index bfaff1f5..a5594ff6 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -29,7 +29,6 @@ typedef U32 checksum_t; -// TODO: Scanning speed // TODO: Memory usage struct LDM_compressStats { U32 windowSizeLog, hashTableSizeLog; @@ -40,9 +39,6 @@ struct LDM_compressStats { U32 minOffset, maxOffset; - U32 numCollisions; - U32 numHashInserts; - U32 offsetHistogram[32]; }; @@ -80,15 +76,12 @@ struct LDM_CCtx { hash_t nextHash; /* Hash corresponding to nextPosHashed */ checksum_t nextSum; - - unsigned step; // ip step, should be 1. const BYTE *lagIp; hash_t lagHash; checksum_t lagSum; - U64 numHashInserts; // DEBUG const BYTE *DEBUG_setNextHash; }; @@ -103,32 +96,6 @@ static int intLog2(U32 x) { return ret; } -// TODO: Maybe we would eventually prefer to have linear rather than -// exponential buckets. 
-/** -void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { - U32 i = 0; - int buckets[32] = { 0 }; - - printf("\n"); - printf("Hash table histogram\n"); - for (; i < HASH_getSize(cctx->hashTable); i++) { - int offset = (cctx->ip - cctx->ibase) - - HASH_getEntryFromHash(cctx->hashTable, i)->offset; - buckets[intLog2(offset)]++; - } - - i = 0; - for (; i < 32; i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, - buckets[i], - 100.0 * (double) buckets[i] / - (double) HASH_getSize(cctx->hashTable)); - } - printf("\n"); -} -*/ - void LDM_printCompressStats(const LDM_compressStats *stats) { int i = 0; printf("=====================\n"); @@ -163,7 +130,8 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { printf("=====================\n"); } -int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { +/* +static int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { U32 lengthLeft = LDM_MIN_MATCH_LENGTH; const BYTE *curIn = pIn; const BYTE *curMatch = pMatch; @@ -181,6 +149,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { } return 1; } +*/ /** * Convert a sum computed from getChecksum to a hash value in the range @@ -253,7 +222,6 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->DEBUG_setNextHash = cctx->nextIp; #endif -// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); cctx->nextSum = updateChecksum( cctx->lastSum, LDM_HASH_LENGTH, cctx->lastPosHashed[0], @@ -292,7 +260,7 @@ static void putHashOfCurrentPositionFromHash( // Note: this works only when cctx->step is 1. 
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { #if LDM_LAG - // TODO: off by 1, but whatever + // Off by 1, but whatever if (cctx->lagIp - cctx->ibase > 0) { const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; HASH_insert(cctx->hashTable, cctx->lagHash, entry); @@ -344,6 +312,7 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { putHashOfCurrentPositionFromHash(cctx, hash, sum); } +/* U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { const BYTE * const pStart = pIn; @@ -358,6 +327,7 @@ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, } return (U32)(pIn - pStart); } +*/ void LDM_outputConfiguration(void) { printf("=====================\n"); @@ -380,6 +350,12 @@ void LDM_readHeader(const void *src, U64 *compressedSize, // ip += sizeof(U64); } +void LDM_writeHeader(void *memPtr, U64 compressedSize, + U64 decompressedSize) { + MEM_write64(memPtr, compressedSize); + MEM_write64((BYTE *)memPtr + 8, decompressedSize); +} + void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { @@ -592,8 +568,6 @@ size_t LDM_compress(const void *src, size_t srcSize, LDM_updateLastHashFromNextHash(&cctx); } - // HASH_outputTableOffsetHistogram(&cctx); - /* Encode the last literals (no more matches). 
*/ { const U32 lastRun = cctx.iend - cctx.anchor; @@ -692,7 +666,6 @@ size_t LDM_decompress(const void *src, size_t compressedSize, return dctx.op - (BYTE *)dst; } -// TODO: implement and test hash function void LDM_test(const BYTE *src) { (void)src; } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 840824c4..adbe35bf 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -1,20 +1,24 @@ #ifndef LDM_H #define LDM_H -#include /* size_t */ - #include "mem.h" // from /lib/common/mem.h -#define LDM_COMPRESS_SIZE 8 -#define LDM_DECOMPRESS_SIZE 8 -#define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) +// The number of bytes storing the compressed and decompressed size +// in the header. +#define LDM_COMPRESSED_SIZE 8 +#define LDM_DECOMPRESSED_SIZE 8 +#define LDM_HEADER_SIZE ((LDM_COMPRESSED_SIZE)+(LDM_DECOMPRESSED_SIZE)) + +// The number of bytes storing the offset. #define LDM_OFFSET_SIZE 4 // Defines the size of the hash table. // Note that this is not the number of buckets. // Currently this should be less than WINDOW_SIZE_LOG + 4? -#define LDM_MEMORY_USAGE 24 -#define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now +#define LDM_MEMORY_USAGE 23 + +// The log2 of the number of entries in a hash bucket. +#define HASH_BUCKET_SIZE_LOG 0 // The maximum is 4 for now. // Defines the lag in inserting elements into the hash table. #define LDM_LAG 0 @@ -23,11 +27,10 @@ #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four (and perhaps set to the same value?). -#define LDM_MIN_MATCH_LENGTH 64 -#define LDM_HASH_LENGTH 64 +#define LDM_MIN_MATCH_LENGTH 16 +#define LDM_HASH_LENGTH 16 // Experimental. -//:w //#define TMP_EVICTION #define TMP_TAG_INSERT //#define TMP_SIMPLE_LOWER @@ -37,7 +40,6 @@ typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_DCtx LDM_DCtx; - /** * Compresses src into dst. 
* @@ -94,17 +96,6 @@ void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx); * Outputs compression statistics to stdout. */ void LDM_printCompressStats(const LDM_compressStats *stats); -/** - * Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the - * LDM_MIN_MATCH_LENGTH bytes from match and also if - * pIn - pMatch <= LDM_WINDOW_SIZE. - * - * This assumes LDM_MIN_MATCH_LENGTH is a multiple of four. - * - * Return 1 if valid, 0 otherwise. - */ -int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch); - /** * Encode the literal length followed by the literals. @@ -150,6 +141,15 @@ void LDM_initializeDCtx(LDM_DCtx *dctx, void LDM_readHeader(const void *src, U64 *compressedSize, U64 *decompressedSize); +/** + * Write the compressed and decompressed size. + */ +void LDM_writeHeader(void *memPtr, U64 compressedSize, + U64 decompressedSize); + +/** + * Output the configuration used. + */ void LDM_outputConfiguration(void); void LDM_test(const BYTE *src); diff --git a/contrib/long_distance_matching/ldm_64_hash.c b/contrib/long_distance_matching/ldm_64_hash.c index bdbdd199..d0080efd 100644 --- a/contrib/long_distance_matching/ldm_64_hash.c +++ b/contrib/long_distance_matching/ldm_64_hash.c @@ -36,8 +36,7 @@ //#define ZSTD_SKIP //#define RUN_CHECKS -// -// + static const U64 prime8bytes = 11400714785074694791ULL; /* Hash table stuff */ @@ -49,7 +48,6 @@ typedef struct LDM_hashEntry { U32 checksum; } LDM_hashEntry; -// TODO: Memory usage struct LDM_compressStats { U32 windowSizeLog, hashTableSizeLog; U32 numMatches; @@ -59,9 +57,6 @@ struct LDM_compressStats { U32 minOffset, maxOffset; - U32 numCollisions; - U32 numHashInserts; - U32 offsetHistogram[32]; U64 TMP_hashCount[1 << HASH_ONLY_EVERY_LOG]; @@ -115,20 +110,19 @@ struct LDM_CCtx { const BYTE *lagIp; U64 lagSum; - U64 numHashInserts; // DEBUG const BYTE *DEBUG_setNextHash; }; struct LDM_hashTable { - U32 numBuckets; // Number of buckets - U32 numEntries; // Rename... 
- LDM_hashEntry *entries; + U32 numBuckets; // The number of buckets. + U32 numEntries; // numBuckets * HASH_BUCKET_SIZE. - BYTE *bucketOffsets; - // Position corresponding to offset=0 in LDM_hashEntry. + LDM_hashEntry *entries; + BYTE *bucketOffsets; // A pointer (per bucket) to the next insert position. }; + /** * Create a hash table that can contain size elements. * The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG. @@ -251,9 +245,9 @@ static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, * * We count only bytes where pMatch > pBaes and pIn > pAnchor. */ -U64 countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, +size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor, const BYTE *pMatch, const BYTE *pBase) { - U64 matchLength = 0; + size_t matchLength = 0; while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) { pIn--; pMatch--; @@ -293,7 +287,8 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx, U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend); U64 backwardMatchLength, totalMatchLength; - // For speed. + // Only take matches where the forward match length is large enough + // for speed. if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) { continue; } @@ -766,6 +761,13 @@ void LDM_readHeader(const void *src, U64 *compressedSize, // ip += sizeof(U64); } +void LDM_writeHeader(void *memPtr, U64 compressedSize, + U64 decompressedSize) { + MEM_write64(memPtr, compressedSize); + MEM_write64((BYTE *)memPtr + 8, decompressedSize); +} + + void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index d59f401e..6093197d 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -1,37 +1,73 @@ +/** + * A "hash" table used in LDM compression. 
+ * + * This is not exactly a hash table in the sense that inserted entries + * are not guaranteed to remain in the hash table. + */ + #ifndef LDM_HASHTABLE_H #define LDM_HASHTABLE_H #include "mem.h" +// The log size of LDM_hashEntry in bytes. #define LDM_HASH_ENTRY_SIZE_LOG 3 -// TODO: clean up comments - typedef U32 hash_t; typedef struct LDM_hashEntry { - U32 offset; // TODO: Replace with pointer? - U32 checksum; + U32 offset; // Represents the offset of the entry from offsetBase. + U32 checksum; // A checksum to select entries with the same hash value. } LDM_hashEntry; typedef struct LDM_hashTable LDM_hashTable; +/** + * Create a table that can contain size elements. This does not necessarily + * correspond to the number of hash buckets. The number of hash buckets + * is size / (1 << HASH_BUCKET_SIZE_LOG) + * + * minMatchLength is the minimum match length required in HASH_getBestEntry. + * + * maxWindowSize is the maximum distance from pIn in HASH_getBestEntry. + * The window is defined to be (pIn - offsetBase - offset). + */ LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase, U32 minMatchLength, U32 maxWindowSize); +/** + * Return the "best" entry from the table with the same hash and checksum. + * + * pIn: a pointer to the current input position. + * pEnd: a pointer to the maximum input position. + * pAnchor: a pointer to the minimum input position. + * + * This function computes the forward and backward match length from pIn + * and writes them to forwardMatchLength and backwardsMatchLength. + * + * E.g. for the two strings "aaabbbb" "aaabbbb" with pIn and the + * entry pointing at the first "b", the forward match length would be + * four (representing the "b" matches) and the backward match length would + * be three (representing the "a" matches before the pointer). 
+ */ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table, const hash_t hash, const U32 checksum, const BYTE *pIn, const BYTE *pEnd, const BYTE *pAnchor, - U64 *matchLength, + U64 *forwardMatchLength, U64 *backwardsMatchLength); +/** + * Return a hash of the value. + */ hash_t HASH_hashU32(U32 value); /** * Insert an LDM_hashEntry into the bucket corresponding to hash. + * + * An entry may be evicted in the process. */ void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry); @@ -41,6 +77,9 @@ void HASH_insert(LDM_hashTable *table, const hash_t hash, */ U32 HASH_getSize(const LDM_hashTable *table); +/** + * Destroy the table. + */ void HASH_destroyTable(LDM_hashTable *table); /** diff --git a/contrib/long_distance_matching/ldm_with_table.c b/contrib/long_distance_matching/ldm_integrated.c similarity index 94% rename from contrib/long_distance_matching/ldm_with_table.c rename to contrib/long_distance_matching/ldm_integrated.c index babfdf3f..7733d4e9 100644 --- a/contrib/long_distance_matching/ldm_with_table.c +++ b/contrib/long_distance_matching/ldm_integrated.c @@ -33,8 +33,6 @@ //#define RUN_CHECKS -/* Hash table stuff */ - typedef U32 hash_t; typedef struct LDM_hashEntry { @@ -42,7 +40,6 @@ typedef struct LDM_hashEntry { U32 checksum; } LDM_hashEntry; -// TODO: Memory usage struct LDM_compressStats { U32 windowSizeLog, hashTableSizeLog; U32 numMatches; @@ -52,9 +49,6 @@ struct LDM_compressStats { U32 minOffset, maxOffset; - U32 numCollisions; - U32 numHashInserts; - U32 offsetHistogram[32]; }; @@ -85,8 +79,6 @@ struct LDM_CCtx { LDM_hashTable *hashTable; -// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32]; - const BYTE *lastPosHashed; /* Last position hashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */ U32 lastSum; @@ -109,11 +101,10 @@ struct LDM_CCtx { struct LDM_hashTable { U32 numBuckets; // Number of buckets - U32 numEntries; // Rename... 
+ U32 numEntries; LDM_hashEntry *entries; BYTE *bucketOffsets; - // Position corresponding to offset=0 in LDM_hashEntry. }; /** @@ -354,32 +345,6 @@ static int intLog2(U32 x) { return ret; } -// Maybe we would eventually prefer to have linear rather than -// exponential buckets. -/** -void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { - U32 i = 0; - int buckets[32] = { 0 }; - - printf("\n"); - printf("Hash table histogram\n"); - for (; i < HASH_getSize(cctx->hashTable); i++) { - int offset = (cctx->ip - cctx->ibase) - - HASH_getEntryFromHash(cctx->hashTable, i)->offset; - buckets[intLog2(offset)]++; - } - - i = 0; - for (; i < 32; i++) { - printf("2^%*d: %10u %6.3f%%\n", 2, i, - buckets[i], - 100.0 * (double) buckets[i] / - (double) HASH_getSize(cctx->hashTable)); - } - printf("\n"); -} -*/ - void LDM_printCompressStats(const LDM_compressStats *stats) { int i = 0; printf("=====================\n"); @@ -508,7 +473,6 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->DEBUG_setNextHash = cctx->nextIp; #endif -// cctx->nextSum = getChecksum((const char *)cctx->nextIp, LDM_HASH_LENGTH); cctx->nextSum = updateChecksum( cctx->lastSum, LDM_HASH_LENGTH, cctx->lastPosHashed[0], @@ -517,7 +481,6 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->nextHash = checksumToHash(cctx->nextSum); #if LDM_LAG -// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp); if (cctx->ip - cctx->ibase > LDM_LAG) { cctx->lagSum = updateChecksum( cctx->lagSum, LDM_HASH_LENGTH, @@ -547,10 +510,6 @@ static void putHashOfCurrentPositionFromHash( // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. 
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - /** - const LDM_hashEntry entry = { cctx->ip - cctx->ibase , - MEM_read32(cctx->ip) }; - */ #if LDM_LAG // TODO: off by 1, but whatever if (cctx->lagIp - cctx->ibase > 0) { @@ -604,21 +563,6 @@ static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { putHashOfCurrentPositionFromHash(cctx, hash, sum); } -U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, - const BYTE *pInLimit) { - const BYTE * const pStart = pIn; - while (pIn < pInLimit - 1) { - BYTE const diff = (*pMatch) ^ *(pIn); - if (!diff) { - pIn++; - pMatch++; - continue; - } - return (U32)(pIn - pStart); - } - return (U32)(pIn - pStart); -} - void LDM_outputConfiguration(void) { printf("=====================\n"); printf("Configuration\n"); @@ -640,6 +584,13 @@ void LDM_readHeader(const void *src, U64 *compressedSize, // ip += sizeof(U64); } +void LDM_writeHeader(void *memPtr, U64 compressedSize, + U64 decompressedSize) { + MEM_write64(memPtr, compressedSize); + MEM_write64((BYTE *)memPtr + 8, decompressedSize); +} + + void LDM_initializeCCtx(LDM_CCtx *cctx, const void *src, size_t srcSize, void *dst, size_t maxDstSize) { diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index 9769f10e..232c14a2 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -12,13 +12,12 @@ #include "ldm.h" #include "zstd.h" -#define DEBUG //#define TEST /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. * - * TODO: This might seg fault if the compressed size is > the decompress + * This might seg fault if the compressed size is > the decompress * size due to the mmapping and output file size allocated to be the input size * The compress function should check before writing or buffer writes. 
*/ @@ -31,6 +30,7 @@ static int compress(const char *fname, const char *oname) { struct timeval tv1, tv2; double timeTaken; + /* Open the input file. */ if ((fdin = open(fname, O_RDONLY)) < 0) { perror("Error in file opening"); @@ -50,6 +50,7 @@ static int compress(const char *fname, const char *oname) { } maxCompressedSize = (statbuf.st_size + LDM_HEADER_SIZE); + // Handle case where compressed size is > decompressed size. // The compress function should check before writing or buffer writes. maxCompressedSize += statbuf.st_size / 255; @@ -79,21 +80,17 @@ static int compress(const char *fname, const char *oname) { compressedSize = LDM_HEADER_SIZE + LDM_compress(src, statbuf.st_size, dst + LDM_HEADER_SIZE, maxCompressedSize); + gettimeofday(&tv2, NULL); // Write compress and decompress size to header // TODO: should depend on LDM_DECOMPRESS_SIZE write32 - memcpy(dst, &compressedSize, 8); - memcpy(dst + 8, &(statbuf.st_size), 8); - -#ifdef DEBUG - printf("Compressed size: %zu\n", compressedSize); - printf("Decompressed size: %zu\n", (size_t)statbuf.st_size); -#endif + LDM_writeHeader(dst, compressedSize, statbuf.st_size); // Truncate file to compressedSize. ftruncate(fdout, compressedSize); + printf("%25s : %10lu -> %10lu - %s (%.2fx --- %.1f%%)\n", fname, (size_t)statbuf.st_size, (size_t)compressedSize, oname, (statbuf.st_size) / (double)compressedSize, @@ -102,7 +99,7 @@ static int compress(const char *fname, const char *oname) { timeTaken = (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + (double) (tv2.tv_sec - tv1.tv_sec), - printf("Total compress time = %.3f seconds, Average compression speed: %.3f MB/s\n", + printf("Total compress time = %.3f seconds, Average scanning speed: %.3f MB/s\n", timeTaken, ((double)statbuf.st_size / (double) (1 << 20)) / timeTaken);