From 15a041adbf8b59bc88838fe07297d8399319cec0 Mon Sep 17 00:00:00 2001 From: Stella Lau Date: Mon, 17 Jul 2017 15:16:58 -0700 Subject: [PATCH] Add function to get valid entries only from table --- contrib/long_distance_matching/Makefile | 6 +-- contrib/long_distance_matching/basic_table.c | 17 ++++++ ...aining_table.c => circular_buffer_table.c} | 21 +++++++- contrib/long_distance_matching/ldm.c | 54 ++++++------------- contrib/long_distance_matching/ldm.h | 11 ++-- .../long_distance_matching/ldm_hashtable.h | 9 +++- 6 files changed, 70 insertions(+), 48 deletions(-) rename contrib/long_distance_matching/{chaining_table.c => circular_buffer_table.c} (79%) diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 3159df75..47085022 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,17 +25,17 @@ LDFLAGS += -lzstd default: all -all: main-basic main-chaining +all: main-basic main-circular-buffer main-basic : basic_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-chaining : chaining_table.c ldm.c main-ldm.c +main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic main-chaining + main-basic main-circular-buffer @echo Cleaning completed diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c index c6a5040e..859bf061 100644 --- a/contrib/long_distance_matching/basic_table.c +++ b/contrib/long_distance_matching/basic_table.c @@ -27,12 +27,29 @@ LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { return table->entries + hash; } + LDM_hashEntry *HASH_getEntryFromHash( const LDM_hashTable *table, const hash_t hash, const U32 checksum) { (void)checksum; return getBucket(table, hash); } +LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, + const hash_t hash, + const U32 checksum, + const BYTE *pIn, + int (*isValid)(const BYTE *pIn, const BYTE *pMatch)) { + LDM_hashEntry *entry = getBucket(table, hash); + (void)checksum; + if ((*isValid)(pIn, entry->offset + table->offsetBase)) { + return entry; + } else { + return NULL; + } +} + + + void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { *getBucket(table, hash) = entry; diff --git a/contrib/long_distance_matching/chaining_table.c b/contrib/long_distance_matching/circular_buffer_table.c similarity index 79% rename from contrib/long_distance_matching/chaining_table.c rename to contrib/long_distance_matching/circular_buffer_table.c index 226f7822..f45f945c 100644 --- a/contrib/long_distance_matching/chaining_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -9,7 +9,7 @@ // refactor code to scale the number of elements appropriately. // Number of elements per hash bucket. -#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now +#define HASH_BUCKET_SIZE_LOG 1 // MAX is 4 for now #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) struct LDM_hashTable { @@ -44,6 +44,25 @@ static LDM_hashEntry *getLastInsertFromHash(const LDM_hashTable *table, } */ +LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, + const hash_t hash, + const U32 checksum, + const BYTE *pIn, + int (*isValid)(const BYTE *pIn, const BYTE *pMatch)) { + LDM_hashEntry *bucket = getBucket(table, hash); + LDM_hashEntry *cur = bucket; + // TODO: in order of recency? + for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { + // CHeck checksum for faster check. + if (cur->checksum == checksum && + (*isValid)(pIn, cur->offset + table->offsetBase)) { + return cur; + } + } + return NULL; +} + + LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, const hash_t hash, const U32 checksum) { diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 3cb82ea6..bf54842f 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -5,7 +5,7 @@ #include // Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY 0 +#define HASH_ONLY_EVERY 31 #define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) @@ -38,8 +38,6 @@ struct LDM_compressStats { U32 numCollisions; U32 numHashInserts; -// U64 numInvalidHashes, numValidHashes; // tmp - U32 offsetHistogram[32]; }; @@ -153,45 +151,25 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { (double) stats->numMatches); } printf("\n"); - - /* - printf("Num invalid hashes, num valid hashes, %llu %llu\n", - stats->numInvalidHashes, stats->numValidHashes); - */ - /* - printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", - stats->numCollisions, stats->numHashInserts, - stats->numHashInserts == 0 ? - 1.0 : (100.0 * (double)stats->numCollisions) / - (double)stats->numHashInserts); - */ printf("=====================\n"); } int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { - /* - if (memcmp(pIn, pMatch, LDM_MIN_MATCH_LENGTH) == 0) { - return 1; - } - return 0; - */ - - //TODO: This seems to be faster for some reason? - U32 lengthLeft = LDM_MIN_MATCH_LENGTH; const BYTE *curIn = pIn; const BYTE *curMatch = pMatch; - for (; lengthLeft >= 8; lengthLeft -= 8) { - if (MEM_read64(curIn) != MEM_read64(curMatch)) { + if (pIn - pMatch > LDM_WINDOW_SIZE) { + return 0; + } + + for (; lengthLeft >= 4; lengthLeft -= 4) { + if (MEM_read32(curIn) != MEM_read32(curMatch)) { return 0; } - curIn += 8; - curMatch += 8; - } - if (lengthLeft > 0) { - return (MEM_read32(curIn) == MEM_read32(curMatch)); + curIn += 4; + curMatch += 4; } return 1; } @@ -307,8 +285,11 @@ static void putHashOfCurrentPositionFromHash( // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { + /** const LDM_hashEntry entry = { cctx->ip - cctx->ibase , MEM_read32(cctx->ip) }; + */ + const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; HASH_insert(cctx->hashTable, hash, entry); } @@ -438,7 +419,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { LDM_hashEntry *entry = NULL; cctx->nextIp = cctx->ip + cctx->step; - do { + while (entry == NULL) { hash_t h; U32 sum; setNextHash(cctx); @@ -451,17 +432,14 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { return 1; } - entry = HASH_getEntryFromHash(cctx->hashTable, h, MEM_read32(cctx->ip)); + entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip, + &LDM_isValidMatch); if (entry != NULL) { *match = entry->offset + cctx->ibase; } - putHashOfCurrentPositionFromHash(cctx, h, sum); - - } while (entry == NULL || - (cctx->ip - *match > LDM_WINDOW_SIZE || - !LDM_isValidMatch(cctx->ip, *match))); + } setNextHash(cctx); return 0; } diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 6325d1b1..6d97bd56 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -11,14 +11,14 @@ #define LDM_OFFSET_SIZE 4 // Defines the size of the hash table. -#define LDM_MEMORY_USAGE 16 +#define LDM_MEMORY_USAGE 20 -#define LDM_WINDOW_SIZE_LOG 25 +#define LDM_WINDOW_SIZE_LOG 30 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 1024 -#define LDM_HASH_LENGTH 1024 +#define LDM_MIN_MATCH_LENGTH 64 +#define LDM_HASH_LENGTH 64 typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; @@ -82,7 +82,8 @@ void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx); void LDM_printCompressStats(const LDM_compressStats *stats); /** * Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the - * LDM_MIN_MATCH_LENGTH bytes from match. + * LDM_MIN_MATCH_LENGTH bytes from match and also if + * pIn - pMatch <= LDM_WINDOW_SIZE. * * This assumes LDM_MIN_MATCH_LENGTH is a multiple of four. * diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 92add96f..88d19ae2 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -7,7 +7,7 @@ typedef U32 hash_t; typedef struct LDM_hashEntry { U32 offset; - U32 checksum; // Not needed? + U32 checksum; } LDM_hashEntry; typedef struct LDM_hashTable LDM_hashTable; @@ -17,10 +17,17 @@ typedef struct LDM_hashTable LDM_hashTable; LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase); +//TODO: unneeded? LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, const hash_t hash, const U32 checksum); +LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, + const hash_t hash, + const U32 checksum, + const BYTE *pIn, + int (*isValid)(const BYTE *pIn, const BYTE *pMatch)); + void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry);