Add function to get valid entries only from table

dev
Stella Lau 2017-07-17 15:16:58 -07:00
parent 4bb42b02c1
commit 15a041adbf
6 changed files with 70 additions and 48 deletions

View File

@ -25,17 +25,17 @@ LDFLAGS += -lzstd
default: all default: all
all: main-basic main-chaining all: main-basic main-circular-buffer
main-basic : basic_table.c ldm.c main-ldm.c main-basic : basic_table.c ldm.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
main-chaining : chaining_table.c ldm.c main-ldm.c main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
clean: clean:
@rm -f core *.o tmp* result* *.ldm *.ldm.dec \ @rm -f core *.o tmp* result* *.ldm *.ldm.dec \
main-basic main-chaining main-basic main-circular-buffer
@echo Cleaning completed @echo Cleaning completed

View File

@ -27,12 +27,29 @@ LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) {
return table->entries + hash; return table->entries + hash;
} }
LDM_hashEntry *HASH_getEntryFromHash( LDM_hashEntry *HASH_getEntryFromHash(
const LDM_hashTable *table, const hash_t hash, const U32 checksum) { const LDM_hashTable *table, const hash_t hash, const U32 checksum) {
(void)checksum; (void)checksum;
return getBucket(table, hash); return getBucket(table, hash);
} }
/**
 * Fetches the entry that getBucket() yields for `hash` and returns it only
 * when the caller-supplied predicate accepts it.
 *
 * The candidate match position handed to isValid is the entry's offset added
 * to table->offsetBase; pIn is forwarded unchanged.
 *
 * checksum is accepted but not consulted by this implementation.
 *
 * Returns the entry when isValid reports non-zero, NULL otherwise.
 */
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
                                  const hash_t hash,
                                  const U32 checksum,
                                  const BYTE *pIn,
                                  int (*isValid)(const BYTE *pIn,
                                                 const BYTE *pMatch)) {
  LDM_hashEntry *const candidate = getBucket(table, hash);

  (void)checksum;  /* Intentionally unused here. */

  /* Guard clause: reject the candidate unless the predicate approves it. */
  if (!isValid(pIn, candidate->offset + table->offsetBase)) {
    return NULL;
  }
  return candidate;
}
void HASH_insert(LDM_hashTable *table, void HASH_insert(LDM_hashTable *table,
const hash_t hash, const LDM_hashEntry entry) { const hash_t hash, const LDM_hashEntry entry) {
*getBucket(table, hash) = entry; *getBucket(table, hash) = entry;

View File

@ -9,7 +9,7 @@
// refactor code to scale the number of elements appropriately. // refactor code to scale the number of elements appropriately.
// Number of elements per hash bucket. // Number of elements per hash bucket.
#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now #define HASH_BUCKET_SIZE_LOG 1 // MAX is 4 for now
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
struct LDM_hashTable { struct LDM_hashTable {
@ -44,6 +44,25 @@ static LDM_hashEntry *getLastInsertFromHash(const LDM_hashTable *table,
} }
*/ */
/**
 * Scans the HASH_BUCKET_SIZE entries starting at the position getBucket()
 * returns for `hash` and returns the first one that both carries the given
 * checksum and is accepted by the caller-supplied predicate.
 *
 * The candidate match position handed to isValid is the entry's offset added
 * to table->offsetBase; pIn is forwarded unchanged.
 *
 * Returns the first accepted entry, or NULL when no entry qualifies.
 */
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
                                  const hash_t hash,
                                  const U32 checksum,
                                  const BYTE *pIn,
                                  int (*isValid)(const BYTE *pIn,
                                                 const BYTE *pMatch)) {
  LDM_hashEntry *const slots = getBucket(table, hash);
  int i;

  // TODO: in order of recency?
  for (i = 0; i < HASH_BUCKET_SIZE; ++i) {
    LDM_hashEntry *const slot = slots + i;

    // Compare checksums first: a mismatch skips the predicate call entirely.
    if (slot->checksum != checksum) {
      continue;
    }
    if (isValid(pIn, slot->offset + table->offsetBase)) {
      return slot;
    }
  }
  return NULL;
}
LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table,
const hash_t hash, const hash_t hash,
const U32 checksum) { const U32 checksum) {

View File

@ -5,7 +5,7 @@
#include <string.h> #include <string.h>
// Insert every (HASH_ONLY_EVERY + 1) into the hash table. // Insert every (HASH_ONLY_EVERY + 1) into the hash table.
#define HASH_ONLY_EVERY 0 #define HASH_ONLY_EVERY 31
#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHLOG (LDM_MEMORY_USAGE-2)
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
@ -38,8 +38,6 @@ struct LDM_compressStats {
U32 numCollisions; U32 numCollisions;
U32 numHashInserts; U32 numHashInserts;
// U64 numInvalidHashes, numValidHashes; // tmp
U32 offsetHistogram[32]; U32 offsetHistogram[32];
}; };
@ -153,45 +151,25 @@ void LDM_printCompressStats(const LDM_compressStats *stats) {
(double) stats->numMatches); (double) stats->numMatches);
} }
printf("\n"); printf("\n");
/*
printf("Num invalid hashes, num valid hashes, %llu %llu\n",
stats->numInvalidHashes, stats->numValidHashes);
*/
/*
printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n",
stats->numCollisions, stats->numHashInserts,
stats->numHashInserts == 0 ?
1.0 : (100.0 * (double)stats->numCollisions) /
(double)stats->numHashInserts);
*/
printf("=====================\n"); printf("=====================\n");
} }
int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) {
/*
if (memcmp(pIn, pMatch, LDM_MIN_MATCH_LENGTH) == 0) {
return 1;
}
return 0;
*/
//TODO: This seems to be faster for some reason?
U32 lengthLeft = LDM_MIN_MATCH_LENGTH; U32 lengthLeft = LDM_MIN_MATCH_LENGTH;
const BYTE *curIn = pIn; const BYTE *curIn = pIn;
const BYTE *curMatch = pMatch; const BYTE *curMatch = pMatch;
for (; lengthLeft >= 8; lengthLeft -= 8) { if (pIn - pMatch > LDM_WINDOW_SIZE) {
if (MEM_read64(curIn) != MEM_read64(curMatch)) {
return 0; return 0;
} }
curIn += 8;
curMatch += 8; for (; lengthLeft >= 4; lengthLeft -= 4) {
if (MEM_read32(curIn) != MEM_read32(curMatch)) {
return 0;
} }
if (lengthLeft > 0) { curIn += 4;
return (MEM_read32(curIn) == MEM_read32(curMatch)); curMatch += 4;
} }
return 1; return 1;
} }
@ -307,8 +285,11 @@ static void putHashOfCurrentPositionFromHash(
// Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Hash only every HASH_ONLY_EVERY times, based on cctx->ip.
// Note: this works only when cctx->step is 1. // Note: this works only when cctx->step is 1.
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
/**
const LDM_hashEntry entry = { cctx->ip - cctx->ibase , const LDM_hashEntry entry = { cctx->ip - cctx->ibase ,
MEM_read32(cctx->ip) }; MEM_read32(cctx->ip) };
*/
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum };
HASH_insert(cctx->hashTable, hash, entry); HASH_insert(cctx->hashTable, hash, entry);
} }
@ -438,7 +419,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) {
LDM_hashEntry *entry = NULL; LDM_hashEntry *entry = NULL;
cctx->nextIp = cctx->ip + cctx->step; cctx->nextIp = cctx->ip + cctx->step;
do { while (entry == NULL) {
hash_t h; hash_t h;
U32 sum; U32 sum;
setNextHash(cctx); setNextHash(cctx);
@ -451,17 +432,14 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) {
return 1; return 1;
} }
entry = HASH_getEntryFromHash(cctx->hashTable, h, MEM_read32(cctx->ip)); entry = HASH_getValidEntry(cctx->hashTable, h, sum, cctx->ip,
&LDM_isValidMatch);
if (entry != NULL) { if (entry != NULL) {
*match = entry->offset + cctx->ibase; *match = entry->offset + cctx->ibase;
} }
putHashOfCurrentPositionFromHash(cctx, h, sum); putHashOfCurrentPositionFromHash(cctx, h, sum);
}
} while (entry == NULL ||
(cctx->ip - *match > LDM_WINDOW_SIZE ||
!LDM_isValidMatch(cctx->ip, *match)));
setNextHash(cctx); setNextHash(cctx);
return 0; return 0;
} }

View File

@ -11,14 +11,14 @@
#define LDM_OFFSET_SIZE 4 #define LDM_OFFSET_SIZE 4
// Defines the size of the hash table. // Defines the size of the hash table.
#define LDM_MEMORY_USAGE 16 #define LDM_MEMORY_USAGE 20
#define LDM_WINDOW_SIZE_LOG 25 #define LDM_WINDOW_SIZE_LOG 30
#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))
//These should be multiples of four. //These should be multiples of four.
#define LDM_MIN_MATCH_LENGTH 1024 #define LDM_MIN_MATCH_LENGTH 64
#define LDM_HASH_LENGTH 1024 #define LDM_HASH_LENGTH 64
typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_compressStats LDM_compressStats;
typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_CCtx LDM_CCtx;
@ -82,7 +82,8 @@ void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx);
void LDM_printCompressStats(const LDM_compressStats *stats); void LDM_printCompressStats(const LDM_compressStats *stats);
/** /**
* Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the * Checks whether the LDM_MIN_MATCH_LENGTH bytes from p are the same as the
* LDM_MIN_MATCH_LENGTH bytes from match. * LDM_MIN_MATCH_LENGTH bytes from match and also if
* pIn - pMatch <= LDM_WINDOW_SIZE.
* *
* This assumes LDM_MIN_MATCH_LENGTH is a multiple of four. * This assumes LDM_MIN_MATCH_LENGTH is a multiple of four.
* *

View File

@ -7,7 +7,7 @@ typedef U32 hash_t;
typedef struct LDM_hashEntry { typedef struct LDM_hashEntry {
U32 offset; U32 offset;
U32 checksum; // Not needed? U32 checksum;
} LDM_hashEntry; } LDM_hashEntry;
typedef struct LDM_hashTable LDM_hashTable; typedef struct LDM_hashTable LDM_hashTable;
@ -17,10 +17,17 @@ typedef struct LDM_hashTable LDM_hashTable;
LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase); LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase);
//TODO: unneeded?
LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table,
const hash_t hash, const hash_t hash,
const U32 checksum); const U32 checksum);
LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table,
const hash_t hash,
const U32 checksum,
const BYTE *pIn,
int (*isValid)(const BYTE *pIn, const BYTE *pMatch));
void HASH_insert(LDM_hashTable *table, const hash_t hash, void HASH_insert(LDM_hashTable *table, const hash_t hash,
const LDM_hashEntry entry); const LDM_hashEntry entry);