From 370b751e2424024f719f95e75b7f3d18f5f0499c Mon Sep 17 00:00:00 2001 From: Giuseppe Ottaviano Date: Mon, 30 May 2016 18:49:58 -0700 Subject: [PATCH] Expose function to add entropy tables to pre-built dictionary. In some cases a custom dictionary building algorithm tailored for a specific input can be more effective than the one produced by `ZDICT_trainFromBuffer`, but with the current API it's not possible to encode the entropy tables into the custom-built dictionary. This commit extracts the logic to add entropy tables to a dictionary from `ZDICT_trainFromBuffer` and exposes it as a function `ZDICT_addEntropyTablesFromBuffer`. --- lib/dictBuilder/zdict.c | 72 +++++++++++++++++++++++++---------------- lib/dictBuilder/zdict.h | 15 +++++++++ 2 files changed, 60 insertions(+), 27 deletions(-) diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index be141ce1..09249e9e 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -807,6 +807,33 @@ static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize, return nbSegments * DIB_FASTSEGMENTSIZE; } +size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_params_t params) +{ + size_t hSize; + unsigned const compressionLevel = (params.compressionLevel == 0) ? g_compressionLevel_default : params.compressionLevel; + + /* dictionary header */ + MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC); + { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0); + U32 const dictID = params.dictID ? params.dictID : (U32)(randomID>>11); + MEM_writeLE32((char*)dictBuffer+4, dictID); + } + hSize = 8; + + /* entropy tables */ + DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ + DISPLAYLEVEL(2, "statistics ... 
\n"); + hSize += ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize, + compressionLevel, + samplesBuffer, samplesSizes, nbSamples, + (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); + + if (hSize + dictContentSize < dictBufferCapacity) + memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); + return MIN(dictBufferCapacity, hSize+dictContentSize); +} #define DIB_MINSAMPLESSIZE (DIB_FASTSEGMENTSIZE*3) /*! ZDICT_trainFromBuffer_unsafe() : @@ -815,13 +842,12 @@ static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize, */ size_t ZDICT_trainFromBuffer_unsafe( void* dictBuffer, size_t maxDictSize, - const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_params_t params) { U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16)); dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); unsigned selectivity = params.selectivityLevel; - unsigned compressionLevel = params.compressionLevel; size_t const targetDictSize = maxDictSize; size_t sBuffSize; size_t dictSize = 0; @@ -831,18 +857,17 @@ size_t ZDICT_trainFromBuffer_unsafe( if (!dictList) return ERROR(memory_allocation); /* init */ - { unsigned u; for (u=0, sBuffSize=0; u1) { /* selectivity == 1 => fast mode */ ZDICT_trainBuffer(dictList, dictListSize, samplesBuffer, sBuffSize, - sampleSizes, nbSamples, + samplesSizes, nbSamples, selectivity, (U32)targetDictSize); /* display best matches */ @@ -864,7 +889,6 @@ size_t ZDICT_trainFromBuffer_unsafe( /* create dictionary */ { U32 dictContentSize = ZDICT_dictSize(dictList); - size_t hSize; /* build dict content */ { U32 u; @@ -884,25 +908,9 @@ size_t ZDICT_trainFromBuffer_unsafe( samplesBuffer, sBuffSize); } - /* dictionary header */ - MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC); - { U64 const randomID = 
XXH64((char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize, 0); - U32 const dictID = params.dictID ? params.dictID : (U32)(randomID>>11); - MEM_writeLE32((char*)dictBuffer+4, dictID); - } - hSize = 8; - - /* entropic tables */ - DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ - DISPLAYLEVEL(2, "statistics ... \n"); - hSize += ZDICT_analyzeEntropy((char*)dictBuffer+hSize, maxDictSize-hSize, - compressionLevel, - samplesBuffer, sampleSizes, nbSamples, - (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize); - - if (hSize + dictContentSize < maxDictSize) - memmove((char*)dictBuffer + hSize, (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize); - dictSize = MIN(maxDictSize, hSize+dictContentSize); + dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize, + samplesBuffer, samplesSizes, nbSamples, + params); } /* clean up */ @@ -914,8 +922,8 @@ size_t ZDICT_trainFromBuffer_unsafe( /* issue : samplesBuffer need to be followed by a noisy guard band. 
* work around : duplicate the buffer, and add the noise */ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, - ZDICT_params_t params) + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, + ZDICT_params_t params) { void* newBuff; size_t sBuffSize; @@ -946,3 +954,13 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, samplesBuffer, samplesSizes, nbSamples, params); } + +size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) +{ + ZDICT_params_t params; + memset(&params, 0, sizeof(params)); + return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity, + samplesBuffer, samplesSizes, nbSamples, + params); +} diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h index 2ca190ce..3a724d0b 100644 --- a/lib/dictBuilder/zdict.h +++ b/lib/dictBuilder/zdict.h @@ -52,6 +52,21 @@ extern "C" { size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); +/*! ZDICT_addEntropyTablesFromBuffer() : + + Given a content-only dictionary (built for example from common strings in + the input), add entropy tables computed from the memory buffer + `samplesBuffer`, where `nbSamples` samples have been stored concatenated. + Each sample size is provided into an orderly table `samplesSizes`. + + The input dictionary is the last `dictContentSize` bytes of `dictBuffer`. The + resulting dictionary with added entropy tables will be written back to + `dictBuffer`. + @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`). 
+*/ +size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); + /*-************************************* * Helper functions