From e8093dde09a737d39184827598fc1b67feecf241 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Thu, 11 Jan 2018 11:16:32 -0800 Subject: [PATCH] fixed #304 Pathological samples may result in literal section being incompressible. This case is now detected, and literal distribution is replaced by one that can be written into the dictionary. --- lib/common/huf.h | 4 ++-- lib/compress/huf_compress.c | 1 + lib/dictBuilder/zdict.c | 32 +++++++++++++++++++++++++------- tests/fuzzer.c | 18 ++++++++++-------- 4 files changed, 38 insertions(+), 17 deletions(-) diff --git a/lib/common/huf.h b/lib/common/huf.h index 522bf9b6..1cead357 100644 --- a/lib/common/huf.h +++ b/lib/common/huf.h @@ -206,10 +206,10 @@ The following API allows targeting specific sub-functions for advanced tasks. For example, it's possible to compress several blocks using the same 'CTable', or to save and regenerate 'CTable' using external methods. */ -/* FSE_count() : find it within "fse.h" */ +/* FSE_count() : exposed within "fse.h" */ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */ -size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); +size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap, in which case, CTable will overwrite count content */ size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c index 5692d56e..cfc5a98b 100644 --- a/lib/compress/huf_compress.c +++ b/lib/compress/huf_compress.c @@ -405,6 +405,7 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 
maxSymbolValu } /** HUF_buildCTable() : + * @return : maxNbBits * Note : count is used before tree is written, so they can safely overlap */ size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits) diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index 3a164941..2380599c 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -666,6 +666,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, } } +/* ZDICT_flatLit() : + * Overwrite `countLit[0..255]` with a fixed, mostly flat but still compressible distribution of literals : 4 for symbol 0, 1 for symbols 253 and 254, 2 for every other symbol. + * Necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode (the caller asserts that HUF_buildCTable() returns 9 on this distribution). + */ +static void ZDICT_flatLit(U32* countLit) +{ + int u; + for (u=1; u<256; u++) countLit[u] = 2; + countLit[0] = 4; + countLit[253] = 1; + countLit[254] = 1; +} #define OFFCODE_MAX 30 /* only applicable to first block */ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, @@ -730,14 +742,20 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, pos += fileSizes[u]; } - /* analyze */ - errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog); - if (HUF_isError(errorCode)) { - eSize = ERROR(GENERIC); - DISPLAYLEVEL(1, " HUF_buildCTable error \n"); - goto _cleanup; + /* analyze, build stats, starting with literals */ + { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog); + if (HUF_isError(maxNbBits)) { + eSize = ERROR(GENERIC); + DISPLAYLEVEL(1, " HUF_buildCTable error \n"); + goto _cleanup; + } + if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */ + ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */ + maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog); + assert(maxNbBits==9); + } + huffLog = (U32)maxNbBits; } - huffLog = (U32)errorCode; /* looking for most common first 
offsets */ { U32 offset; diff --git a/tests/fuzzer.c b/tests/fuzzer.c index 7198329a..acb670b1 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -659,12 +659,13 @@ static int basicUnitTests(U32 seed, double compressibility) /* Dictionary and dictBuilder tests */ { ZSTD_CCtx* const cctx = ZSTD_createCCtx(); - size_t dictSize = 16 KB; - void* dictBuffer = malloc(dictSize); + size_t const dictBufferCapacity = 16 KB; + void* dictBuffer = malloc(dictBufferCapacity); size_t const totalSampleSize = 1 MB; size_t const sampleUnitSize = 8 KB; U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize); size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t)); + size_t dictSize; U32 dictID; if (dictBuffer==NULL || samplesSizes==NULL) { @@ -675,16 +676,17 @@ static int basicUnitTests(U32 seed, double compressibility) DISPLAYLEVEL(4, "test%3i : dictBuilder on cyclic data : ", testNb++); assert(compressedBufferSize >= totalSampleSize); - { U32 u; for (u=0; u