fixed #304

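Pathological samples may result in the literal section being incompressible, which yields a Huffman table that HUF_writeCTable() cannot encode. This case is now detected, and the literal distribution is replaced by a mostly flat but still compressible one that can be written into the dictionary.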
2018-01-11 11:16:32 -08:00 · 2018-01-11 11:16:32 -08:00 · e8093dde09
commit e8093dde09
parent 218e9fe0fc
4 changed files with 38 additions and 17 deletions
--- a/lib/common/huf.h
+++ b/lib/common/huf.h
@@ -206,10 +206,10 @@ The following API allows targeting specific sub-functions for advanced tasks.
 For example, it's possible to compress several blocks using the same 'CTable',
 or to save and regenerate 'CTable' using external methods.
 */
-/* FSE_count() : find it within "fse.h" */
+/* FSE_count() : exposed within "fse.h" */
 unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
 typedef struct HUF_CElt_s HUF_CElt;   /* incomplete type */
-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);
+size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap, in which case, CTable will overwrite count content */
 size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
 size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);

--- a/lib/compress/huf_compress.c
+++ b/lib/compress/huf_compress.c
@@ -405,6 +405,7 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValu
 }

 /** HUF_buildCTable() :
+ * @return : maxNbBits
 *  Note : count is used before tree is written, so they can safely overlap
 */
 size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits)
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -666,6 +666,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
    }
 }

+/* ZDICT_flatLit() :
+ * Overwrite `countLit` (entries 0..255 are written) with a mostly flat but still compressible distribution of literals : count 4 for symbol 0, count 1 for symbols 253 and 254, count 2 for every other symbol (sum == 512).
+ * Necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
+ */
+static void ZDICT_flatLit(U32* countLit)
+{
+    int u;
+    for (u=1; u<256; u++) countLit[u] = 2;   /* baseline count for symbols 1..255 */
+    countLit[0]   = 4;   /* one symbol above the baseline */
+    countLit[253] = 1;   /* two symbols below the baseline ... */
+    countLit[254] = 1;   /* ... so the distribution is not perfectly flat */
+}

 #define OFFCODE_MAX 30  /* only applicable to first block */
 static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
@@ -730,14 +742,20 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
        pos += fileSizes[u];
    }

-    /* analyze */
-    errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
-    if (HUF_isError(errorCode)) {
-        eSize = ERROR(GENERIC);
-        DISPLAYLEVEL(1, " HUF_buildCTable error \n");
-        goto _cleanup;
+    /* analyze, build stats, starting with literals */
+    {   size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+        if (HUF_isError(maxNbBits)) {
+            eSize = ERROR(GENERIC);
+            DISPLAYLEVEL(1, " HUF_buildCTable error \n");
+            goto _cleanup;
+        }
+        if (maxNbBits==8) {  /* not compressible : will fail on HUF_writeCTable() */
+            ZDICT_flatLit(countLit);  /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
+            maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
+            assert(maxNbBits==9);
+        }
+        huffLog = (U32)maxNbBits;
    }
-    huffLog = (U32)errorCode;

    /* looking for most common first offsets */
    {   U32 offset;
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -659,12 +659,13 @@ static int basicUnitTests(U32 seed, double compressibility)

    /* Dictionary and dictBuilder tests */
    {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
-        size_t dictSize = 16 KB;
-        void* dictBuffer = malloc(dictSize);
+        size_t const dictBufferCapacity = 16 KB;
+        void* dictBuffer = malloc(dictBufferCapacity);
        size_t const totalSampleSize = 1 MB;
        size_t const sampleUnitSize = 8 KB;
        U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize);
        size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t));
+        size_t dictSize;
        U32 dictID;

        if (dictBuffer==NULL || samplesSizes==NULL) {
@@ -675,16 +676,17 @@ static int basicUnitTests(U32 seed, double compressibility)

        DISPLAYLEVEL(4, "test%3i : dictBuilder on cyclic data : ", testNb++);
        assert(compressedBufferSize >= totalSampleSize);
-        { U32 u; for (u=0; u<totalSampleSize; u++) ((BYTE*)compressedBuffer)[u] = (BYTE)u; }
+        { U32 u; for (u=0; u<totalSampleSize; u++) ((BYTE*)decodedBuffer)[u] = (BYTE)u; }
        { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
-        dictSize = ZDICT_trainFromBuffer(dictBuffer, dictSize,
-                                         compressedBuffer, samplesSizes, nbSamples);
-        if (ZDICT_isError(dictSize)) goto _output_error;
-        DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)dictSize);
+        {   size_t const sDictSize = ZDICT_trainFromBuffer(dictBuffer, dictBufferCapacity,
+                                         decodedBuffer, samplesSizes, nbSamples);
+            if (ZDICT_isError(sDictSize)) goto _output_error;
+            DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)sDictSize);
+        }

        DISPLAYLEVEL(4, "test%3i : dictBuilder : ", testNb++);
        { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
-        dictSize = ZDICT_trainFromBuffer(dictBuffer, dictSize,
+        dictSize = ZDICT_trainFromBuffer(dictBuffer, dictBufferCapacity,
                                         CNBuffer, samplesSizes, nbSamples);
        if (ZDICT_isError(dictSize)) goto _output_error;
        DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)dictSize);