Pathological samples may result in literal section being incompressible.
This case is now detected,
and literal distribution is replaced by one that can be written into the dictionary.
dev
Yann Collet 2018-01-11 11:16:32 -08:00
parent 218e9fe0fc
commit e8093dde09
4 changed files with 38 additions and 17 deletions

View File

@ -206,10 +206,10 @@ The following API allows targeting specific sub-functions for advanced tasks.
For example, it's possible to compress several blocks using the same 'CTable',
or to save and regenerate 'CTable' using external methods.
*/
/* FSE_count() : find it within "fse.h" */
/* FSE_count() : exposed within "fse.h" */
unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */
size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);
size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap, in which case, CTable will overwrite count content */
size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);

View File

@ -405,6 +405,7 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValu
}
/** HUF_buildCTable() :
* @return : maxNbBits
* Note : count is used before tree is written, so they can safely overlap
*/
size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits)

View File

@ -666,6 +666,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val,
}
}
/* ZDICT_flatLit() :
* rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
* necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
*/
static void ZDICT_flatLit(U32* countLit)
{
int u;
for (u=1; u<256; u++) countLit[u] = 2;
countLit[0] = 4;
countLit[253] = 1;
countLit[254] = 1;
}
#define OFFCODE_MAX 30 /* only applicable to first block */
static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
@ -730,14 +742,20 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
pos += fileSizes[u];
}
/* analyze */
errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
if (HUF_isError(errorCode)) {
eSize = ERROR(GENERIC);
DISPLAYLEVEL(1, " HUF_buildCTable error \n");
goto _cleanup;
/* analyze, build stats, starting with literals */
{ size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
if (HUF_isError(maxNbBits)) {
eSize = ERROR(GENERIC);
DISPLAYLEVEL(1, " HUF_buildCTable error \n");
goto _cleanup;
}
if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */
ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
assert(maxNbBits==9);
}
huffLog = (U32)maxNbBits;
}
huffLog = (U32)errorCode;
/* looking for most common first offsets */
{ U32 offset;

View File

@ -659,12 +659,13 @@ static int basicUnitTests(U32 seed, double compressibility)
/* Dictionary and dictBuilder tests */
{ ZSTD_CCtx* const cctx = ZSTD_createCCtx();
size_t dictSize = 16 KB;
void* dictBuffer = malloc(dictSize);
size_t const dictBufferCapacity = 16 KB;
void* dictBuffer = malloc(dictBufferCapacity);
size_t const totalSampleSize = 1 MB;
size_t const sampleUnitSize = 8 KB;
U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize);
size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t));
size_t dictSize;
U32 dictID;
if (dictBuffer==NULL || samplesSizes==NULL) {
@ -675,16 +676,17 @@ static int basicUnitTests(U32 seed, double compressibility)
DISPLAYLEVEL(4, "test%3i : dictBuilder on cyclic data : ", testNb++);
assert(compressedBufferSize >= totalSampleSize);
{ U32 u; for (u=0; u<totalSampleSize; u++) ((BYTE*)compressedBuffer)[u] = (BYTE)u; }
{ U32 u; for (u=0; u<totalSampleSize; u++) ((BYTE*)decodedBuffer)[u] = (BYTE)u; }
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
dictSize = ZDICT_trainFromBuffer(dictBuffer, dictSize,
compressedBuffer, samplesSizes, nbSamples);
if (ZDICT_isError(dictSize)) goto _output_error;
DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)dictSize);
{ size_t const sDictSize = ZDICT_trainFromBuffer(dictBuffer, dictBufferCapacity,
decodedBuffer, samplesSizes, nbSamples);
if (ZDICT_isError(sDictSize)) goto _output_error;
DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)sDictSize);
}
DISPLAYLEVEL(4, "test%3i : dictBuilder : ", testNb++);
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
dictSize = ZDICT_trainFromBuffer(dictBuffer, dictSize,
dictSize = ZDICT_trainFromBuffer(dictBuffer, dictBufferCapacity,
CNBuffer, samplesSizes, nbSamples);
if (ZDICT_isError(dictSize)) goto _output_error;
DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)dictSize);