added a test case for dictBuilder failure

a cyclic data set makes the entropy stage fail
now, onto a fix for #304 ...
This commit is contained in:
Yann Collet 2018-01-11 09:42:38 -08:00
parent 06995775b0
commit 218e9fe0fc
5 changed files with 120 additions and 94 deletions

View File

@ -2869,7 +2869,7 @@ size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
if (params.nbThreads > 1) { if (params.nbThreads > 1) {
if (cctx->mtctx == NULL || (params.nbThreads != ZSTDMT_getNbThreads(cctx->mtctx))) { if (cctx->mtctx == NULL || (params.nbThreads != ZSTDMT_getNbThreads(cctx->mtctx))) {
DEBUGLOG(4, "ZSTD_compress_generic: creating new mtctx for nbThreads=%u (previous: %u)", DEBUGLOG(4, "ZSTD_compress_generic: creating new mtctx for nbThreads=%u (previous: %u)",
params.nbThreads, ZSTDMT_getNbThreads(cctx->mtctx)); params.nbThreads, (U32)ZSTDMT_getNbThreads(cctx->mtctx));
ZSTDMT_freeCCtx(cctx->mtctx); ZSTDMT_freeCCtx(cctx->mtctx);
cctx->mtctx = ZSTDMT_createCCtx_advanced(params.nbThreads, cctx->customMem); cctx->mtctx = ZSTDMT_createCCtx_advanced(params.nbThreads, cctx->customMem);
if (cctx->mtctx == NULL) return ERROR(memory_allocation); if (cctx->mtctx == NULL) return ERROR(memory_allocation);

View File

@ -537,8 +537,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
/* Checks */ /* Checks */
if (totalSamplesSize < MAX(d, sizeof(U64)) || if (totalSamplesSize < MAX(d, sizeof(U64)) ||
totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n", DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
(COVER_MAX_SAMPLES_SIZE >> 20)); (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
return 0; return 0;
} }
/* Zero the context */ /* Zero the context */
@ -651,12 +651,16 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
} }
ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, void *dictBuffer, size_t dictBufferCapacity,
const size_t *samplesSizes, unsigned nbSamples, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
ZDICT_cover_params_t parameters) { ZDICT_cover_params_t parameters)
BYTE *const dict = (BYTE *)dictBuffer; {
BYTE* const dict = (BYTE*)dictBuffer;
COVER_ctx_t ctx; COVER_ctx_t ctx;
COVER_map_t activeDmers; COVER_map_t activeDmers;
/* Initialize global data */
g_displayLevel = parameters.zParams.notificationLevel;
/* Checks */ /* Checks */
if (!COVER_checkParameters(parameters, dictBufferCapacity)) { if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
DISPLAYLEVEL(1, "Cover parameters incorrect\n"); DISPLAYLEVEL(1, "Cover parameters incorrect\n");
@ -671,8 +675,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
ZDICT_DICTSIZE_MIN); ZDICT_DICTSIZE_MIN);
return ERROR(dstSize_tooSmall); return ERROR(dstSize_tooSmall);
} }
/* Initialize global data */
g_displayLevel = parameters.zParams.notificationLevel;
/* Initialize context and activeDmers */ /* Initialize context and activeDmers */
if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
parameters.d)) { parameters.d)) {
@ -947,6 +949,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
unsigned k; unsigned k;
COVER_best_t best; COVER_best_t best;
POOL_ctx *pool = NULL; POOL_ctx *pool = NULL;
/* Checks */ /* Checks */
if (kMinK < kMaxD || kMaxK < kMinK) { if (kMinK < kMaxD || kMaxK < kMinK) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
@ -1004,7 +1007,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
data->parameters.k = k; data->parameters.k = k;
data->parameters.d = d; data->parameters.d = d;
data->parameters.steps = kSteps; data->parameters.steps = kSteps;
data->parameters.zParams.notificationLevel = g_displayLevel; data->parameters.zParams.notificationLevel = displayLevel;
/* Check the parameters */ /* Check the parameters */
if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) { if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
DISPLAYLEVEL(1, "Cover parameters incorrect\n"); DISPLAYLEVEL(1, "Cover parameters incorrect\n");

View File

@ -207,7 +207,6 @@ static dictItem ZDICT_analyzePos(
U32 cumulLength[LLIMIT] = {0}; U32 cumulLength[LLIMIT] = {0};
U32 savings[LLIMIT] = {0}; U32 savings[LLIMIT] = {0};
const BYTE* b = (const BYTE*)buffer; const BYTE* b = (const BYTE*)buffer;
size_t length;
size_t maxLength = LLIMIT; size_t maxLength = LLIMIT;
size_t pos = suffix[start]; size_t pos = suffix[start];
U32 end = start; U32 end = start;
@ -222,26 +221,30 @@ static dictItem ZDICT_analyzePos(
||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
/* skip and mark segment */ /* skip and mark segment */
U16 u16 = MEM_read16(b+pos+4); U16 const pattern16 = MEM_read16(b+pos+4);
U32 u, e = 6; U32 u, patternEnd = 6;
while (MEM_read16(b+pos+e) == u16) e+=2 ; while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
if (b[pos+e] == b[pos+e-1]) e++; if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
for (u=1; u<e; u++) for (u=1; u<patternEnd; u++)
doneMarks[pos+u] = 1; doneMarks[pos+u] = 1;
return solution; return solution;
} }
/* look forward */ /* look forward */
do { { size_t length;
end++; do {
length = ZDICT_count(b + pos, b + suffix[end]); end++;
} while (length >=MINMATCHLENGTH); length = ZDICT_count(b + pos, b + suffix[end]);
} while (length >= MINMATCHLENGTH);
}
/* look backward */ /* look backward */
do { { size_t length;
length = ZDICT_count(b + pos, b + *(suffix+start-1)); do {
if (length >=MINMATCHLENGTH) start--; length = ZDICT_count(b + pos, b + *(suffix+start-1));
} while(length >= MINMATCHLENGTH); if (length >=MINMATCHLENGTH) start--;
} while(length >= MINMATCHLENGTH);
}
/* exit if not found a minimum nb of repetitions */ /* exit if not found a minimum nb of repetitions */
if (end-start < minRatio) { if (end-start < minRatio) {
@ -268,7 +271,7 @@ static dictItem ZDICT_analyzePos(
U32 selectedCount = 0; U32 selectedCount = 0;
U32 selectedID = currentID; U32 selectedID = currentID;
for (id =refinedStart; id < refinedEnd; id++) { for (id =refinedStart; id < refinedEnd; id++) {
if (b[ suffix[id] + searchLength] != currentChar) { if (b[suffix[id] + searchLength] != currentChar) {
if (currentCount > selectedCount) { if (currentCount > selectedCount) {
selectedCount = currentCount; selectedCount = currentCount;
selectedID = currentID; selectedID = currentID;
@ -297,20 +300,23 @@ static dictItem ZDICT_analyzePos(
memset(lengthList, 0, sizeof(lengthList)); memset(lengthList, 0, sizeof(lengthList));
/* look forward */ /* look forward */
do { { size_t length;
end++; do {
length = ZDICT_count(b + pos, b + suffix[end]); end++;
if (length >= LLIMIT) length = LLIMIT-1; length = ZDICT_count(b + pos, b + suffix[end]);
lengthList[length]++; if (length >= LLIMIT) length = LLIMIT-1;
} while (length >=MINMATCHLENGTH); lengthList[length]++;
} while (length >=MINMATCHLENGTH);
}
/* look backward */ /* look backward */
length = MINMATCHLENGTH; { size_t length = MINMATCHLENGTH;
while ((length >= MINMATCHLENGTH) & (start > 0)) { while ((length >= MINMATCHLENGTH) & (start > 0)) {
length = ZDICT_count(b + pos, b + suffix[start - 1]); length = ZDICT_count(b + pos, b + suffix[start - 1]);
if (length >= LLIMIT) length = LLIMIT - 1; if (length >= LLIMIT) length = LLIMIT - 1;
lengthList[length]++; lengthList[length]++;
if (length >= MINMATCHLENGTH) start--; if (length >= MINMATCHLENGTH) start--;
}
} }
/* largest useful length */ /* largest useful length */
@ -345,12 +351,12 @@ static dictItem ZDICT_analyzePos(
/* mark positions done */ /* mark positions done */
{ U32 id; { U32 id;
for (id=start; id<end; id++) { for (id=start; id<end; id++) {
U32 p, pEnd; U32 p, pEnd, length;
U32 const testedPos = suffix[id]; U32 const testedPos = suffix[id];
if (testedPos == pos) if (testedPos == pos)
length = solution.length; length = solution.length;
else { else {
length = ZDICT_count(b+pos, b+testedPos); length = (U32)ZDICT_count(b+pos, b+testedPos);
if (length > solution.length) length = solution.length; if (length > solution.length) length = solution.length;
} }
pEnd = (U32)(testedPos + length); pEnd = (U32)(testedPos + length);
@ -575,29 +581,30 @@ static void ZDICT_fillNoise(void* buffer, size_t length)
typedef struct typedef struct
{ {
ZSTD_CCtx* ref; ZSTD_CCtx* ref; /* contains reference to dictionary */
ZSTD_CCtx* zc; ZSTD_CCtx* zc; /* working context */
void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */
} EStats_ress_t; } EStats_ress_t;
#define MAXREPOFFSET 1024 #define MAXREPOFFSET 1024
static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
const void* src, size_t srcSize, U32 notificationLevel) const void* src, size_t srcSize,
U32 notificationLevel)
{ {
size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog); size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
size_t cSize; size_t cSize;
if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */
{ size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
} }
cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
if (cSize) { /* if == 0; block is not compressible */ if (cSize) { /* if == 0; block is not compressible */
const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc); const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
/* literals stats */ /* literals stats */
{ const BYTE* bytePtr; { const BYTE* bytePtr;
@ -688,6 +695,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
BYTE* dstPtr = (BYTE*)dstBuffer; BYTE* dstPtr = (BYTE*)dstBuffer;
/* init */ /* init */
DEBUGLOG(4, "ZDICT_analyzeEntropy");
esr.ref = ZSTD_createCCtx(); esr.ref = ZSTD_createCCtx();
esr.zc = ZSTD_createCCtx(); esr.zc = ZSTD_createCCtx();
esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
@ -713,7 +721,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
goto _cleanup; goto _cleanup;
} } } }
/* collect stats on all files */ /* collect stats on all samples */
for (u=0; u<nbFiles; u++) { for (u=0; u<nbFiles; u++) {
ZDICT_countEStats(esr, params, ZDICT_countEStats(esr, params,
countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset, countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
@ -726,7 +734,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog); errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
if (HUF_isError(errorCode)) { if (HUF_isError(errorCode)) {
eSize = ERROR(GENERIC); eSize = ERROR(GENERIC);
DISPLAYLEVEL(1, "HUF_buildCTable error \n"); DISPLAYLEVEL(1, " HUF_buildCTable error \n");
goto _cleanup; goto _cleanup;
} }
huffLog = (U32)errorCode; huffLog = (U32)errorCode;
@ -850,6 +858,7 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
U32 const notificationLevel = params.notificationLevel; U32 const notificationLevel = params.notificationLevel;
/* check conditions */ /* check conditions */
DEBUGLOG(4, "ZDICT_finalizeDictionary");
if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall); if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong); if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall); if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
@ -1054,18 +1063,22 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
{ {
ZDICT_cover_params_t params; ZDICT_cover_params_t params;
DEBUGLOG(3, "ZDICT_trainFromBuffer");
memset(&params, 0, sizeof(params)); memset(&params, 0, sizeof(params));
params.d = 8; params.d = 8;
params.steps = 4; params.steps = 4;
/* Default to level 6 since no compression level information is avaialble */ /* Default to level 6 since no compression level information is available */
params.zParams.compressionLevel = 6; params.zParams.compressionLevel = 6;
#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
params.zParams.notificationLevel = ZSTD_DEBUG;
#endif
return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity, return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
samplesBuffer, samplesSizes, samplesBuffer, samplesSizes, nbSamples,
nbSamples, &params); &params);
} }
size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
{ {
ZDICT_params_t params; ZDICT_params_t params;
memset(&params, 0, sizeof(params)); memset(&params, 0, sizeof(params));

View File

@ -38,21 +38,21 @@ extern "C" {
/*! ZDICT_trainFromBuffer(): /*! ZDICT_trainFromBuffer():
* Train a dictionary from an array of samples. * Train a dictionary from an array of samples.
* Uses ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4. * Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`, * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* The resulting dictionary will be saved into `dictBuffer`. * The resulting dictionary will be saved into `dictBuffer`.
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError(). * or an error code, which can be tested with ZDICT_isError().
* Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte. * Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte.
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
* It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
* In general, it's recommended to provide a few thousands samples, but this can vary a lot. * In general, it's recommended to provide a few thousands samples, though this can vary a lot.
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary. * It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
*/ */
ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
/*====== Helper functions ======*/ /*====== Helper functions ======*/
@ -72,14 +72,14 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
* ==================================================================================== */ * ==================================================================================== */
typedef struct { typedef struct {
int compressionLevel; /* 0 means default; target a specific zstd compression level */ int compressionLevel; /* optimize for a specific zstd compression level; 0 means default */
unsigned notificationLevel; /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ unsigned notificationLevel; /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
unsigned dictID; /* 0 means auto mode (32-bits random value); other : force dictID value */ unsigned dictID; /* force dictID value; 0 means auto mode (32-bits random value) */
} ZDICT_params_t; } ZDICT_params_t;
/*! ZDICT_cover_params_t: /*! ZDICT_cover_params_t:
* For all values 0 means default.
* k and d are the only required parameters. * k and d are the only required parameters.
* For others, value 0 means default.
*/ */
typedef struct { typedef struct {
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
@ -91,28 +91,28 @@ typedef struct {
/*! ZDICT_trainFromBuffer_cover(): /*! ZDICT_trainFromBuffer_cover():
* Train a dictionary from an array of samples using the COVER algorithm. * Train a dictionary from an array of samples using the COVER algorithm.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`, * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* The resulting dictionary will be saved into `dictBuffer`. * The resulting dictionary will be saved into `dictBuffer`.
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError(). * or an error code, which can be tested with ZDICT_isError().
* Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte. * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
* It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
* In general, it's recommended to provide a few thousands samples, but this can vary a lot. * In general, it's recommended to provide a few thousands samples, though this can vary a lot.
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary. * It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
*/ */
ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, void *dictBuffer, size_t dictBufferCapacity,
const size_t *samplesSizes, unsigned nbSamples, const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
ZDICT_cover_params_t parameters); ZDICT_cover_params_t parameters);
/*! ZDICT_optimizeTrainFromBuffer_cover(): /*! ZDICT_optimizeTrainFromBuffer_cover():
* The same requirements as above hold for all the parameters except `parameters`. * The same requirements as above hold for all the parameters except `parameters`.
* This function tries many parameter combinations and picks the best parameters. * This function tries many parameter combinations and picks the best parameters.
* `*parameters` is filled with the best parameters found, and the dictionary * `*parameters` is filled with the best parameters found,
* constructed with those parameters is stored in `dictBuffer`. * dictionary constructed with those parameters is stored in `dictBuffer`.
* *
* All of the parameters d, k, steps are optional. * All of the parameters d, k, steps are optional.
* If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}. * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
@ -125,9 +125,9 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
* Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread. * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
*/ */
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, void* dictBuffer, size_t dictBufferCapacity,
const size_t *samplesSizes, unsigned nbSamples, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_cover_params_t *parameters); ZDICT_cover_params_t* parameters);
/*! ZDICT_finalizeDictionary(): /*! ZDICT_finalizeDictionary():
* Given a custom content as a basis for dictionary, and a set of samples, * Given a custom content as a basis for dictionary, and a set of samples,
@ -157,22 +157,23 @@ typedef struct {
} ZDICT_legacy_params_t; } ZDICT_legacy_params_t;
/*! ZDICT_trainFromBuffer_legacy(): /*! ZDICT_trainFromBuffer_legacy():
* Train a dictionary from an array of samples. * Train a dictionary from an array of samples.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`, * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* The resulting dictionary will be saved into `dictBuffer`. * The resulting dictionary will be saved into `dictBuffer`.
* `parameters` is optional and can be provided with values set to 0 to mean "default". * `parameters` is optional and can be provided with values set to 0 to mean "default".
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError(). * or an error code, which can be tested with ZDICT_isError().
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
* It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
* In general, it's recommended to provide a few thousands samples, but this can vary a lot. * In general, it's recommended to provide a few thousands samples, though this can vary a lot.
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary. * It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
* Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0. * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
*/ */
ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy( ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, void *dictBuffer, size_t dictBufferCapacity,
const size_t *samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t parameters); const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
ZDICT_legacy_params_t parameters);
/* Deprecation warnings */ /* Deprecation warnings */
/* It is generally possible to disable deprecation warnings from compiler, /* It is generally possible to disable deprecation warnings from compiler,

View File

@ -673,6 +673,15 @@ static int basicUnitTests(U32 seed, double compressibility)
goto _output_error; goto _output_error;
} }
DISPLAYLEVEL(4, "test%3i : dictBuilder on cyclic data : ", testNb++);
assert(compressedBufferSize >= totalSampleSize);
{ U32 u; for (u=0; u<totalSampleSize; u++) ((BYTE*)compressedBuffer)[u] = (BYTE)u; }
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
dictSize = ZDICT_trainFromBuffer(dictBuffer, dictSize,
compressedBuffer, samplesSizes, nbSamples);
if (ZDICT_isError(dictSize)) goto _output_error;
DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)dictSize);
DISPLAYLEVEL(4, "test%3i : dictBuilder : ", testNb++); DISPLAYLEVEL(4, "test%3i : dictBuilder : ", testNb++);
{ U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; } { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
dictSize = ZDICT_trainFromBuffer(dictBuffer, dictSize, dictSize = ZDICT_trainFromBuffer(dictBuffer, dictSize,