[lib] Add ZSTD_c_deterministicRefPrefix
This flag forces zstd to always load the prefix in ext-dict mode, even if the prefix happens to be contiguous with the data being compressed, so that the compressed output is deterministic. It also applies to dictionaries that are re-processed. A determinism test case is also added; it fails without `ZSTD_c_deterministicRefPrefix` and passes with it set. Open question: should this be the default behavior? It isn't in this PR.
This commit is contained in:
parent
2d10544b84
commit
172b4b6ac4
@ -559,6 +559,11 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
|
||||
bounds.upperBound = (int)ZSTD_urm_enableRowMatchFinder;
|
||||
return bounds;
|
||||
|
||||
case ZSTD_c_deterministicRefPrefix:
|
||||
bounds.lowerBound = 0;
|
||||
bounds.upperBound = 1;
|
||||
return bounds;
|
||||
|
||||
default:
|
||||
bounds.error = ERROR(parameter_unsupported);
|
||||
return bounds;
|
||||
@ -622,6 +627,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
|
||||
case ZSTD_c_validateSequences:
|
||||
case ZSTD_c_splitBlocks:
|
||||
case ZSTD_c_useRowMatchFinder:
|
||||
case ZSTD_c_deterministicRefPrefix:
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
@ -676,6 +682,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
|
||||
case ZSTD_c_validateSequences:
|
||||
case ZSTD_c_splitBlocks:
|
||||
case ZSTD_c_useRowMatchFinder:
|
||||
case ZSTD_c_deterministicRefPrefix:
|
||||
break;
|
||||
|
||||
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
|
||||
@ -897,6 +904,11 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
|
||||
CCtxParams->useRowMatchFinder = (ZSTD_useRowMatchFinderMode_e)value;
|
||||
return CCtxParams->useRowMatchFinder;
|
||||
|
||||
case ZSTD_c_deterministicRefPrefix:
|
||||
BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value);
|
||||
CCtxParams->deterministicRefPrefix = !!value;
|
||||
return CCtxParams->deterministicRefPrefix;
|
||||
|
||||
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
|
||||
}
|
||||
}
|
||||
@ -1026,6 +1038,9 @@ size_t ZSTD_CCtxParams_getParameter(
|
||||
case ZSTD_c_useRowMatchFinder :
|
||||
*value = (int)CCtxParams->useRowMatchFinder;
|
||||
break;
|
||||
case ZSTD_c_deterministicRefPrefix:
|
||||
*value = (int)CCtxParams->deterministicRefPrefix;
|
||||
break;
|
||||
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
|
||||
}
|
||||
return 0;
|
||||
@ -4045,11 +4060,12 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
|
||||
|
||||
if (!srcSize) return fhSize; /* do not generate an empty block if no input */
|
||||
|
||||
if (!ZSTD_window_update(&ms->window, src, srcSize)) {
|
||||
if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) {
|
||||
ms->forceNonContiguous = 0;
|
||||
ms->nextToUpdate = ms->window.dictLimit;
|
||||
}
|
||||
if (cctx->appliedParams.ldmParams.enableLdm) {
|
||||
ZSTD_window_update(&cctx->ldmState.window, src, srcSize);
|
||||
ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0);
|
||||
}
|
||||
|
||||
if (!frame) {
|
||||
@ -4141,11 +4157,12 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
|
||||
}
|
||||
|
||||
DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
|
||||
ZSTD_window_update(&ms->window, src, srcSize);
|
||||
ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0);
|
||||
ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
|
||||
ms->forceNonContiguous = params->deterministicRefPrefix;
|
||||
|
||||
if (loadLdmDict) {
|
||||
ZSTD_window_update(&ls->window, src, srcSize);
|
||||
ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0);
|
||||
ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
|
||||
}
|
||||
|
||||
|
@ -219,6 +219,8 @@ struct ZSTD_matchState_t {
|
||||
U32* hashTable3;
|
||||
U32* chainTable;
|
||||
|
||||
U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */
|
||||
|
||||
int dedicatedDictSearch; /* Indicates whether this matchState is using the
|
||||
* dedicated dictionary search structure.
|
||||
*/
|
||||
@ -317,6 +319,9 @@ struct ZSTD_CCtx_params_s {
|
||||
/* Param for deciding whether to use row-based matchfinder */
|
||||
ZSTD_useRowMatchFinderMode_e useRowMatchFinder;
|
||||
|
||||
/* Always load a dictionary in ext-dict mode (not prefix mode)? */
|
||||
int deterministicRefPrefix;
|
||||
|
||||
/* Internal use, for createCCtxParams() and freeCCtxParams() only */
|
||||
ZSTD_customMem customMem;
|
||||
}; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
|
||||
@ -1138,7 +1143,8 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {
|
||||
* Returns non-zero if the segment is contiguous.
|
||||
*/
|
||||
MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
|
||||
void const* src, size_t srcSize)
|
||||
void const* src, size_t srcSize,
|
||||
int forceNonContiguous)
|
||||
{
|
||||
BYTE const* const ip = (BYTE const*)src;
|
||||
U32 contiguous = 1;
|
||||
@ -1148,7 +1154,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
|
||||
assert(window->base != NULL);
|
||||
assert(window->dictBase != NULL);
|
||||
/* Check if blocks follow each other */
|
||||
if (src != window->nextSrc) {
|
||||
if (src != window->nextSrc || forceNonContiguous) {
|
||||
/* not contiguous */
|
||||
size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);
|
||||
DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit);
|
||||
|
@ -512,7 +512,7 @@ ZSTDMT_serialState_reset(serialState_t* serialState,
|
||||
if (dictSize > 0) {
|
||||
if (dictContentType == ZSTD_dct_rawContent) {
|
||||
BYTE const* const dictEnd = (const BYTE*)dict + dictSize;
|
||||
ZSTD_window_update(&serialState->ldmState.window, dict, dictSize);
|
||||
ZSTD_window_update(&serialState->ldmState.window, dict, dictSize, /* forceNonContiguous */ 0);
|
||||
ZSTD_ldm_fillHashTable(&serialState->ldmState, (const BYTE*)dict, dictEnd, &params.ldmParams);
|
||||
serialState->ldmState.loadedDictEnd = params.forceWindow ? 0 : (U32)(dictEnd - serialState->ldmState.window.base);
|
||||
} else {
|
||||
@ -569,7 +569,7 @@ static void ZSTDMT_serialState_update(serialState_t* serialState,
|
||||
assert(seqStore.seq != NULL && seqStore.pos == 0 &&
|
||||
seqStore.size == 0 && seqStore.capacity > 0);
|
||||
assert(src.size <= serialState->params.jobSize);
|
||||
ZSTD_window_update(&serialState->ldmState.window, src.start, src.size);
|
||||
ZSTD_window_update(&serialState->ldmState.window, src.start, src.size, /* forceNonContiguous */ 0);
|
||||
error = ZSTD_ldm_generateSequences(
|
||||
&serialState->ldmState, &seqStore,
|
||||
&serialState->params.ldmParams, src.start, src.size);
|
||||
@ -695,6 +695,10 @@ static void ZSTDMT_compressionJob(void* jobDescription)
|
||||
{ size_t const forceWindowError = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_forceMaxWindow, !job->firstJob);
|
||||
if (ZSTD_isError(forceWindowError)) JOB_ERROR(forceWindowError);
|
||||
}
|
||||
if (!job->firstJob) {
|
||||
size_t const err = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_deterministicRefPrefix, 0);
|
||||
if (ZSTD_isError(err)) JOB_ERROR(err);
|
||||
}
|
||||
{ size_t const initError = ZSTD_compressBegin_advanced_internal(cctx,
|
||||
job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */
|
||||
ZSTD_dtlm_fast,
|
||||
@ -750,6 +754,12 @@ static void ZSTDMT_compressionJob(void* jobDescription)
|
||||
if (ZSTD_isError(cSize)) JOB_ERROR(cSize);
|
||||
lastCBlockSize = cSize;
|
||||
} }
|
||||
if (!job->firstJob) {
|
||||
/* Double check that we don't have an ext-dict, because then our
|
||||
* repcode invalidation doesn't work.
|
||||
*/
|
||||
assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
|
||||
}
|
||||
ZSTD_CCtx_trace(cctx, 0);
|
||||
|
||||
_endJob:
|
||||
|
23
lib/zstd.h
23
lib/zstd.h
@ -449,7 +449,8 @@ typedef enum {
|
||||
ZSTD_c_experimentalParam11=1008,
|
||||
ZSTD_c_experimentalParam12=1009,
|
||||
ZSTD_c_experimentalParam13=1010,
|
||||
ZSTD_c_experimentalParam14=1011
|
||||
ZSTD_c_experimentalParam14=1011,
|
||||
ZSTD_c_experimentalParam15=1012
|
||||
} ZSTD_cParameter;
|
||||
|
||||
typedef struct {
|
||||
@ -1859,6 +1860,26 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
|
||||
*/
|
||||
#define ZSTD_c_useRowMatchFinder ZSTD_c_experimentalParam14
|
||||
|
||||
/* ZSTD_c_deterministicRefPrefix
|
||||
* Default is 0 == disabled. Set to 1 to enable.
|
||||
*
|
||||
* Zstd produces different results for prefix compression when the prefix is
|
||||
* directly adjacent to the data about to be compressed vs. when it isn't.
|
||||
* This is because zstd detects that the two buffers are contiguous and it can
|
||||
* use a more efficient match finding algorithm. However, this produces different
|
||||
* results than when the two buffers are non-contiguous. This flag forces zstd
|
||||
* to always load the prefix in non-contiguous mode, even if it happens to be
|
||||
* adjacent to the data, to guarantee determinism.
|
||||
*
|
||||
* If you really care about determinism when using a dictionary or prefix,
|
||||
* like when doing delta compression, you should select this option. It comes
|
||||
* at a speed penalty of about 2.5% if the dictionary and data happened to be
|
||||
* contiguous, and is free if they weren't contiguous. We don't expect that
|
||||
* intentionally making the dictionary and data contiguous will be worth the
|
||||
* cost to memcpy() the data.
|
||||
*/
|
||||
#define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15
|
||||
|
||||
/*! ZSTD_CCtx_getParameter() :
|
||||
* Get the requested compression parameter value, selected by enum ZSTD_cParameter,
|
||||
* and store it into int* value.
|
||||
|
@ -97,6 +97,7 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, FUZZ_dataProducer
|
||||
setRand(cctx, ZSTD_c_literalCompressionMode, 0, 2, producer);
|
||||
setRand(cctx, ZSTD_c_forceAttachDict, 0, 2, producer);
|
||||
setRand(cctx, ZSTD_c_splitBlocks, 0, 1, producer);
|
||||
setRand(cctx, ZSTD_c_deterministicRefPrefix, 0, 1, producer);
|
||||
if (FUZZ_dataProducer_uint32Range(producer, 0, 1) == 0) {
|
||||
setRand(cctx, ZSTD_c_srcSizeHint, ZSTD_SRCSIZEHINT_MIN, 2 * srcSize, producer);
|
||||
}
|
||||
|
@ -766,6 +766,50 @@ static int basicUnitTests(U32 const seed, double compressibility)
|
||||
}
|
||||
DISPLAYLEVEL(3, "OK \n");
|
||||
|
||||
DISPLAYLEVEL(3, "test%3i : testing dict compression for determinism : ", testNb++);
|
||||
{
|
||||
size_t const testSize = 1024;
|
||||
ZSTD_CCtx* const cctx = ZSTD_createCCtx();
|
||||
ZSTD_DCtx* const dctx = ZSTD_createDCtx();
|
||||
char* dict = (char*)malloc(2 * testSize);
|
||||
int ldmEnabled, level;
|
||||
|
||||
RDG_genBuffer(dict, testSize, 0.5, 0.5, seed);
|
||||
RDG_genBuffer(CNBuffer, testSize, 0.6, 0.6, seed);
|
||||
memcpy(dict + testSize, CNBuffer, testSize);
|
||||
for (level = 1; level <= 5; ++level) {
|
||||
for (ldmEnabled = 0; ldmEnabled <= 1; ++ldmEnabled) {
|
||||
size_t cSize0;
|
||||
XXH64_hash_t compressedChecksum0;
|
||||
|
||||
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1));
|
||||
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level));
|
||||
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, ldmEnabled));
|
||||
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_deterministicRefPrefix, 1));
|
||||
|
||||
CHECK_Z(ZSTD_CCtx_refPrefix(cctx, dict, testSize));
|
||||
cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, testSize);
|
||||
CHECK_Z(cSize);
|
||||
CHECK_Z(ZSTD_decompress_usingDict(dctx, decodedBuffer, testSize, compressedBuffer, cSize, dict, testSize));
|
||||
|
||||
cSize0 = cSize;
|
||||
compressedChecksum0 = XXH64(compressedBuffer, cSize, 0);
|
||||
|
||||
CHECK_Z(ZSTD_CCtx_refPrefix(cctx, dict, testSize));
|
||||
cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, dict + testSize, testSize);
|
||||
CHECK_Z(cSize);
|
||||
|
||||
if (cSize != cSize0) goto _output_error;
|
||||
if (XXH64(compressedBuffer, cSize, 0) != compressedChecksum0) goto _output_error;
|
||||
}
|
||||
}
|
||||
|
||||
ZSTD_freeCCtx(cctx);
|
||||
ZSTD_freeDCtx(dctx);
|
||||
free(dict);
|
||||
}
|
||||
DISPLAYLEVEL(3, "OK \n");
|
||||
|
||||
DISPLAYLEVEL(3, "test%3i : LDM + opt parser with small uncompressible block ", testNb++);
|
||||
{ ZSTD_CCtx* cctx = ZSTD_createCCtx();
|
||||
ZSTD_DCtx* dctx = ZSTD_createDCtx();
|
||||
|
@ -2215,6 +2215,7 @@ static int fuzzerTests_newAPI(U32 seed, int nbTests, int startTest,
|
||||
}
|
||||
|
||||
if (FUZ_rand(&lseed) & 1) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_forceMaxWindow, FUZ_rand(&lseed) & 1, opaqueAPI) );
|
||||
if (FUZ_rand(&lseed) & 1) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_deterministicRefPrefix, FUZ_rand(&lseed) & 1, opaqueAPI) );
|
||||
|
||||
/* Apply parameters */
|
||||
if (opaqueAPI) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user