Merge pull request #2616 from terrelln/deterministic-dict

[lib] Add ZSTD_c_deterministicRefPrefix
Branch: dev
Nick Terrell 2021-05-06 11:09:22 -07:00 committed by GitHub
commit 207e33bb61
7 changed files with 109 additions and 9 deletions

View File

@@ -559,6 +559,11 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
bounds.upperBound = (int)ZSTD_urm_enableRowMatchFinder;
return bounds;
+case ZSTD_c_deterministicRefPrefix:
+bounds.lowerBound = 0;
+bounds.upperBound = 1;
+return bounds;
default:
bounds.error = ERROR(parameter_unsupported);
return bounds;
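
The new parameter advertises a [0, 1] range via ZSTD_cParam_getBounds(). A minimal caller-side sketch (illustrative only, the helper name is invented here) that queries those bounds before applying a value:

    #define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_c_deterministicRefPrefix is experimental */
    #include <zstd.h>

    /* Hypothetical helper: clamp a requested value into the advertised bounds,
     * then apply it to the compression context. */
    static size_t set_param_clamped(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
    {
        ZSTD_bounds const bounds = ZSTD_cParam_getBounds(param);
        if (ZSTD_isError(bounds.error)) return bounds.error;
        if (value < bounds.lowerBound) value = bounds.lowerBound;
        if (value > bounds.upperBound) value = bounds.upperBound;
        return ZSTD_CCtx_setParameter(cctx, param, value);
    }
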
@@ -622,6 +627,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
case ZSTD_c_validateSequences:
case ZSTD_c_splitBlocks:
case ZSTD_c_useRowMatchFinder:
+case ZSTD_c_deterministicRefPrefix:
default:
return 0;
}
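
Adding the case to this switch, where it falls through to the default `return 0`, keeps the parameter non-update-authorized: it has to be chosen before streaming starts. An illustrative sketch of that constraint (function name invented, error handling elided):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Sketch: flipping the flag after the first ZSTD_compressStream2() call fails,
     * because ZSTD_isUpdateAuthorized() returns 0 for this parameter. */
    static void set_before_streaming_only(ZSTD_CCtx* cctx, ZSTD_outBuffer* out, ZSTD_inBuffer* in)
    {
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_deterministicRefPrefix, 1);  /* ok: stream not started */
        ZSTD_compressStream2(cctx, out, in, ZSTD_e_continue);            /* stream is now active */
        {   size_t const err = ZSTD_CCtx_setParameter(cctx, ZSTD_c_deterministicRefPrefix, 0);
            (void)err;  /* expected to be an error code (stage_wrong) */
        }
    }
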
@@ -676,6 +682,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
case ZSTD_c_validateSequences:
case ZSTD_c_splitBlocks:
case ZSTD_c_useRowMatchFinder:
+case ZSTD_c_deterministicRefPrefix:
break;
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
@@ -897,6 +904,11 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
CCtxParams->useRowMatchFinder = (ZSTD_useRowMatchFinderMode_e)value;
return CCtxParams->useRowMatchFinder;
+case ZSTD_c_deterministicRefPrefix:
+BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value);
+CCtxParams->deterministicRefPrefix = !!value;
+return CCtxParams->deterministicRefPrefix;
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
}
}
@@ -1026,6 +1038,9 @@ size_t ZSTD_CCtxParams_getParameter(
case ZSTD_c_useRowMatchFinder :
*value = (int)CCtxParams->useRowMatchFinder;
break;
+case ZSTD_c_deterministicRefPrefix:
+*value = (int)CCtxParams->deterministicRefPrefix;
+break;
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
}
return 0;
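
The two hunks above wire the flag through the advanced CCtx_params container. A hedged round-trip sketch (illustrative only; requires ZSTD_STATIC_LINKING_ONLY):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <assert.h>
    #include <zstd.h>

    static void roundtrip_deterministic_flag(void)
    {
        ZSTD_CCtx_params* const params = ZSTD_createCCtxParams();
        int value = -1;
        assert(!ZSTD_isError(ZSTD_CCtxParams_setParameter(params, ZSTD_c_deterministicRefPrefix, 1)));
        assert(!ZSTD_isError(ZSTD_CCtxParams_getParameter(params, ZSTD_c_deterministicRefPrefix, &value)));
        assert(value == 1);  /* stored as 0/1 via the !!value normalization above */
        ZSTD_freeCCtxParams(params);
    }
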
@@ -4052,11 +4067,12 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
if (!srcSize) return fhSize; /* do not generate an empty block if no input */
-if (!ZSTD_window_update(&ms->window, src, srcSize)) {
+if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) {
+ms->forceNonContiguous = 0;
ms->nextToUpdate = ms->window.dictLimit;
}
if (cctx->appliedParams.ldmParams.enableLdm) {
-ZSTD_window_update(&cctx->ldmState.window, src, srcSize);
+ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0);
}
if (!frame) {
@@ -4148,11 +4164,12 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
}
DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
-ZSTD_window_update(&ms->window, src, srcSize);
+ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0);
ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
+ms->forceNonContiguous = params->deterministicRefPrefix;
if (loadLdmDict) {
-ZSTD_window_update(&ls->window, src, srcSize);
+ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0);
ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
}
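
Together with the ZSTD_compressContinue_internal hunk above, this forms a one-shot latch: loading a dictionary or prefix arms ms->forceNonContiguous when deterministicRefPrefix is set, and the first window update of the following compression consumes it. A simplified stand-alone model of that latch (names are illustrative, not the library's internals):

    /* Simplified model of the one-shot "force non-contiguous" latch. */
    typedef struct { unsigned forceNonContiguous; } MatchStateModel;

    static void load_prefix(MatchStateModel* ms, int deterministicRefPrefix)
    {
        ms->forceNonContiguous = (unsigned)deterministicRefPrefix;   /* armed at dict/prefix load */
    }

    static int next_update_is_contiguous(MatchStateModel* ms, int srcIsAdjacent)
    {
        int const contiguous = srcIsAdjacent && !ms->forceNonContiguous;
        if (!contiguous) ms->forceNonContiguous = 0;                 /* consumed on the non-contiguous path */
        return contiguous;
    }
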

View File

@@ -219,6 +219,8 @@ struct ZSTD_matchState_t {
U32* hashTable3;
U32* chainTable;
+U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */
int dedicatedDictSearch; /* Indicates whether this matchState is using the
* dedicated dictionary search structure.
*/
@@ -317,6 +319,9 @@ struct ZSTD_CCtx_params_s {
/* Param for deciding whether to use row-based matchfinder */
ZSTD_useRowMatchFinderMode_e useRowMatchFinder;
+/* Always load a dictionary in ext-dict mode (not prefix mode)? */
+int deterministicRefPrefix;
/* Internal use, for createCCtxParams() and freeCCtxParams() only */
ZSTD_customMem customMem;
}; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
@@ -1139,7 +1144,8 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {
* Returns non-zero if the segment is contiguous.
*/
MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
-void const* src, size_t srcSize)
+void const* src, size_t srcSize,
+int forceNonContiguous)
{
BYTE const* const ip = (BYTE const*)src;
U32 contiguous = 1;
@@ -1149,7 +1155,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
assert(window->base != NULL);
assert(window->dictBase != NULL);
/* Check if blocks follow each other */
-if (src != window->nextSrc) {
+if (src != window->nextSrc || forceNonContiguous) {
/* not contiguous */
size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);
DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit);
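
The contiguity decision itself now reduces to the predicate below (a trimmed-down restatement of the changed condition, not the full window bookkeeping):

    /* A segment is treated as contiguous only when it starts exactly where the
     * previous one ended AND the caller did not force a non-contiguous load. */
    static int segment_is_contiguous(const void* src, const void* expectedNextSrc,
                                     int forceNonContiguous)
    {
        return (src == expectedNextSrc) && !forceNonContiguous;
    }
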

View File

@@ -512,7 +512,7 @@ ZSTDMT_serialState_reset(serialState_t* serialState,
if (dictSize > 0) {
if (dictContentType == ZSTD_dct_rawContent) {
BYTE const* const dictEnd = (const BYTE*)dict + dictSize;
-ZSTD_window_update(&serialState->ldmState.window, dict, dictSize);
+ZSTD_window_update(&serialState->ldmState.window, dict, dictSize, /* forceNonContiguous */ 0);
ZSTD_ldm_fillHashTable(&serialState->ldmState, (const BYTE*)dict, dictEnd, &params.ldmParams);
serialState->ldmState.loadedDictEnd = params.forceWindow ? 0 : (U32)(dictEnd - serialState->ldmState.window.base);
} else {
@@ -569,7 +569,7 @@ static void ZSTDMT_serialState_update(serialState_t* serialState,
assert(seqStore.seq != NULL && seqStore.pos == 0 &&
seqStore.size == 0 && seqStore.capacity > 0);
assert(src.size <= serialState->params.jobSize);
-ZSTD_window_update(&serialState->ldmState.window, src.start, src.size);
+ZSTD_window_update(&serialState->ldmState.window, src.start, src.size, /* forceNonContiguous */ 0);
error = ZSTD_ldm_generateSequences(
&serialState->ldmState, &seqStore,
&serialState->params.ldmParams, src.start, src.size);
@@ -695,6 +695,10 @@ static void ZSTDMT_compressionJob(void* jobDescription)
{ size_t const forceWindowError = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_forceMaxWindow, !job->firstJob);
if (ZSTD_isError(forceWindowError)) JOB_ERROR(forceWindowError);
}
+if (!job->firstJob) {
+size_t const err = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_deterministicRefPrefix, 0);
+if (ZSTD_isError(err)) JOB_ERROR(err);
+}
{ size_t const initError = ZSTD_compressBegin_advanced_internal(cctx,
job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */
ZSTD_dtlm_fast,
@@ -750,6 +754,12 @@ static void ZSTDMT_compressionJob(void* jobDescription)
if (ZSTD_isError(cSize)) JOB_ERROR(cSize);
lastCBlockSize = cSize;
} }
+if (!job->firstJob) {
+/* Double check that we don't have an ext-dict, because then our
+* repcode invalidation doesn't work.
+*/
+assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
+}
ZSTD_CCtx_trace(cctx, 0);
_endJob:
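
Worker jobs other than the first clear the flag because, in the multi-threaded path, a job's prefix is by construction the bytes immediately preceding its source slice in one shared buffer, so the contiguous (prefix-mode) load is always taken and repcode invalidation stays valid. Forcing a non-contiguous load there would create exactly the ext-dict case the assert rules out. A hypothetical illustration of that layout (names invented for this sketch):

    #include <assert.h>
    #include <stddef.h>

    /* Hypothetical model of an MT worker job: src is a slice of one shared input
     * buffer and the prefix is the bytes immediately before that slice. */
    typedef struct { const char* start; size_t size; } range_t;

    static void check_job_layout(const char* sharedInput, size_t jobStart,
                                 size_t prefixSize, size_t jobSize)
    {
        range_t const prefix = { sharedInput + jobStart - prefixSize, prefixSize };
        range_t const src    = { sharedInput + jobStart, jobSize };
        assert(prefix.start + prefix.size == src.start);  /* prefix end == source start */
    }
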

View File

@@ -449,7 +449,8 @@ typedef enum {
ZSTD_c_experimentalParam11=1008,
ZSTD_c_experimentalParam12=1009,
ZSTD_c_experimentalParam13=1010,
-ZSTD_c_experimentalParam14=1011
+ZSTD_c_experimentalParam14=1011,
+ZSTD_c_experimentalParam15=1012
} ZSTD_cParameter;
typedef struct {
@@ -1859,6 +1860,26 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
*/
#define ZSTD_c_useRowMatchFinder ZSTD_c_experimentalParam14
+/* ZSTD_c_deterministicRefPrefix
+* Default is 0 == disabled. Set to 1 to enable.
+*
+* Zstd produces different results for prefix compression when the prefix is
+* directly adjacent to the data about to be compressed vs. when it isn't.
+* This is because zstd detects that the two buffers are contiguous and it can
+* use a more efficient match finding algorithm. However, this produces different
+* results than when the two buffers are non-contiguous. This flag forces zstd
+* to always load the prefix in non-contiguous mode, even if it happens to be
+* adjacent to the data, to guarantee determinism.
+*
+* If you really care about determinism when using a dictionary or prefix,
+* like when doing delta compression, you should select this option. It comes
+* at a speed penalty of about ~2.5% if the dictionary and data happened to be
+* contiguous, and is free if they weren't contiguous. We don't expect that
+* intentionally making the dictionary and data contiguous will be worth the
+* cost to memcpy() the data.
+*/
+#define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15
/*! ZSTD_CCtx_getParameter() :
* Get the requested compression parameter value, selected by enum ZSTD_cParameter,
* and store it into int* value.
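
A usage sketch for the documentation above, in the delta-compression setting it mentions (illustrative code; assumes ZSTD_STATIC_LINKING_ONLY since the parameter is still experimental):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Compress `src` against a reference `base`; with the flag set, the output no
     * longer depends on whether `base` happens to sit directly before `src` in memory. */
    static size_t delta_compress(void* dst, size_t dstCapacity,
                                 const void* src, size_t srcSize,
                                 const void* base, size_t baseSize)
    {
        ZSTD_CCtx* const cctx = ZSTD_createCCtx();
        size_t cSize;
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_deterministicRefPrefix, 1);
        ZSTD_CCtx_refPrefix(cctx, base, baseSize);   /* consumed by the next compression only */
        cSize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
        ZSTD_freeCCtx(cctx);
        return cSize;   /* caller should check with ZSTD_isError() */
    }
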

View File

@@ -97,6 +97,7 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, FUZZ_dataProducer
setRand(cctx, ZSTD_c_literalCompressionMode, 0, 2, producer);
setRand(cctx, ZSTD_c_forceAttachDict, 0, 2, producer);
setRand(cctx, ZSTD_c_splitBlocks, 0, 1, producer);
+setRand(cctx, ZSTD_c_deterministicRefPrefix, 0, 1, producer);
if (FUZZ_dataProducer_uint32Range(producer, 0, 1) == 0) {
setRand(cctx, ZSTD_c_srcSizeHint, ZSTD_SRCSIZEHINT_MIN, 2 * srcSize, producer);
}

View File

@@ -766,6 +766,50 @@ static int basicUnitTests(U32 const seed, double compressibility)
}
DISPLAYLEVEL(3, "OK \n");
DISPLAYLEVEL(3, "test%3i : testing dict compression for determinism : ", testNb++);
{
size_t const testSize = 1024;
ZSTD_CCtx* const cctx = ZSTD_createCCtx();
ZSTD_DCtx* const dctx = ZSTD_createDCtx();
char* dict = (char*)malloc(2 * testSize);
int ldmEnabled, level;
RDG_genBuffer(dict, testSize, 0.5, 0.5, seed);
RDG_genBuffer(CNBuffer, testSize, 0.6, 0.6, seed);
memcpy(dict + testSize, CNBuffer, testSize);
for (level = 1; level <= 5; ++level) {
for (ldmEnabled = 0; ldmEnabled <= 1; ++ldmEnabled) {
size_t cSize0;
XXH64_hash_t compressedChecksum0;
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1));
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level));
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, ldmEnabled));
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_deterministicRefPrefix, 1));
CHECK_Z(ZSTD_CCtx_refPrefix(cctx, dict, testSize));
cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, testSize);
CHECK_Z(cSize);
CHECK_Z(ZSTD_decompress_usingDict(dctx, decodedBuffer, testSize, compressedBuffer, cSize, dict, testSize));
cSize0 = cSize;
compressedChecksum0 = XXH64(compressedBuffer, cSize, 0);
CHECK_Z(ZSTD_CCtx_refPrefix(cctx, dict, testSize));
cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, dict + testSize, testSize);
CHECK_Z(cSize);
if (cSize != cSize0) goto _output_error;
if (XXH64(compressedBuffer, cSize, 0) != compressedChecksum0) goto _output_error;
}
}
ZSTD_freeCCtx(cctx);
ZSTD_freeDCtx(dctx);
free(dict);
}
DISPLAYLEVEL(3, "OK \n");
DISPLAYLEVEL(3, "test%3i : LDM + opt parser with small uncompressible block ", testNb++);
{ ZSTD_CCtx* cctx = ZSTD_createCCtx();
ZSTD_DCtx* dctx = ZSTD_createDCtx();

View File

@@ -2215,6 +2215,7 @@ static int fuzzerTests_newAPI(U32 seed, int nbTests, int startTest,
}
if (FUZ_rand(&lseed) & 1) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_forceMaxWindow, FUZ_rand(&lseed) & 1, opaqueAPI) );
+if (FUZ_rand(&lseed) & 1) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_deterministicRefPrefix, FUZ_rand(&lseed) & 1, opaqueAPI) );
/* Apply parameters */
if (opaqueAPI) {