Adding --long support for --patch-from (#1959)

* adding long support for patch-from

* adding refPrefix to dictionary_decompress

* adding refPrefix to dictionary_loader

* conversion nit

* triggering log mode on chainLog < fileLog and removing old threshold

* adding refPrefix to dictionary_round_trip

* adding docs

* adding enableldm + forceWindow test for dict

* separate patch-from logic into FIO_adjustParamsForPatchFromMode

* moving memLimit adjustment to outside ifdefs (need for decomp)

* removing refPrefix gate on dictionary_round_trip

* rebase on top of dev refPrefix change

* making sure refPrefx + ldm is < 1% of srcSize

* combining notes for patch-from

* moving memlimit logic inside fileio.c

* adding display for optimal parser and long mode trigger

* conversion nit

* fuzzer found heap-overflow fix

* another conversion nit

* moving FIO_adjustMemLimitForPatchFromMode outside ifndef

* making params immutable

* moving memLimit update before createDictBuffer call

* making maxSrcSize unsigned long long

* making dictSize and maxSrcSize params unsigned long long

* error on files larger than 4gb

* extend refPrefix test to include round trip

* conversion to size_t

* making sure ldm is at least 10x better

* removing break

* including zstd_compress_internal and removing redundant macros

* exposing ZSTD_cycleLog()

* using cycleLog instead of chainLog

* add some more docs about user optimizations

* formatting
dev
Bimba Shrestha 2020-04-17 15:58:53 -05:00 committed by GitHub
parent 88ecdc939b
commit 5b0a452cac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 318 additions and 96 deletions

View File

@ -1011,7 +1011,7 @@ ZSTD_clampCParams(ZSTD_compressionParameters cParams)
/** ZSTD_cycleLog() :
* condition for correct operation : hashLog > 1 */
static U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
{
U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2);
return hashLog - btScale;
@ -2784,6 +2784,7 @@ size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const
* @return : 0, or an error code
*/
static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
ldmState_t* ls,
ZSTD_cwksp* ws,
ZSTD_CCtx_params const* params,
const void* src, size_t srcSize,
@ -2795,6 +2796,11 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
ZSTD_window_update(&ms->window, src, srcSize);
ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
if (params->ldmParams.enableLdm && ls != NULL) {
ZSTD_window_update(&ls->window, src, srcSize);
ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
}
/* Assert that we the ms params match the params we're being given */
ZSTD_assertEqualCParams(params->cParams, ms->cParams);
@ -2807,6 +2813,9 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk);
if (params->ldmParams.enableLdm && ls != NULL && srcSize >= params->ldmParams.minMatchLength)
ZSTD_ldm_fillHashTable(ls, (const BYTE*)src, (const BYTE*)src + srcSize, &params->ldmParams);
switch(params->cParams.strategy)
{
case ZSTD_fast:
@ -2985,7 +2994,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid;
bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid;
FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
ms, ws, params, dictPtr, dictContentSize, dtlm));
ms, NULL, ws, params, dictPtr, dictContentSize, dtlm));
return dictID;
}
}
@ -2995,6 +3004,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
static size_t
ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
ZSTD_matchState_t* ms,
ldmState_t* ls,
ZSTD_cwksp* ws,
const ZSTD_CCtx_params* params,
const void* dict, size_t dictSize,
@ -3012,13 +3022,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
/* dict restricted modes */
if (dictContentType == ZSTD_dct_rawContent)
return ZSTD_loadDictionaryContent(ms, ws, params, dict, dictSize, dtlm);
return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm);
if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
if (dictContentType == ZSTD_dct_auto) {
DEBUGLOG(4, "raw content dictionary detected");
return ZSTD_loadDictionaryContent(
ms, ws, params, dict, dictSize, dtlm);
ms, ls, ws, params, dict, dictSize, dtlm);
}
RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong);
assert(0); /* impossible */
@ -3061,12 +3071,12 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
{ size_t const dictID = cdict ?
ZSTD_compress_insertDictionary(
cctx->blockState.prevCBlock, &cctx->blockState.matchState,
&cctx->workspace, &cctx->appliedParams, cdict->dictContent,
&cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
cdict->dictContentSize, dictContentType, dtlm,
cctx->entropyWorkspace)
: ZSTD_compress_insertDictionary(
cctx->blockState.prevCBlock, &cctx->blockState.matchState,
&cctx->workspace, &cctx->appliedParams, dict, dictSize,
&cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
dictContentType, dtlm, cctx->entropyWorkspace);
FORWARD_IF_ERROR(dictID);
assert(dictID <= UINT_MAX);
@ -3344,7 +3354,7 @@ static size_t ZSTD_initCDict_internal(
params.fParams.contentSizeFlag = 1;
params.cParams = cParams;
{ size_t const dictID = ZSTD_compress_insertDictionary(
&cdict->cBlockState, &cdict->matchState, &cdict->workspace,
&cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
&params, cdict->dictContent, cdict->dictContentSize,
dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace);
FORWARD_IF_ERROR(dictID);

View File

@ -166,6 +166,7 @@ typedef struct {
typedef struct {
ZSTD_window_t window; /* State for the window round buffer management */
ldmEntry_t* hashTable;
U32 loadedDictEnd;
BYTE* bucketOffsets; /* Next position in bucket to insert entry */
U64 hashPower; /* Used to compute the rolling hash.
* Depends on ldmParams.minMatchLength */
@ -1051,5 +1052,8 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
*/
size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
/** ZSTD_cycleLog() :
* condition for correct operation : hashLog > 1 */
U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
#endif /* ZSTD_COMPRESS_H */

View File

@ -224,6 +224,17 @@ static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state,
return rollingHash;
}
void ZSTD_ldm_fillHashTable(
ldmState_t* state, const BYTE* ip,
const BYTE* iend, ldmParams_t const* params)
{
U64 startingHash = ZSTD_rollingHash_compute(ip, params->minMatchLength);
ZSTD_ldm_fillLdmHashTable(
state, startingHash, ip, iend - params->minMatchLength, state->window.base,
params->hashLog - params->bucketSizeLog,
*params);
}
/** ZSTD_ldm_limitTableUpdate() :
*
@ -459,7 +470,7 @@ size_t ZSTD_ldm_generateSequences(
* * Try invalidation after the sequence generation and test the
* the offset against maxDist directly.
*/
ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL, NULL);
ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL);
/* 3. Generate the sequences for the chunk, and get newLeftoverSize. */
newLeftoverSize = ZSTD_ldm_generateSequences_internal(
ldmState, sequences, params, chunkStart, chunkSize);
@ -567,7 +578,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
if (sequence.offset == 0)
break;
assert(sequence.offset <= (1U << cParams->windowLog));
assert(ip + sequence.litLength + sequence.matchLength <= iend);
/* Fill tables for block compressor */

View File

@ -24,6 +24,10 @@ extern "C" {
#define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_LIMIT_DEFAULT
void ZSTD_ldm_fillHashTable(
ldmState_t* state, const BYTE* ip,
const BYTE* iend, ldmParams_t const* params);
/**
* ZSTD_ldm_generateSequences():
*

View File

@ -461,7 +461,9 @@ typedef struct {
ZSTD_window_t ldmWindow; /* A thread-safe copy of ldmState.window */
} serialState_t;
static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool* seqPool, ZSTD_CCtx_params params, size_t jobSize)
static int ZSTDMT_serialState_reset(serialState_t* serialState,
ZSTDMT_seqPool* seqPool, ZSTD_CCtx_params params,
size_t jobSize, const void* dict, size_t const dictSize)
{
/* Adjust parameters */
if (params.ldmParams.enableLdm) {
@ -507,6 +509,13 @@ static int ZSTDMT_serialState_reset(serialState_t* serialState, ZSTDMT_seqPool*
memset(serialState->ldmState.hashTable, 0, hashSize);
memset(serialState->ldmState.bucketOffsets, 0, bucketSize);
}
/* Update window state and fill hash table with dict */
if (params.ldmParams.enableLdm && dict) {
ZSTD_window_update(&serialState->ldmState.window, dict, dictSize);
ZSTD_ldm_fillHashTable(&serialState->ldmState, (const BYTE*)dict, (const BYTE*)dict + dictSize, &params.ldmParams);
}
serialState->params = params;
serialState->params.jobSize = (U32)jobSize;
return 0;
@ -1267,7 +1276,7 @@ static size_t ZSTDMT_compress_advanced_internal(
assert(avgJobSize >= 256 KB); /* condition for ZSTD_compressBound(A) + ZSTD_compressBound(B) <= ZSTD_compressBound(A+B), required to compress directly into Dst (no additional buffer) */
ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(avgJobSize) );
if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, avgJobSize))
if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, avgJobSize, NULL, 0))
return ERROR(memory_allocation);
FORWARD_IF_ERROR( ZSTDMT_expandJobsTable(mtctx, nbJobs) ); /* only expands if necessary */
@ -1500,7 +1509,7 @@ size_t ZSTDMT_initCStream_internal(
mtctx->allJobsCompleted = 0;
mtctx->consumed = 0;
mtctx->produced = 0;
if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, mtctx->targetSectionSize))
if (ZSTDMT_serialState_reset(&mtctx->serial, mtctx->seqPool, params, mtctx->targetSectionSize, dict, dictSize))
return ERROR(memory_allocation);
return 0;
}

View File

@ -45,6 +45,7 @@
#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_magicNumber, ZSTD_frameHeaderSize_max */
#include "zstd.h"
#include "zstd_errors.h" /* ZSTD_error_frameParameter_windowTooLarge */
#include "zstd_compress_internal.h"
#if defined(ZSTD_GZCOMPRESS) || defined(ZSTD_GZDECOMPRESS)
# include <zlib.h>
@ -68,17 +69,11 @@
/*-*************************************
* Constants
***************************************/
#define KB *(1<<10)
#define MB *(1<<20)
#define GB *(1U<<30)
#define ADAPT_WINDOWLOG_DEFAULT 23 /* 8 MB */
#define DICTSIZE_MAX (32 MB) /* protection against large input (attack scenario) */
#define FNSPACE 30
#define PATCHFROM_WINDOWSIZE_EXTRA_BYTES 1 KB
/*-*************************************
* Macros
***************************************/
@ -771,6 +766,16 @@ static unsigned FIO_highbit64(unsigned long long v)
return count;
}
static void FIO_adjustMemLimitForPatchFromMode(FIO_prefs_t* const prefs,
unsigned long long const dictSize,
unsigned long long const maxSrcFileSize)
{
unsigned long long maxSize = MAX(prefs->memLimit, MAX(dictSize, maxSrcFileSize));
assert(maxSize != UTIL_FILESIZE_UNKNOWN);
if (maxSize > UINT_MAX)
EXM_THROW(42, "Can't handle files larger than %u GB\n", UINT_MAX/(1 GB) + 1);
FIO_setMemLimit(prefs, (unsigned)maxSize);
}
#ifndef ZSTD_NOCOMPRESS
@ -784,12 +789,40 @@ typedef struct {
size_t srcBufferSize;
void* dstBuffer;
size_t dstBufferSize;
void* dictBuffer;
size_t dictBufferSize;
const char* dictFileName;
ZSTD_CStream* cctx;
} cRess_t;
static void FIO_adjustParamsForPatchFromMode(FIO_prefs_t* const prefs,
ZSTD_compressionParameters* comprParams,
unsigned long long const dictSize,
unsigned long long const maxSrcFileSize,
int cLevel)
{
unsigned const fileWindowLog = FIO_highbit64(maxSrcFileSize) + 1;
ZSTD_compressionParameters const cParams = ZSTD_getCParams(cLevel, (size_t)maxSrcFileSize, (size_t)dictSize);
FIO_adjustMemLimitForPatchFromMode(prefs, dictSize, maxSrcFileSize);
if (fileWindowLog > ZSTD_WINDOWLOG_MAX)
DISPLAYLEVEL(1, "Max window log exceeded by file (compression ratio will suffer)\n");
comprParams->windowLog = MIN(ZSTD_WINDOWLOG_MAX, fileWindowLog);
if (fileWindowLog > ZSTD_cycleLog(comprParams->hashLog, cParams.strategy)) {
if (!prefs->ldmFlag)
DISPLAYLEVEL(1, "long mode automaticaly triggered\n");
FIO_setLdmFlag(prefs, 1);
}
if (cParams.strategy >= ZSTD_btopt) {
DISPLAYLEVEL(1, "[Optimal parser notes] Consider the following to improve patch size at the cost of speed:\n");
DISPLAYLEVEL(1, "- Use --single-thread mode in the zstd cli");
DISPLAYLEVEL(1, "- Set a larger targetLength (eg. --zstd=targetLength=4096)\n");
DISPLAYLEVEL(1, "- Set a larger chainLog (eg. --zstd=chainLog=31)\n");
DISPLAYLEVEL(1, "Also consdier playing around with searchLog and hashLog\n");
}
}
static cRess_t FIO_createCResources(FIO_prefs_t* const prefs,
const char* dictFileName, const size_t maxSrcFileSize,
const char* dictFileName, unsigned long long const maxSrcFileSize,
int cLevel, ZSTD_compressionParameters comprParams) {
cRess_t ress;
memset(&ress, 0, sizeof(ress));
@ -802,69 +835,70 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs,
ress.srcBufferSize = ZSTD_CStreamInSize();
ress.srcBuffer = malloc(ress.srcBufferSize);
ress.dstBufferSize = ZSTD_CStreamOutSize();
/* need to update memLimit before calling createDictBuffer
* because of memLimit check inside it */
if (prefs->patchFromMode)
FIO_adjustParamsForPatchFromMode(prefs, &comprParams, UTIL_getFileSize(dictFileName), maxSrcFileSize, cLevel);
ress.dstBuffer = malloc(ress.dstBufferSize);
ress.dictBufferSize = FIO_createDictBuffer(&ress.dictBuffer, dictFileName, prefs); /* works with dictFileName==NULL */
if (!ress.srcBuffer || !ress.dstBuffer)
EXM_THROW(31, "allocation error : not enough memory");
/* Advanced parameters, including dictionary */
{ void* dictBuffer;
size_t const dictBuffSize = FIO_createDictBuffer(&dictBuffer, dictFileName, prefs); /* works with dictFileName==NULL */
if (dictFileName && (dictBuffer==NULL))
EXM_THROW(32, "allocation error : can't create dictBuffer");
ress.dictFileName = dictFileName;
if (dictFileName && (ress.dictBuffer==NULL))
EXM_THROW(32, "allocation error : can't create dictBuffer");
ress.dictFileName = dictFileName;
if (prefs->adaptiveMode && !prefs->ldmFlag && !comprParams.windowLog)
comprParams.windowLog = ADAPT_WINDOWLOG_DEFAULT;
if (prefs->adaptiveMode && !prefs->ldmFlag && !comprParams.windowLog)
comprParams.windowLog = ADAPT_WINDOWLOG_DEFAULT;
if (prefs->patchFromMode) {
comprParams.windowLog = FIO_highbit64((unsigned long long)maxSrcFileSize + PATCHFROM_WINDOWSIZE_EXTRA_BYTES);
}
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_contentSizeFlag, 1) ); /* always enable content size when available (note: supposed to be default) */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_dictIDFlag, prefs->dictIDFlag) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_checksumFlag, prefs->checksumFlag) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_contentSizeFlag, prefs->contentSize) );
/* compression level */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_compressionLevel, cLevel) );
/* max compressed block size */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_targetCBlockSize, (int)prefs->targetCBlockSize) );
/* source size hint */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_srcSizeHint, (int)prefs->srcSizeHint) );
/* long distance matching */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_enableLongDistanceMatching, prefs->ldmFlag) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmHashLog, prefs->ldmHashLog) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmMinMatch, prefs->ldmMinMatch) );
if (prefs->ldmBucketSizeLog != FIO_LDM_PARAM_NOTSET) {
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmBucketSizeLog, prefs->ldmBucketSizeLog) );
}
if (prefs->ldmHashRateLog != FIO_LDM_PARAM_NOTSET) {
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmHashRateLog, prefs->ldmHashRateLog) );
}
/* compression parameters */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_windowLog, (int)comprParams.windowLog) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_chainLog, (int)comprParams.chainLog) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_hashLog, (int)comprParams.hashLog) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_searchLog, (int)comprParams.searchLog) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_minMatch, (int)comprParams.minMatch) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_targetLength, (int)comprParams.targetLength) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_strategy, comprParams.strategy) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_literalCompressionMode, (int)prefs->literalCompressionMode) );
/* multi-threading */
#ifdef ZSTD_MULTITHREAD
DISPLAYLEVEL(5,"set nb workers = %u \n", prefs->nbWorkers);
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_nbWorkers, prefs->nbWorkers) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_jobSize, prefs->blockSize) );
if (prefs->overlapLog != FIO_OVERLAP_LOG_NOTSET) {
DISPLAYLEVEL(3,"set overlapLog = %u \n", prefs->overlapLog);
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_overlapLog, prefs->overlapLog) );
}
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_rsyncable, prefs->rsyncable) );
#endif
/* dictionary */
CHECK( ZSTD_CCtx_loadDictionary(ress.cctx, dictBuffer, dictBuffSize) );
free(dictBuffer);
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_contentSizeFlag, prefs->contentSize) ); /* always enable content size when available (note: supposed to be default) */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_dictIDFlag, prefs->dictIDFlag) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_checksumFlag, prefs->checksumFlag) );
/* compression level */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_compressionLevel, cLevel) );
/* max compressed block size */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_targetCBlockSize, (int)prefs->targetCBlockSize) );
/* source size hint */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_srcSizeHint, (int)prefs->srcSizeHint) );
/* long distance matching */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_enableLongDistanceMatching, prefs->ldmFlag) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmHashLog, prefs->ldmHashLog) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmMinMatch, prefs->ldmMinMatch) );
if (prefs->ldmBucketSizeLog != FIO_LDM_PARAM_NOTSET) {
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmBucketSizeLog, prefs->ldmBucketSizeLog) );
}
if (prefs->ldmHashRateLog != FIO_LDM_PARAM_NOTSET) {
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmHashRateLog, prefs->ldmHashRateLog) );
}
/* compression parameters */
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_windowLog, (int)comprParams.windowLog) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_chainLog, (int)comprParams.chainLog) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_hashLog, (int)comprParams.hashLog) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_searchLog, (int)comprParams.searchLog) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_minMatch, (int)comprParams.minMatch) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_targetLength, (int)comprParams.targetLength) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_strategy, comprParams.strategy) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_literalCompressionMode, (int)prefs->literalCompressionMode) );
/* multi-threading */
#ifdef ZSTD_MULTITHREAD
DISPLAYLEVEL(5,"set nb workers = %u \n", prefs->nbWorkers);
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_nbWorkers, prefs->nbWorkers) );
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_jobSize, prefs->blockSize) );
if (prefs->overlapLog != FIO_OVERLAP_LOG_NOTSET) {
DISPLAYLEVEL(3,"set overlapLog = %u \n", prefs->overlapLog);
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_overlapLog, prefs->overlapLog) );
}
CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_rsyncable, prefs->rsyncable) );
#endif
/* dictionary */
if (prefs->patchFromMode) {
CHECK( ZSTD_CCtx_refPrefix(ress.cctx, ress.dictBuffer, ress.dictBufferSize) );
} else {
CHECK( ZSTD_CCtx_loadDictionary(ress.cctx, ress.dictBuffer, ress.dictBufferSize) );
}
return ress;
}
@ -872,6 +906,7 @@ static void FIO_freeCResources(cRess_t ress)
{
free(ress.srcBuffer);
free(ress.dstBuffer);
free(ress.dictBuffer);
ZSTD_freeCStream(ress.cctx); /* never fails */
}
@ -1556,7 +1591,7 @@ int FIO_compressFilename(FIO_prefs_t* const prefs, const char* dstFileName,
const char* srcFileName, const char* dictFileName,
int compressionLevel, ZSTD_compressionParameters comprParams)
{
cRess_t const ress = FIO_createCResources(prefs, dictFileName, (size_t)UTIL_getFileSize(srcFileName), compressionLevel, comprParams);
cRess_t const ress = FIO_createCResources(prefs, dictFileName, UTIL_getFileSize(srcFileName), compressionLevel, comprParams);
int const result = FIO_compressFilename_srcFile(prefs, ress, dstFileName, srcFileName, compressionLevel);
@ -1604,11 +1639,12 @@ FIO_determineCompressedName(const char* srcFileName, const char* outDirName, con
return dstFileNameBuffer;
}
static size_t FIO_getLargestFileSize(const char** inFileNames, unsigned nbFiles)
static unsigned long long FIO_getLargestFileSize(const char** inFileNames, unsigned nbFiles)
{
size_t i, fileSize, maxFileSize = 0;
size_t i;
unsigned long long fileSize, maxFileSize = 0;
for (i = 0; i < nbFiles; i++) {
fileSize = (size_t)UTIL_getFileSize(inFileNames[i]);
fileSize = UTIL_getFileSize(inFileNames[i]);
maxFileSize = fileSize > maxFileSize ? fileSize : maxFileSize;
}
return maxFileSize;
@ -1686,6 +1722,9 @@ static dRess_t FIO_createDResources(FIO_prefs_t* const prefs, const char* dictFi
dRess_t ress;
memset(&ress, 0, sizeof(ress));
if (prefs->patchFromMode)
FIO_adjustMemLimitForPatchFromMode(prefs, UTIL_getFileSize(dictFileName), 0 /* just use the dict size */);
/* Allocation */
ress.dctx = ZSTD_createDStream();
if (ress.dctx==NULL)

View File

@ -128,6 +128,14 @@ the last one takes effect.
selection, namely that windowSize > srcSize.
Note: cannot use both this and -D together
Note: `--long` mode will be automatically activated if chainLog < fileLog
(fileLog being the windowLog requried to cover the whole file). You
can also manually force it.
Node: for all levels, you can use --patch-from in --single-thread mode
to improve compression ratio at the cost of speed
Note: for level 19, you can get increased compression ratio at the cost
of speed by specifying `--zstd=targetLength=` to be something large
(i.e 4096), and by setting a large `--zstd=chainLog=`
* `-M#`, `--memory=#`:
Set a memory usage limit. By default, Zstandard uses 128 MB for decompression
as the maximum amount of memory the decompressor is allowed to use, but you can

View File

@ -1240,6 +1240,11 @@ int main(int const argCount, const char* argv[])
CLEAN_RETURN(1);
}
if (patchFromDictFileName != NULL && filenames->tableSize > 1) {
DISPLAY("error : can't use --patch-from=# on multiple files \n");
CLEAN_RETURN(1);
}
/* No status message in pipe mode (stdin - stdout) or multi-files mode */
if (!strcmp(filenames->fileNames[0], stdinmark) && outFileName && !strcmp(outFileName,stdoutmark) && (g_displayLevel==2)) g_displayLevel=1;
if ((filenames->tableSize > 1) & (g_displayLevel==2)) g_displayLevel=1;
@ -1247,15 +1252,14 @@ int main(int const argCount, const char* argv[])
/* IO Stream/File */
FIO_setNotificationLevel(g_displayLevel);
FIO_setPatchFromMode(prefs, patchFromDictFileName != NULL);
if (patchFromDictFileName != NULL) {
dictFileName = patchFromDictFileName;
}
if (memLimit == 0) {
if (compressionParams.windowLog == 0) {
memLimit = (U32)1 << g_defaultMaxWindowLog;
} else {
memLimit = (U32)1 << (compressionParams.windowLog & 31);
} }
if (patchFromDictFileName != NULL)
dictFileName = patchFromDictFileName;
FIO_setMemLimit(prefs, memLimit);
if (operation==zom_compress) {
#ifndef ZSTD_NOCOMPRESS

View File

@ -42,10 +42,15 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size)
ddict = ZSTD_createDDict(dict.buff, dict.size);
FUZZ_ASSERT(ddict);
} else {
FUZZ_ZASSERT(ZSTD_DCtx_loadDictionary_advanced(
if (FUZZ_dataProducer_uint32Range(producer, 0, 1) == 0)
FUZZ_ZASSERT(ZSTD_DCtx_loadDictionary_advanced(
dctx, dict.buff, dict.size,
(ZSTD_dictLoadMethod_e)FUZZ_dataProducer_uint32Range(producer, 0, 1),
(ZSTD_dictContentType_e)FUZZ_dataProducer_uint32Range(producer, 0, 2)));
else
FUZZ_ZASSERT(ZSTD_DCtx_refPrefix_advanced(
dctx, dict.buff, dict.size,
(ZSTD_dictContentType_e)FUZZ_dataProducer_uint32Range(producer, 0, 2)));
}
{

View File

@ -28,10 +28,15 @@ static size_t compress(void* compressed, size_t compressedCapacity,
void const* source, size_t sourceSize,
void const* dict, size_t dictSize,
ZSTD_dictLoadMethod_e dictLoadMethod,
ZSTD_dictContentType_e dictContentType)
ZSTD_dictContentType_e dictContentType,
int const refPrefix)
{
ZSTD_CCtx* cctx = ZSTD_createCCtx();
FUZZ_ZASSERT(ZSTD_CCtx_loadDictionary_advanced(
if (refPrefix)
FUZZ_ZASSERT(ZSTD_CCtx_refPrefix_advanced(
cctx, dict, dictSize, dictContentType));
else
FUZZ_ZASSERT(ZSTD_CCtx_loadDictionary_advanced(
cctx, dict, dictSize, dictLoadMethod, dictContentType));
size_t const compressedSize = ZSTD_compress2(
cctx, compressed, compressedCapacity, source, sourceSize);
@ -43,10 +48,15 @@ static size_t decompress(void* result, size_t resultCapacity,
void const* compressed, size_t compressedSize,
void const* dict, size_t dictSize,
ZSTD_dictLoadMethod_e dictLoadMethod,
ZSTD_dictContentType_e dictContentType)
ZSTD_dictContentType_e dictContentType,
int const refPrefix)
{
ZSTD_DCtx* dctx = ZSTD_createDCtx();
FUZZ_ZASSERT(ZSTD_DCtx_loadDictionary_advanced(
if (refPrefix)
FUZZ_ZASSERT(ZSTD_DCtx_refPrefix_advanced(
dctx, dict, dictSize, dictContentType));
else
FUZZ_ZASSERT(ZSTD_DCtx_loadDictionary_advanced(
dctx, dict, dictSize, dictLoadMethod, dictContentType));
size_t const resultSize = ZSTD_decompressDCtx(
dctx, result, resultCapacity, compressed, compressedSize);
@ -58,6 +68,7 @@ static size_t decompress(void* result, size_t resultCapacity,
int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size)
{
FUZZ_dataProducer_t *producer = FUZZ_dataProducer_create(src, size);
int const refPrefix = FUZZ_dataProducer_uint32Range(producer, 0, 1) != 0;
ZSTD_dictLoadMethod_e const dlm =
size = FUZZ_dataProducer_uint32Range(producer, 0, 1);
ZSTD_dictContentType_e const dct =
@ -75,14 +86,14 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size)
FUZZ_ASSERT(cBuf);
size_t const cSize =
compress(cBuf, cBufSize, src, size, src, size, dlm, dct);
compress(cBuf, cBufSize, src, size, src, size, dlm, dct, refPrefix);
/* compression failing is okay */
if (ZSTD_isError(cSize)) {
FUZZ_ASSERT_MSG(dct != ZSTD_dct_rawContent, "Raw must always succeed!");
goto out;
}
size_t const rSize =
decompress(rBuf, size, cBuf, cSize, src, size, dlm, dct);
decompress(rBuf, size, cBuf, cSize, src, size, dlm, dct, refPrefix);
FUZZ_ASSERT_MSG(rSize == size, "Incorrect regenerated size");
FUZZ_ASSERT_MSG(!memcmp(src, rBuf, size), "Corruption!");

View File

@ -536,15 +536,15 @@ static int basicUnitTests(U32 const seed, double compressibility)
BYTE* const dst = (BYTE*)compressedBuffer;
ZSTD_DCtx* dctx = ZSTD_createDCtx();
/* create a large frame and then a bunch of small frames */
size_t srcSize = ZSTD_compress((void*)dst,
size_t srcSize = ZSTD_compress((void*)dst,
compressedBufferSize, CNBuffer, largeFrameSrcSize, 3);
for (i = 0; i < nbFrames; i++)
srcSize += ZSTD_compress((void*)(dst + srcSize),
compressedBufferSize - srcSize, CNBuffer,
for (i = 0; i < nbFrames; i++)
srcSize += ZSTD_compress((void*)(dst + srcSize),
compressedBufferSize - srcSize, CNBuffer,
smallFrameSrcSize, 3);
/* decompressStream and make sure that dctx size was reduced at least once */
while (consumed < srcSize) {
ZSTD_inBuffer in = {(void*)(dst + consumed), MIN(1, srcSize - consumed), 0};
@ -566,6 +566,115 @@ static int basicUnitTests(U32 const seed, double compressibility)
}
DISPLAYLEVEL(3, "OK \n");
DISPLAYLEVEL(3, "test%3i : ldm fill dict out-of-bounds check", testNb++);
{
ZSTD_CCtx* const cctx = ZSTD_createCCtx();
size_t const size = (1U << 10);
size_t const dstCapacity = ZSTD_compressBound(size);
void* dict = (void*)malloc(size);
void* src = (void*)malloc(size);
void* dst = (void*)malloc(dstCapacity);
RDG_genBuffer(dict, size, 0.5, 0.5, seed);
RDG_genBuffer(src, size, 0.5, 0.5, seed);
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, 1));
assert(!ZSTD_isError(ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, size, dict, size, 3)));
ZSTD_freeCCtx(cctx);
free(dict);
free(src);
free(dst);
}
DISPLAYLEVEL(3, "OK \n");
DISPLAYLEVEL(3, "test%3i : testing dict compression with enableLdm and forceMaxWindow : ", testNb++);
{
ZSTD_CCtx* const cctx = ZSTD_createCCtx();
void* dict = (void*)malloc(CNBuffSize);
RDG_genBuffer(dict, CNBuffSize, 0.5, 0.5, seed);
RDG_genBuffer(CNBuffer, CNBuffSize, 0.6, 0.6, seed);
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_forceMaxWindow, 1));
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, 1));
assert(!ZSTD_isError(ZSTD_compress_usingDict(cctx, compressedBuffer, compressedBufferSize,
CNBuffer, CNBuffSize, dict, CNBuffSize, 3)));
ZSTD_freeCCtx(cctx);
free(dict);
}
DISPLAYLEVEL(3, "OK \n");
/* Note: this test takes 0.5 seconds to run */
DISPLAYLEVEL(3, "test%3i : testing refPrefx vs refPrefx + ldm (size comparison) : ", testNb++);
{
/* test a big buffer so that ldm can take effect */
size_t const size = 100 MB;
int const windowLog = 27;
size_t const dstSize = ZSTD_compressBound(size);
void* dict = (void*)malloc(size);
void* src = (void*)malloc(size);
void* dst = (void*)malloc(dstSize);
void* recon = (void*)malloc(size);
size_t refPrefixCompressedSize = 0;
size_t refPrefixLdmComrpessedSize = 0;
size_t reconSize = 0;
ZSTD_CCtx* const cctx = ZSTD_createCCtx();
ZSTD_DCtx* const dctx = ZSTD_createDCtx();
/* make dict and src the same uncompressible data */
RDG_genBuffer(src, size, 0, 0, seed);
memcpy(dict, src, size);
assert(!memcmp(dict, src, size));
/* set level 1 and windowLog to cover src */
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 1));
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, windowLog));
/* compress on level 1 using just refPrefix and no ldm */
ZSTD_CCtx_refPrefix(cctx, dict, size);
refPrefixCompressedSize = ZSTD_compress2(cctx, dst, dstSize, src, size);
assert(!ZSTD_isError(refPrefixCompressedSize));
/* test round trip just refPrefix */
ZSTD_DCtx_refPrefix(dctx, dict, size);
reconSize = ZSTD_decompressDCtx(dctx, recon, size, dst, refPrefixCompressedSize);
assert(!ZSTD_isError(reconSize));
assert(reconSize == size);
assert(!memcmp(recon, src, size));
/* compress on level 1 using refPrefix and ldm */
ZSTD_CCtx_refPrefix(cctx, dict, size);;
CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, 1))
refPrefixLdmComrpessedSize = ZSTD_compress2(cctx, dst, dstSize, src, size);
assert(!ZSTD_isError(refPrefixLdmComrpessedSize));
/* test round trip refPrefix + ldm*/
ZSTD_DCtx_refPrefix(dctx, dict, size);
reconSize = ZSTD_decompressDCtx(dctx, recon, size, dst, refPrefixLdmComrpessedSize);
assert(!ZSTD_isError(reconSize));
assert(reconSize == size);
assert(!memcmp(recon, src, size));
/* make sure that refPrefixCompressedSize is significantly greater */
assert(refPrefixCompressedSize > 10 * refPrefixLdmComrpessedSize);
/* make sure the ldm comrpessed size is less than 1% of original */
assert((double)refPrefixLdmComrpessedSize / (double)size < 0.01);
ZSTD_freeDCtx(dctx);
ZSTD_freeCCtx(cctx);
free(recon);
free(dict);
free(src);
free(dst);
}
DISPLAYLEVEL(3, "OK \n");
DISPLAYLEVEL(3, "test%3d: superblock uncompressible data, too many nocompress superblocks : ", testNb++);
{
ZSTD_CCtx* const cctx = ZSTD_createCCtx();

View File

@ -1262,11 +1262,20 @@ println "\n===> patch-from tests"
datagen -g1000 -P50 > tmp_dict
datagen -g1000 -P10 > tmp_patch
zstd --memory=10000 --patch-from=tmp_dict tmp_patch -o tmp_patch_diff
zstd -d --memory=10000 --patch-from=tmp_dict tmp_patch_diff -o tmp_patch_recon
zstd --patch-from=tmp_dict tmp_patch -o tmp_patch_diff
zstd -d --patch-from=tmp_dict tmp_patch_diff -o tmp_patch_recon
$DIFF -s tmp_patch_recon tmp_patch
rm -rf tmp_*
println "\n===> patch-from recursive tests"
mkdir tmp_dir
datagen > tmp_dir/tmp1
datagen > tmp_dir/tmp2
datagen > tmp_dict
zstd --patch-from=tmp_dict -r tmp_dir && die
rm -rf tmp*
println "\n===> large files tests "
roundTripTest -g270000000 1