From 0356e05f0741181e6dc3354dc323e889065c4684 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Tue, 30 Nov 2021 16:51:16 -0800 Subject: [PATCH] [zdict] Remove ZDICT_CONTENTSIZE_MIN restriction for ZDICT_finalizeDictionary Allow the `dictContentSize` to be any size. The finalized dictionary content size must be at least as large as the maximum repcode (8). So we add zero bytes to the dictionary to ensure that we meet that requirement. I've removed this restriction because its been causing us headaches when people complain that dictionary training failed. It fails because there isn't enough useful content to put in the dictionary. Either because every sample is exactly the same and less than ZDICT_CONTENTSIZE_MIN bytes, or there isn't enough content. Instead, we should succeed in creating the dictionary, and it is up to the user to decide if it is worthwhile. It is possible that the tables alone provide enough value. NOTE: This allows us to produce dictionaries with finalized `dictContentSize < ZDICT_CONTENTSIZE_MIN`. But, they are still valid zstd dictionaries. We could remove the `ZDICT_CONTENTSIZE_MIN` macro, but I've decided to leave that for now, so we don't break users. --- lib/dictBuilder/zdict.c | 58 ++++++++++++++++++++++++++++++++++++----- lib/zdict.h | 4 +-- tests/playTests.sh | 4 --- 3 files changed, 53 insertions(+), 13 deletions(-) diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index d93b202e..8b8b381e 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -915,6 +915,17 @@ _cleanup: } +/** + * @returns the maximum repcode value + */ +static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM]) +{ + U32 maxRep = reps[0]; + int r; + for (r = 1; r < ZSTD_REP_NUM; ++r) + maxRep = MAX(maxRep, reps[r]); + return maxRep; +} size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, const void* customDictContent, size_t dictContentSize, @@ -926,11 +937,13 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, BYTE header[HBUFFSIZE]; int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel; U32 const notificationLevel = params.notificationLevel; + /* The final dictionary content must be at least as large as the largest repcode */ + size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue); + size_t paddingSize; /* check conditions */ DEBUGLOG(4, "ZDICT_finalizeDictionary"); if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall); - if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong); if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall); /* dictionary header */ @@ -954,12 +967,43 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, hSize += eSize; } - /* copy elements in final buffer ; note : src and dst buffer can overlap */ - if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize; - { size_t const dictSize = hSize + dictContentSize; - char* dictEnd = (char*)dictBuffer + dictSize; - memmove(dictEnd - dictContentSize, customDictContent, dictContentSize); - memcpy(dictBuffer, header, hSize); + /* Shrink the content size if it doesn't fit in the buffer */ + if (hSize + dictContentSize > dictBufferCapacity) { + dictContentSize = dictBufferCapacity - hSize; + } + + /* Pad the dictionary content with zeros if it is too small */ + if (dictContentSize < minContentSize) { + RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall, + "dictBufferCapacity too small to fit max repcode"); + paddingSize = minContentSize - dictContentSize; + } else { + paddingSize = 0; + } + + { + size_t const dictSize = hSize + paddingSize + dictContentSize; + + /* The dictionary consists of the header, optional padding, and the content. + * The padding comes before the content because the "best" position in the + * dictionary is the last byte. + */ + BYTE* const outDictHeader = (BYTE*)dictBuffer; + BYTE* const outDictPadding = outDictHeader + hSize; + BYTE* const outDictContent = outDictPadding + paddingSize; + + assert(dictSize <= dictBufferCapacity); + assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize); + + /* First copy the customDictContent into its final location. + * `customDictContent` and `dictBuffer` may overlap, so we must + * do this before any other writes into the output buffer. + * Then copy the header & padding into the output buffer. + */ + memmove(outDictContent, customDictContent, dictContentSize); + memcpy(outDictHeader, header, hSize); + memset(outDictPadding, 0, paddingSize); + return dictSize; } } diff --git a/lib/zdict.h b/lib/zdict.h index ac98a169..f1e139a4 100644 --- a/lib/zdict.h +++ b/lib/zdict.h @@ -237,7 +237,6 @@ typedef struct { * is presumed that the most profitable content is at the end of the dictionary, * since that is the cheapest to reference. * - * `dictContentSize` must be >= ZDICT_CONTENTSIZE_MIN bytes. * `maxDictSize` must be >= max(dictContentSize, ZSTD_DICTSIZE_MIN). * * @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`), @@ -272,8 +271,9 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); * Use them only in association with static linking. * ==================================================================================== */ -#define ZDICT_CONTENTSIZE_MIN 128 #define ZDICT_DICTSIZE_MIN 256 +/* Deprecated: Remove in v1.6.0 */ +#define ZDICT_CONTENTSIZE_MIN 128 /*! ZDICT_cover_params_t: * k and d are the only required parameters. diff --git a/tests/playTests.sh b/tests/playTests.sh index 1edca7c3..3c773ed5 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -1018,12 +1018,8 @@ zstd -o tmpDict --train "$TESTDIR"/*.c "$PRGDIR"/*.c test -f tmpDict zstd --train "$TESTDIR"/*.c "$PRGDIR"/*.c test -f dictionary -println "- Test dictionary training fails" -echo "000000000000000000000000000000000" > tmpz -zstd --train tmpz tmpz tmpz tmpz tmpz tmpz tmpz tmpz tmpz && die "Dictionary training should fail : source is all zeros" if [ -n "$hasMT" ] then - zstd --train -T0 tmpz tmpz tmpz tmpz tmpz tmpz tmpz tmpz tmpz && die "Dictionary training should fail : source is all zeros" println "- Create dictionary with multithreading enabled" zstd --train -T0 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpDict fi