diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index b9475e92..a6a464a8 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2891,22 +2891,28 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, /* Dictionaries that assign zero probability to symbols that show up causes problems - when FSE encoding. Refuse dictionaries that assign zero probability to symbols - that we may encounter during compression. - NOTE: This behavior is not standard and could be improved in the future. */ -static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) { + * when FSE encoding. Mark dictionaries with zero probability symbols as FSE_repeat_check + * and only dictionaries with 100% valid symbols can be assumed valid. + */ +static FSE_repeat ZSTD_dictNCountRepeat(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) +{ U32 s; - RETURN_ERROR_IF(dictMaxSymbolValue < maxSymbolValue, dictionary_corrupted, "dict fse tables don't have all symbols"); - for (s = 0; s <= maxSymbolValue; ++s) { - RETURN_ERROR_IF(normalizedCounter[s] == 0, dictionary_corrupted, "dict fse tables don't have all symbols"); + if (dictMaxSymbolValue < maxSymbolValue) { + return FSE_repeat_check; } - return 0; + for (s = 0; s <= maxSymbolValue; ++s) { + if (normalizedCounter[s] == 0) { + return FSE_repeat_check; + } + } + return FSE_repeat_valid; } size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, - short* offcodeNCount, unsigned* offcodeMaxValue, const void* const dict, size_t dictSize) { + short offcodeNCount[MaxOff+1]; + unsigned offcodeMaxValue = MaxOff; const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */ const BYTE* const dictEnd = dictPtr + dictSize; dictPtr += 8; @@ -2928,16 +2934,16 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, } { unsigned offcodeLog; - size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); - /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ /* fill all offset symbols to avoid garbage at end of table */ RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( bs->entropy.fse.offcodeCTable, offcodeNCount, MaxOff, offcodeLog, workspace, HUF_WORKSPACE_SIZE)), dictionary_corrupted, ""); + /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ dictPtr += offcodeHeaderSize; } @@ -2946,13 +2952,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); - /* Every match length code must have non-zero probability */ - FORWARD_IF_ERROR( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML), ""); RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( bs->entropy.fse.matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, workspace, HUF_WORKSPACE_SIZE)), dictionary_corrupted, ""); + bs->entropy.fse.matchlength_repeatMode = ZSTD_dictNCountRepeat(matchlengthNCount, matchlengthMaxValue, MaxML); dictPtr += matchlengthHeaderSize; } @@ -2961,13 +2966,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); - /* Every literal length code must have non-zero probability */ - FORWARD_IF_ERROR( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL), ""); RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( bs->entropy.fse.litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, workspace, HUF_WORKSPACE_SIZE)), dictionary_corrupted, ""); + bs->entropy.fse.litlength_repeatMode = ZSTD_dictNCountRepeat(litlengthNCount, litlengthMaxValue, MaxLL); dictPtr += litlengthHeaderSize; } @@ -2977,6 +2981,22 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, bs->rep[2] = MEM_readLE32(dictPtr+8); dictPtr += 12; + { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + U32 offcodeMax = MaxOff; + if (dictContentSize <= ((U32)-1) - 128 KB) { + U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */ + offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */ + } + /* All offset values <= dictContentSize + 128 KB must be representable for a valid table */ + bs->entropy.fse.offcode_repeatMode = ZSTD_dictNCountRepeat(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)); + + /* All repCodes must be <= dictContentSize and != 0 */ + { U32 u; + for (u=0; u<3; u++) { + RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, ""); + RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); + } } } + return dictPtr - (const BYTE*)dict; } @@ -2999,8 +3019,6 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, { const BYTE* dictPtr = (const BYTE*)dict; const BYTE* const dictEnd = dictPtr + dictSize; - short offcodeNCount[MaxOff+1]; - unsigned offcodeMaxValue = MaxOff; size_t dictID; size_t eSize; @@ -3009,32 +3027,16 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); dictID = params->fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr + 4 /* skip magic number */ ); - eSize = ZSTD_loadCEntropy(bs, workspace, offcodeNCount, &offcodeMaxValue, dict, dictSize); + eSize = ZSTD_loadCEntropy(bs, workspace, dict, dictSize); FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed"); dictPtr += eSize; - { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); - U32 offcodeMax = MaxOff; - if (dictContentSize <= ((U32)-1) - 128 KB) { - U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */ - offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */ - } - /* All offset values <= dictContentSize + 128 KB must be representable */ - FORWARD_IF_ERROR(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)), ""); - /* All repCodes must be <= dictContentSize and != 0*/ - { U32 u; - for (u=0; u<3; u++) { - RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, ""); - RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); - } } - - bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid; - bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid; - bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid; + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); - return dictID; } + return dictID; } /** ZSTD_compress_insertDictionary() : diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index db73f6ce..f2be0188 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -1045,7 +1045,6 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) * assumptions : magic number supposed already checked * and dictSize >= 8 */ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, - short* offcodeNCount, unsigned* offcodeMaxValue, const void* const dict, size_t dictSize); void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index 6d0b0423..6f13392a 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -105,20 +105,17 @@ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize) size_t headerSize; if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted); - { unsigned offcodeMaxValue = MaxOff; - ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t)); + { ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t)); U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE); - short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short)); - if (!bs || !wksp || !offcodeNCount) { + if (!bs || !wksp) { headerSize = ERROR(memory_allocation); } else { ZSTD_reset_compressedBlockState(bs); - headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize); + headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize); } free(bs); free(wksp); - free(offcodeNCount); } return headerSize; diff --git a/tests/golden-compression/http b/tests/golden-compression/http new file mode 100644 index 00000000..a3908cbc --- /dev/null +++ b/tests/golden-compression/http @@ -0,0 +1 @@ +ads60.vertamedia.comhttp://ads60.vertamedia.com/ops/43C53990160C658B/46991http://ads60.vertamedia.com/ve/43C53990160C658B/53http://ads60.vertamedia.com/ve/43C53990160C658B/54http://ads60.vertamedia.com/ve/43C53990160C658B/55http://ads60.vertamedia.com/ve/43C53990160C658B/56http://ads60.vertamedia.com/ve/43C53990160C658B/57http://ads60.vertamedia.com/ve/43C53990160C658B/66http://ads60.vertamedia.com/ve/43C53990160C658B/71 \ No newline at end of file diff --git a/tests/golden-dictionaries/http-dict-missing-symbols b/tests/golden-dictionaries/http-dict-missing-symbols new file mode 100644 index 00000000..0fa5ca61 Binary files /dev/null and b/tests/golden-dictionaries/http-dict-missing-symbols differ diff --git a/tests/playTests.sh b/tests/playTests.sh index 5bca6d13..7a2689cd 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -893,8 +893,9 @@ datagen | zstd -c | zstd -t println "\n===> golden files tests " -zstd -t -r "$TESTDIR/golden-compression" +zstd -t -r "$TESTDIR/golden-decompression" zstd -c -r "$TESTDIR/golden-compression" | zstd -t +zstd -D "$TESTDIR/golden-dictionaries/http-dict-missing-symbols" "$TESTDIR/golden-compression/http" -c | zstd -D "$TESTDIR/golden-dictionaries/http-dict-missing-symbols" -t println "\n===> benchmark mode tests "