Merge pull request #2202 from terrelln/dict-fi
[lib] Allow compression dictionaries with missing symbols
This commit is contained in:
commit
b6a9ded994
@ -2891,22 +2891,28 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
|
||||
|
||||
|
||||
/* Dictionaries that assign zero probability to symbols that show up causes problems
|
||||
when FSE encoding. Refuse dictionaries that assign zero probability to symbols
|
||||
that we may encounter during compression.
|
||||
NOTE: This behavior is not standard and could be improved in the future. */
|
||||
static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) {
|
||||
* when FSE encoding. Mark dictionaries with zero probability symbols as FSE_repeat_check
|
||||
* and only dictionaries with 100% valid symbols can be assumed valid.
|
||||
*/
|
||||
static FSE_repeat ZSTD_dictNCountRepeat(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue)
|
||||
{
|
||||
U32 s;
|
||||
RETURN_ERROR_IF(dictMaxSymbolValue < maxSymbolValue, dictionary_corrupted, "dict fse tables don't have all symbols");
|
||||
for (s = 0; s <= maxSymbolValue; ++s) {
|
||||
RETURN_ERROR_IF(normalizedCounter[s] == 0, dictionary_corrupted, "dict fse tables don't have all symbols");
|
||||
if (dictMaxSymbolValue < maxSymbolValue) {
|
||||
return FSE_repeat_check;
|
||||
}
|
||||
return 0;
|
||||
for (s = 0; s <= maxSymbolValue; ++s) {
|
||||
if (normalizedCounter[s] == 0) {
|
||||
return FSE_repeat_check;
|
||||
}
|
||||
}
|
||||
return FSE_repeat_valid;
|
||||
}
|
||||
|
||||
size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
|
||||
short* offcodeNCount, unsigned* offcodeMaxValue,
|
||||
const void* const dict, size_t dictSize)
|
||||
{
|
||||
short offcodeNCount[MaxOff+1];
|
||||
unsigned offcodeMaxValue = MaxOff;
|
||||
const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */
|
||||
const BYTE* const dictEnd = dictPtr + dictSize;
|
||||
dictPtr += 8;
|
||||
@ -2928,16 +2934,16 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
|
||||
}
|
||||
|
||||
{ unsigned offcodeLog;
|
||||
size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
|
||||
size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
|
||||
RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
|
||||
RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
|
||||
/* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
|
||||
/* fill all offset symbols to avoid garbage at end of table */
|
||||
RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
|
||||
bs->entropy.fse.offcodeCTable,
|
||||
offcodeNCount, MaxOff, offcodeLog,
|
||||
workspace, HUF_WORKSPACE_SIZE)),
|
||||
dictionary_corrupted, "");
|
||||
/* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
|
||||
dictPtr += offcodeHeaderSize;
|
||||
}
|
||||
|
||||
@ -2946,13 +2952,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
|
||||
size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
|
||||
RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
|
||||
RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
|
||||
/* Every match length code must have non-zero probability */
|
||||
FORWARD_IF_ERROR( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML), "");
|
||||
RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
|
||||
bs->entropy.fse.matchlengthCTable,
|
||||
matchlengthNCount, matchlengthMaxValue, matchlengthLog,
|
||||
workspace, HUF_WORKSPACE_SIZE)),
|
||||
dictionary_corrupted, "");
|
||||
bs->entropy.fse.matchlength_repeatMode = ZSTD_dictNCountRepeat(matchlengthNCount, matchlengthMaxValue, MaxML);
|
||||
dictPtr += matchlengthHeaderSize;
|
||||
}
|
||||
|
||||
@ -2961,13 +2966,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
|
||||
size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
|
||||
RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
|
||||
RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
|
||||
/* Every literal length code must have non-zero probability */
|
||||
FORWARD_IF_ERROR( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL), "");
|
||||
RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
|
||||
bs->entropy.fse.litlengthCTable,
|
||||
litlengthNCount, litlengthMaxValue, litlengthLog,
|
||||
workspace, HUF_WORKSPACE_SIZE)),
|
||||
dictionary_corrupted, "");
|
||||
bs->entropy.fse.litlength_repeatMode = ZSTD_dictNCountRepeat(litlengthNCount, litlengthMaxValue, MaxLL);
|
||||
dictPtr += litlengthHeaderSize;
|
||||
}
|
||||
|
||||
@ -2977,6 +2981,22 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
|
||||
bs->rep[2] = MEM_readLE32(dictPtr+8);
|
||||
dictPtr += 12;
|
||||
|
||||
{ size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
|
||||
U32 offcodeMax = MaxOff;
|
||||
if (dictContentSize <= ((U32)-1) - 128 KB) {
|
||||
U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
|
||||
offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
|
||||
}
|
||||
/* All offset values <= dictContentSize + 128 KB must be representable for a valid table */
|
||||
bs->entropy.fse.offcode_repeatMode = ZSTD_dictNCountRepeat(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff));
|
||||
|
||||
/* All repCodes must be <= dictContentSize and != 0 */
|
||||
{ U32 u;
|
||||
for (u=0; u<3; u++) {
|
||||
RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
|
||||
RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
|
||||
} } }
|
||||
|
||||
return dictPtr - (const BYTE*)dict;
|
||||
}
|
||||
|
||||
@ -2999,8 +3019,6 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
|
||||
{
|
||||
const BYTE* dictPtr = (const BYTE*)dict;
|
||||
const BYTE* const dictEnd = dictPtr + dictSize;
|
||||
short offcodeNCount[MaxOff+1];
|
||||
unsigned offcodeMaxValue = MaxOff;
|
||||
size_t dictID;
|
||||
size_t eSize;
|
||||
|
||||
@ -3009,32 +3027,16 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
|
||||
assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
|
||||
|
||||
dictID = params->fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr + 4 /* skip magic number */ );
|
||||
eSize = ZSTD_loadCEntropy(bs, workspace, offcodeNCount, &offcodeMaxValue, dict, dictSize);
|
||||
eSize = ZSTD_loadCEntropy(bs, workspace, dict, dictSize);
|
||||
FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed");
|
||||
dictPtr += eSize;
|
||||
|
||||
{ size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
|
||||
U32 offcodeMax = MaxOff;
|
||||
if (dictContentSize <= ((U32)-1) - 128 KB) {
|
||||
U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
|
||||
offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
|
||||
}
|
||||
/* All offset values <= dictContentSize + 128 KB must be representable */
|
||||
FORWARD_IF_ERROR(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)), "");
|
||||
/* All repCodes must be <= dictContentSize and != 0*/
|
||||
{ U32 u;
|
||||
for (u=0; u<3; u++) {
|
||||
RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
|
||||
RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
|
||||
} }
|
||||
|
||||
bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid;
|
||||
bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid;
|
||||
bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid;
|
||||
{
|
||||
size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
|
||||
FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
|
||||
ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
|
||||
return dictID;
|
||||
}
|
||||
return dictID;
|
||||
}
|
||||
|
||||
/** ZSTD_compress_insertDictionary() :
|
||||
|
@ -1045,7 +1045,6 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
|
||||
* assumptions : magic number supposed already checked
|
||||
* and dictSize >= 8 */
|
||||
size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
|
||||
short* offcodeNCount, unsigned* offcodeMaxValue,
|
||||
const void* const dict, size_t dictSize);
|
||||
|
||||
void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs);
|
||||
|
@ -105,20 +105,17 @@ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
|
||||
size_t headerSize;
|
||||
if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
|
||||
|
||||
{ unsigned offcodeMaxValue = MaxOff;
|
||||
ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
|
||||
{ ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
|
||||
U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
|
||||
short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
|
||||
if (!bs || !wksp || !offcodeNCount) {
|
||||
if (!bs || !wksp) {
|
||||
headerSize = ERROR(memory_allocation);
|
||||
} else {
|
||||
ZSTD_reset_compressedBlockState(bs);
|
||||
headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
|
||||
headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
|
||||
}
|
||||
|
||||
free(bs);
|
||||
free(wksp);
|
||||
free(offcodeNCount);
|
||||
}
|
||||
|
||||
return headerSize;
|
||||
|
1
tests/golden-compression/http
Normal file
1
tests/golden-compression/http
Normal file
@ -0,0 +1 @@
|
||||
<?xml version="1.0" encoding="UTF-8" ?><VAST version="2.0"><Ad id="43C53990160C658B"><Wrapper><AdSystem version="1.0">ads60.vertamedia.com</AdSystem><VASTAdTagURI>http://ads60.vertamedia.com/ops/43C53990160C658B/46991</VASTAdTagURI><Impression><![CDATA[http://ads60.vertamedia.com/i/?x=1&adId=43C53990160C658B&aid=46991&cmpId=13765&fi=24929&advId=13177&pubId=28786&sid=0&width=300&height=250&domain=m.gazetaexpress.com]]></Impression><Creatives><Creative><Linear><TrackingEvents><Tracking event="start">http://ads60.vertamedia.com/ve/43C53990160C658B/53</Tracking><Tracking event="midpoint">http://ads60.vertamedia.com/ve/43C53990160C658B/54</Tracking><Tracking event="firstQuartile">http://ads60.vertamedia.com/ve/43C53990160C658B/55</Tracking><Tracking event="thirdQuartile">http://ads60.vertamedia.com/ve/43C53990160C658B/56</Tracking><Tracking event="complete">http://ads60.vertamedia.com/ve/43C53990160C658B/57</Tracking><Tracking event="skip">http://ads60.vertamedia.com/ve/43C53990160C658B/66</Tracking></TrackingEvents><VideoClicks><ClickTracking>http://ads60.vertamedia.com/ve/43C53990160C658B/71</ClickTracking></VideoClicks></Linear></Creative></Creatives></Wrapper></Ad></VAST>
|
BIN
tests/golden-dictionaries/http-dict-missing-symbols
Normal file
BIN
tests/golden-dictionaries/http-dict-missing-symbols
Normal file
Binary file not shown.
@ -893,8 +893,9 @@ datagen | zstd -c | zstd -t
|
||||
|
||||
println "\n===> golden files tests "
|
||||
|
||||
zstd -t -r "$TESTDIR/golden-compression"
|
||||
zstd -t -r "$TESTDIR/golden-decompression"
|
||||
zstd -c -r "$TESTDIR/golden-compression" | zstd -t
|
||||
zstd -D "$TESTDIR/golden-dictionaries/http-dict-missing-symbols" "$TESTDIR/golden-compression/http" -c | zstd -D "$TESTDIR/golden-dictionaries/http-dict-missing-symbols" -t
|
||||
|
||||
|
||||
println "\n===> benchmark mode tests "
|
||||
|
Loading…
x
Reference in New Issue
Block a user