Merge pull request #2202 from terrelln/dict-fi

[lib] Allow compression dictionaries with missing symbols
dev
Nick Terrell 2020-06-15 10:54:33 -07:00 committed by GitHub
commit b6a9ded994
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 45 additions and 45 deletions

View File

@ -2891,22 +2891,28 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
/* Dictionaries that assign zero probability to symbols that show up causes problems
when FSE encoding. Refuse dictionaries that assign zero probability to symbols
that we may encounter during compression.
NOTE: This behavior is not standard and could be improved in the future. */
static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) {
* when FSE encoding. Mark dictionaries with zero probability symbols as FSE_repeat_check
* and only dictionaries with 100% valid symbols can be assumed valid.
*/
static FSE_repeat ZSTD_dictNCountRepeat(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue)
{
U32 s;
RETURN_ERROR_IF(dictMaxSymbolValue < maxSymbolValue, dictionary_corrupted, "dict fse tables don't have all symbols");
for (s = 0; s <= maxSymbolValue; ++s) {
RETURN_ERROR_IF(normalizedCounter[s] == 0, dictionary_corrupted, "dict fse tables don't have all symbols");
if (dictMaxSymbolValue < maxSymbolValue) {
return FSE_repeat_check;
}
return 0;
for (s = 0; s <= maxSymbolValue; ++s) {
if (normalizedCounter[s] == 0) {
return FSE_repeat_check;
}
}
return FSE_repeat_valid;
}
size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
short* offcodeNCount, unsigned* offcodeMaxValue,
const void* const dict, size_t dictSize)
{
short offcodeNCount[MaxOff+1];
unsigned offcodeMaxValue = MaxOff;
const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */
const BYTE* const dictEnd = dictPtr + dictSize;
dictPtr += 8;
@ -2928,16 +2934,16 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
}
{ unsigned offcodeLog;
size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
/* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
/* fill all offset symbols to avoid garbage at end of table */
RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
bs->entropy.fse.offcodeCTable,
offcodeNCount, MaxOff, offcodeLog,
workspace, HUF_WORKSPACE_SIZE)),
dictionary_corrupted, "");
/* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
dictPtr += offcodeHeaderSize;
}
@ -2946,13 +2952,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
/* Every match length code must have non-zero probability */
FORWARD_IF_ERROR( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML), "");
RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
bs->entropy.fse.matchlengthCTable,
matchlengthNCount, matchlengthMaxValue, matchlengthLog,
workspace, HUF_WORKSPACE_SIZE)),
dictionary_corrupted, "");
bs->entropy.fse.matchlength_repeatMode = ZSTD_dictNCountRepeat(matchlengthNCount, matchlengthMaxValue, MaxML);
dictPtr += matchlengthHeaderSize;
}
@ -2961,13 +2966,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
/* Every literal length code must have non-zero probability */
FORWARD_IF_ERROR( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL), "");
RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
bs->entropy.fse.litlengthCTable,
litlengthNCount, litlengthMaxValue, litlengthLog,
workspace, HUF_WORKSPACE_SIZE)),
dictionary_corrupted, "");
bs->entropy.fse.litlength_repeatMode = ZSTD_dictNCountRepeat(litlengthNCount, litlengthMaxValue, MaxLL);
dictPtr += litlengthHeaderSize;
}
@ -2977,6 +2981,22 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
bs->rep[2] = MEM_readLE32(dictPtr+8);
dictPtr += 12;
{ size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
U32 offcodeMax = MaxOff;
if (dictContentSize <= ((U32)-1) - 128 KB) {
U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
}
/* All offset values <= dictContentSize + 128 KB must be representable for a valid table */
bs->entropy.fse.offcode_repeatMode = ZSTD_dictNCountRepeat(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff));
/* All repCodes must be <= dictContentSize and != 0 */
{ U32 u;
for (u=0; u<3; u++) {
RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
} } }
return dictPtr - (const BYTE*)dict;
}
@ -2999,8 +3019,6 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
{
const BYTE* dictPtr = (const BYTE*)dict;
const BYTE* const dictEnd = dictPtr + dictSize;
short offcodeNCount[MaxOff+1];
unsigned offcodeMaxValue = MaxOff;
size_t dictID;
size_t eSize;
@ -3009,32 +3027,16 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
dictID = params->fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr + 4 /* skip magic number */ );
eSize = ZSTD_loadCEntropy(bs, workspace, offcodeNCount, &offcodeMaxValue, dict, dictSize);
eSize = ZSTD_loadCEntropy(bs, workspace, dict, dictSize);
FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed");
dictPtr += eSize;
{ size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
U32 offcodeMax = MaxOff;
if (dictContentSize <= ((U32)-1) - 128 KB) {
U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
}
/* All offset values <= dictContentSize + 128 KB must be representable */
FORWARD_IF_ERROR(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)), "");
/* All repCodes must be <= dictContentSize and != 0*/
{ U32 u;
for (u=0; u<3; u++) {
RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
} }
bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid;
bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid;
bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid;
{
size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
return dictID;
}
return dictID;
}
/** ZSTD_compress_insertDictionary() :

View File

@ -1045,7 +1045,6 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
* assumptions : magic number supposed already checked
* and dictSize >= 8 */
size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
short* offcodeNCount, unsigned* offcodeMaxValue,
const void* const dict, size_t dictSize);
void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs);

View File

@ -105,20 +105,17 @@ size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize)
size_t headerSize;
if (dictSize <= 8 || MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return ERROR(dictionary_corrupted);
{ unsigned offcodeMaxValue = MaxOff;
ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
{ ZSTD_compressedBlockState_t* bs = (ZSTD_compressedBlockState_t*)malloc(sizeof(ZSTD_compressedBlockState_t));
U32* wksp = (U32*)malloc(HUF_WORKSPACE_SIZE);
short* offcodeNCount = (short*)malloc((MaxOff+1)*sizeof(short));
if (!bs || !wksp || !offcodeNCount) {
if (!bs || !wksp) {
headerSize = ERROR(memory_allocation);
} else {
ZSTD_reset_compressedBlockState(bs);
headerSize = ZSTD_loadCEntropy(bs, wksp, offcodeNCount, &offcodeMaxValue, dictBuffer, dictSize);
headerSize = ZSTD_loadCEntropy(bs, wksp, dictBuffer, dictSize);
}
free(bs);
free(wksp);
free(offcodeNCount);
}
return headerSize;

View File

@ -0,0 +1 @@
<?xml version="1.0" encoding="UTF-8" ?><VAST version="2.0"><Ad id="43C53990160C658B"><Wrapper><AdSystem version="1.0">ads60.vertamedia.com</AdSystem><VASTAdTagURI>http://ads60.vertamedia.com/ops/43C53990160C658B/46991</VASTAdTagURI><Impression><![CDATA[http://ads60.vertamedia.com/i/?x=1&adId=43C53990160C658B&aid=46991&cmpId=13765&fi=24929&advId=13177&pubId=28786&sid=0&width=300&height=250&domain=m.gazetaexpress.com]]></Impression><Creatives><Creative><Linear><TrackingEvents><Tracking event="start">http://ads60.vertamedia.com/ve/43C53990160C658B/53</Tracking><Tracking event="midpoint">http://ads60.vertamedia.com/ve/43C53990160C658B/54</Tracking><Tracking event="firstQuartile">http://ads60.vertamedia.com/ve/43C53990160C658B/55</Tracking><Tracking event="thirdQuartile">http://ads60.vertamedia.com/ve/43C53990160C658B/56</Tracking><Tracking event="complete">http://ads60.vertamedia.com/ve/43C53990160C658B/57</Tracking><Tracking event="skip">http://ads60.vertamedia.com/ve/43C53990160C658B/66</Tracking></TrackingEvents><VideoClicks><ClickTracking>http://ads60.vertamedia.com/ve/43C53990160C658B/71</ClickTracking></VideoClicks></Linear></Creative></Creatives></Wrapper></Ad></VAST>

Binary file not shown.

View File

@ -893,8 +893,9 @@ datagen | zstd -c | zstd -t
println "\n===> golden files tests "
zstd -t -r "$TESTDIR/golden-compression"
zstd -t -r "$TESTDIR/golden-decompression"
zstd -c -r "$TESTDIR/golden-compression" | zstd -t
zstd -D "$TESTDIR/golden-dictionaries/http-dict-missing-symbols" "$TESTDIR/golden-compression/http" -c | zstd -D "$TESTDIR/golden-dictionaries/http-dict-missing-symbols" -t
println "\n===> benchmark mode tests "