From 52a0622beb1812dcc39da7c085e4793569fa07e6 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 15 Jun 2016 13:53:34 +0200 Subject: [PATCH] RepsCodes are saved into Dict (incomplete: need decompression to regenerate them) --- lib/common/zstd_internal.h | 2 +- lib/compress/fse_compress.c | 2 +- lib/compress/zstd_compress.c | 28 ++++---- lib/dictBuilder/zdict.c | 126 +++++++++++++++++++++++++---------- lib/dictBuilder/zdict.h | 2 +- 5 files changed, 111 insertions(+), 49 deletions(-) diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h index 17ae1a77..0909955a 100644 --- a/lib/common/zstd_internal.h +++ b/lib/common/zstd_internal.h @@ -64,7 +64,7 @@ #endif #define ZSTD_OPT_NUM (1<<12) -#define ZSTD_DICT_MAGIC 0xEC30A437 +#define ZSTD_DICT_MAGIC 0xEC30A437 /* v0.7 */ #define ZSTD_REP_NUM 3 #define ZSTD_REP_INIT ZSTD_REP_NUM diff --git a/lib/compress/fse_compress.c b/lib/compress/fse_compress.c index 5c804dca..192d5502 100644 --- a/lib/compress/fse_compress.c +++ b/lib/compress/fse_compress.c @@ -256,7 +256,7 @@ static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize, bitStream += count << bitCount; bitCount += nbBits; bitCount -= (count>=1; } if (bitCount>16) { diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 220fadc6..b1edaff3 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2342,45 +2342,49 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_CCtx* zc, const void* src, size_t static size_t ZSTD_loadDictEntropyStats(ZSTD_CCtx* zc, const void* dict, size_t dictSize) { /* note : magic number already checked */ - size_t const dictSizeStart = dictSize; + const BYTE* dictPtr = (const BYTE*)dict; + const BYTE* const dictEnd = dictPtr + dictSize; { size_t const hufHeaderSize = HUF_readCTable(zc->hufTable, 255, dict, dictSize); if (HUF_isError(hufHeaderSize)) return ERROR(dictionary_corrupted); - dict = (const char*)dict + hufHeaderSize; - dictSize -= hufHeaderSize; + dictPtr 
+= hufHeaderSize; } { short offcodeNCount[MaxOff+1]; unsigned offcodeMaxValue = MaxOff, offcodeLog = OffFSELog; - size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dict, dictSize); + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted); { size_t const errorCode = FSE_buildCTable(zc->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog); if (FSE_isError(errorCode)) return ERROR(dictionary_corrupted); } - dict = (const char*)dict + offcodeHeaderSize; - dictSize -= offcodeHeaderSize; + dictPtr += offcodeHeaderSize; } { short matchlengthNCount[MaxML+1]; unsigned matchlengthMaxValue = MaxML, matchlengthLog = MLFSELog; - size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dict, dictSize); + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted); { size_t const errorCode = FSE_buildCTable(zc->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog); if (FSE_isError(errorCode)) return ERROR(dictionary_corrupted); } - dict = (const char*)dict + matchlengthHeaderSize; - dictSize -= matchlengthHeaderSize; + dictPtr += matchlengthHeaderSize; } { short litlengthNCount[MaxLL+1]; unsigned litlengthMaxValue = MaxLL, litlengthLog = LLFSELog; - size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dict, dictSize); + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted); { size_t const errorCode = FSE_buildCTable(zc->litlengthCTable, litlengthNCount, litlengthMaxValue, 
litlengthLog); if (FSE_isError(errorCode)) return ERROR(dictionary_corrupted); } - dictSize -= litlengthHeaderSize; + dictPtr += litlengthHeaderSize; } + if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted); + zc->rep[0] = MEM_readLE32(dictPtr+0); if (zc->rep[0] >= dictSize) return ERROR(dictionary_corrupted); + zc->rep[1] = MEM_readLE32(dictPtr+4); if (zc->rep[1] >= dictSize) return ERROR(dictionary_corrupted); + zc->rep[2] = MEM_readLE32(dictPtr+8); if (zc->rep[2] >= dictSize) return ERROR(dictionary_corrupted); + dictPtr += 12; + zc->flagStaticTables = 1; - return (dictSizeStart-dictSize); + return dictPtr - (const BYTE*)dict; } /** ZSTD_compress_insertDictionary() : diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index 2e15cbbf..ace0fc15 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -578,9 +578,10 @@ typedef struct void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ } EStats_ress_t; +#define MAXREPOFFSET 1024 static void ZDICT_countEStats(EStats_ress_t esr, - U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, + U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, const void* src, size_t srcSize) { const seqStore_t* seqStorePtr; @@ -614,6 +615,17 @@ static void ZDICT_countEStats(EStats_ress_t esr, size_t u; for (u=0; uoffsetStart; + U32 offset1 = offsetPtr[0] - 3; + U32 offset2 = offsetPtr[1] - 3; + if (offset1 >= MAXREPOFFSET) offset1 = 0; + if (offset2 >= MAXREPOFFSET) offset2 = 0; + repOffsets[offset1] += 3; + repOffsets[offset2] += 1; + } + } /* @@ -629,12 +641,29 @@ static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles) static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles) { - size_t total; + size_t total=0; unsigned u; - for (u=0, total=0; u0; u--) { + offsetCount_t tmp; + if (table[u-1].count >= table[u].count) break; + tmp = table[u-1]; + table[u-1] = table[u]; + table[u] = tmp; + 
} +} + + #define OFFCODE_MAX 18 /* only applicable to first block */ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, unsigned compressionLevel, @@ -649,6 +678,8 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, short matchLengthNCount[MaxML+1]; U32 litLengthCount[MaxLL+1]; short litLengthNCount[MaxLL+1]; + U32 repOffset[MAXREPOFFSET] = { 0 }; + offsetCount_t bestRepOffset[ZSTD_REP_NUM+1]; EStats_ress_t esr; ZSTD_parameters params; U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total; @@ -656,12 +687,15 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, size_t eSize = 0; size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles); size_t const averageSampleSize = totalSrcSize / nbFiles; + BYTE* dstPtr = (BYTE*)dstBuffer; /* init */ for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */ for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1; for (u=0; u<=MaxML; u++) matchLengthCount[u]=1; for (u=0; u<=MaxLL; u++) litLengthCount[u]=1; + repOffset[1] = repOffset[4] = repOffset[8] = 1; + memset(bestRepOffset, 0, sizeof(bestRepOffset)); esr.ref = ZSTD_createCCtx(); esr.zc = ZSTD_createCCtx(); esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); @@ -679,7 +713,7 @@ static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize, /* collect stats on all files */ for (u=0; u