diff --git a/lib/zdict.c b/lib/zdict.c index c99cabe1..a643f4f5 100644 --- a/lib/zdict.c +++ b/lib/zdict.c @@ -574,7 +574,6 @@ static void ZDICT_fillNoise(void* buffer, size_t length) { unsigned acc = PRIME1; size_t p=0;; - for (p=0; p> 21); @@ -594,30 +593,37 @@ static void ZDICT_countEStats(EStats_ress_t esr, U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, const void* src, size_t srcSize) { - const BYTE* bytePtr; - const U32* u32Ptr; - seqStore_t seqStore; + const seqStore_t* seqStorePtr; if (srcSize > ZSTD_BLOCKSIZE_MAX) srcSize = ZSTD_BLOCKSIZE_MAX; /* protection vs large samples */ ZSTD_copyCCtx(esr.zc, esr.ref); ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); - seqStore = ZSTD_copySeqStore(esr.zc); + seqStorePtr = ZSTD_getSeqStore(esr.zc); - /* count stats */ - for(bytePtr = seqStore.litStart; bytePtr < seqStore.lit; bytePtr++) - countLit[*bytePtr]++; - for(u32Ptr = seqStore.offsetStart; u32Ptr < seqStore.offset; u32Ptr++) { - BYTE offcode = (BYTE)ZSTD_highbit(*u32Ptr) + 1; - if (*u32Ptr==0) offcode=0; - offsetcodeCount[offcode]++; + /* literals stats */ + { const BYTE* bytePtr; + for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++) + countLit[*bytePtr]++; } - (void)matchlengthCount; (void)litlengthCount; - /* - for(bytePtr = seqStore.matchLengthStart; bytePtr < seqStore.matchLength; bytePtr++) - matchlengthCount[*bytePtr]++; - for(bytePtr = seqStore.litLengthStart; bytePtr < seqStore.litLength; bytePtr++) - litlengthCount[*bytePtr]++; - */ + + /* seqStats */ + { size_t const nbSeq = (size_t)(seqStorePtr->offset - seqStorePtr->offsetStart); + ZSTD_seqToCodes(seqStorePtr, nbSeq); + + { const BYTE* codePtr = seqStorePtr->offCodeStart; + size_t u; + for (u=0; umlCodeStart; + size_t u; + for (u=0; ullCodeStart; + size_t u; + for (u=0; u= 3) { - const U32 nb = 25; + U32 const nb = 25; + U32 const dictContentSize = ZDICT_dictSize(dictList); U32 u; - U32 dictContentSize = ZDICT_dictSize(dictList); DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize); DISPLAYLEVEL(3, "list %u best segments \n", nb); for (u=1; u<=nb; u++) { @@ -850,8 +862,7 @@ size_t ZDICT_trainFromBuffer_unsafe( } } } /* create dictionary */ - { - U32 dictContentSize = ZDICT_dictSize(dictList); + { U32 dictContentSize = ZDICT_dictSize(dictList); size_t hSize; BYTE* ptr; U32 u; @@ -896,31 +907,32 @@ size_t ZDICT_trainFromBuffer_unsafe( } +/* issue : samplesBuffer need to be followed by a noisy guard band. +* work around : duplicate the buffer, and add the noise */ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_params_t params) { - size_t sBuffSize; void* newBuff; - size_t result; + size_t sBuffSize; { unsigned u; for (u=0, sBuffSize=0; u no dictionary */ newBuff = malloc(sBuffSize + NOISELENGTH); if (!newBuff) return ERROR(memory_allocation); memcpy(newBuff, samplesBuffer, sBuffSize); ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ - result = ZDICT_trainFromBuffer_unsafe(dictBuffer, dictBufferCapacity, + { size_t const result = ZDICT_trainFromBuffer_unsafe( + dictBuffer, dictBufferCapacity, newBuff, samplesSizes, nbSamples, params); - free(newBuff); - return result; + free(newBuff); + return result; } } -/* issue : samplesBuffer need to be followed by a noisy guard band. -* work around : duplicate the buffer, and add the noise ? */ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) { diff --git a/lib/zstd_compress.c b/lib/zstd_compress.c index 90857a82..9f2a28f5 100644 --- a/lib/zstd_compress.c +++ b/lib/zstd_compress.c @@ -127,9 +127,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) return 0; /* reserved as a potential error code in the future */ } -seqStore_t ZSTD_copySeqStore(const ZSTD_CCtx* ctx) /* hidden interface */ +const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) /* hidden interface */ { - return ctx->seqStore; + return &(ctx->seqStore); } @@ -569,11 +569,59 @@ static size_t ZSTD_compressLiterals (ZSTD_CCtx* zc, ostart[4] = (BYTE)(cLitSize); break; } - return lhSize+cLitSize; } +void ZSTD_seqToCodes(const seqStore_t* seqStorePtr, size_t const nbSeq) +{ + /* LL codes */ + { static const BYTE LL_Code[64] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 16, 17, 17, 18, 18, 19, 19, + 20, 20, 20, 20, 21, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24 }; + const BYTE LL_deltaCode = 19; + U16* const llTable = seqStorePtr->litLengthStart; + BYTE* const llCodeTable = seqStorePtr->llCodeStart; + size_t u; + for (u=0; ulongLength; llTable[u] = (U16)ll; } + llCodeTable[u] = (ll>63) ? (BYTE)ZSTD_highbit(ll) + LL_deltaCode : LL_Code[ll]; + } } + + /* Offset codes */ + { const U32* const offsetTable = seqStorePtr->offsetStart; + BYTE* const ofCodeTable = seqStorePtr->offCodeStart; + size_t u; + for (u=0; umatchLengthStart; + BYTE* const mlCodeTable = seqStorePtr->mlCodeStart; + size_t u; + for (u=0; ulongLength; mlTable[u] = (U16)ml; } + mlCodeTable[u] = (ml>127) ? (BYTE)ZSTD_highbit(ml) + ML_deltaCode : ML_Code[ml]; + } } +} + + size_t ZSTD_compressSequences(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, size_t srcSize) @@ -619,22 +667,8 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* zc, #define MIN_SEQ_FOR_DYNAMIC_FSE 64 #define MAX_SEQ_FOR_STATIC_FSE 1000 - /* LL codes */ - { static const BYTE LL_Code[64] = { 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 16, 17, 17, 18, 18, 19, 19, - 20, 20, 20, 20, 21, 21, 21, 21, - 22, 22, 22, 22, 22, 22, 22, 22, - 23, 23, 23, 23, 23, 23, 23, 23, - 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24 }; - const BYTE LL_deltaCode = 19; - size_t u; - for (u=0; ulongLength; llTable[u] = (U16)ll; } - llCodeTable[u] = (ll>63) ? (BYTE)ZSTD_highbit(ll) + LL_deltaCode : LL_Code[ll]; - } } + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr, nbSeq); /* CTable for Literal Lengths */ { U32 max = MaxLL; @@ -660,9 +694,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* zc, LLtype = FSE_ENCODING_DYNAMIC; } } - /* Offset codes */ - { size_t i; for (i=0; i 2)) { @@ -686,23 +718,6 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* zc, Offtype = FSE_ENCODING_DYNAMIC; } } - /* ML codes */ - { static const BYTE ML_Code[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, - 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, - 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, - 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, - 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, - 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 }; - const BYTE ML_deltaCode = 36; - size_t u; - for (u=0; ulongLength; mlTable[u] = (U16)ml; } - mlCodeTable[u] = (ml>127) ? (BYTE)ZSTD_highbit(ml) + ML_deltaCode : ML_Code[ml]; - } } - /* CTable for MatchLengths */ { U32 max = MaxML; size_t const mostFrequent = FSE_countFast(count, &max, mlCodeTable, nbSeq); diff --git a/lib/zstd_internal.h b/lib/zstd_internal.h index 4ce44399..ff271340 100644 --- a/lib/zstd_internal.h +++ b/lib/zstd_internal.h @@ -236,7 +236,8 @@ typedef struct { #endif } seqStore_t; -seqStore_t ZSTD_copySeqStore(const ZSTD_CCtx* ctx); +const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); +void ZSTD_seqToCodes(const seqStore_t* seqStorePtr, size_t const nbSeq); #endif /* ZSTD_CCOMMON_H_MODULE */ diff --git a/programs/fileio.c b/programs/fileio.c index 907d990a..ff77a8a9 100644 --- a/programs/fileio.c +++ b/programs/fileio.c @@ -333,28 +333,30 @@ static int FIO_compressFilename_internal(cRess_t ress, { FILE* srcFile = ress.srcFile; FILE* dstFile = ress.dstFile; - U64 filesize = 0; + U64 readsize = 0; U64 compressedfilesize = 0; size_t dictSize = ress.dictBufferSize; size_t sizeCheck, errorCode; ZSTD_parameters params; /* init */ - filesize = MAX(FIO_getFileSize(srcFileName),dictSize); - params = ZSTD_getParams(cLevel, filesize); - params.srcSize = filesize; + { U64 const filesize = FIO_getFileSize(srcFileName); + U64 const levelsize = MAX(FIO_getFileSize(srcFileName), dictSize); + params = ZSTD_getParams(cLevel, levelsize); + params.srcSize = filesize; + } if (g_maxWLog) if (params.windowLog > g_maxWLog) params.windowLog = g_maxWLog; errorCode = ZBUFF_compressInit_advanced(ress.ctx, ress.dictBuffer, ress.dictBufferSize, params); if (ZBUFF_isError(errorCode)) EXM_THROW(21, "Error initializing compression : %s", ZBUFF_getErrorName(errorCode)); /* Main compression loop */ - filesize = 0; + readsize = 0; while (1) { /* Fill input Buffer */ - size_t inSize = fread(ress.srcBuffer, (size_t)1, ress.srcBufferSize, srcFile); + size_t const inSize = fread(ress.srcBuffer, (size_t)1, ress.srcBufferSize, srcFile); if (inSize==0) break; - filesize += inSize; - DISPLAYUPDATE(2, "\rRead : %u MB ", (U32)(filesize>>20)); + readsize += inSize; + DISPLAYUPDATE(2, "\rRead : %u MB ", (U32)(readsize>>20)); { /* Compress using buffered streaming */ size_t usedInSize = inSize; @@ -371,13 +373,12 @@ static int FIO_compressFilename_internal(cRess_t ress, if (sizeCheck!=cSize) EXM_THROW(25, "Write error : cannot write compressed block into %s", dstFileName); compressedfilesize += cSize; } - DISPLAYUPDATE(2, "\rRead : %u MB ==> %.2f%% ", (U32)(filesize>>20), (double)compressedfilesize/filesize*100); + DISPLAYUPDATE(2, "\rRead : %u MB ==> %.2f%% ", (U32)(readsize>>20), (double)compressedfilesize/readsize*100); } /* End of Frame */ - { - size_t cSize = ress.dstBufferSize; - size_t result = ZBUFF_compressEnd(ress.ctx, ress.dstBuffer, &cSize); + { size_t cSize = ress.dstBufferSize; + size_t const result = ZBUFF_compressEnd(ress.ctx, ress.dstBuffer, &cSize); if (result!=0) EXM_THROW(26, "Compression error : cannot create frame end"); sizeCheck = fwrite(ress.dstBuffer, 1, cSize, dstFile); @@ -388,7 +389,7 @@ static int FIO_compressFilename_internal(cRess_t ress, /* Status */ DISPLAYLEVEL(2, "\r%79s\r", ""); DISPLAYLEVEL(2,"Compressed %llu bytes into %llu bytes ==> %.2f%%\n", - (unsigned long long) filesize, (unsigned long long) compressedfilesize, (double)compressedfilesize/filesize*100); + (unsigned long long)readsize, (unsigned long long) compressedfilesize, (double)compressedfilesize/readsize*100); return 0; } diff --git a/programs/playTests.sh b/programs/playTests.sh index 444d91eb..aa0ffc3a 100755 --- a/programs/playTests.sh +++ b/programs/playTests.sh @@ -25,7 +25,7 @@ roundTripTest() { echo "\n**** simple tests **** " ./datagen > tmp -$ZSTD tmp +$ZSTD -f tmp $ZSTD -99 tmp && die "too large compression level undetected" $ZSTD tmp -c > tmpCompressed $ZSTD tmp --stdout > tmpCompressed @@ -71,6 +71,11 @@ echo "\n**** dictionary tests **** " ./datagen -g1M | md5sum > tmp1 ./datagen -g1M | $ZSTD -D tmpDict | $ZSTD -D tmpDict -dvq | md5sum > tmp2 diff -q tmp1 tmp2 +$ZSTD --train *.c *.h -o tmpDict +$ZSTD xxhash.c -D tmpDict -of tmp +$ZSTD -d tmp -D tmpDict -of result +diff xxhash.c result + echo "\n**** multiple files tests **** "