From 94efb1749d5b6fd4ce19d7299c1014999bea28cd Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Sat, 3 Feb 2018 23:54:10 -0800
Subject: [PATCH 1/3] faster decoding in 32-bit mode for long offsets (tentative)

On my laptop:

Before:
./zstd32 -b --zstd=wlog=27 silesia.tar enwik8 -S
 3#silesia.tar : 211984896 -> 66683478 (3.179), 97.6 MB/s , 400.7 MB/s
 3#enwik8 : 100000000 -> 35643153 (2.806), 76.5 MB/s , 303.2 MB/s

After:
./zstd32 -b --zstd=wlog=27 silesia.tar enwik8 -S
 3#silesia.tar : 211984896 -> 66683478 (3.179), 97.4 MB/s , 435.0 MB/s
 3#enwik8 : 100000000 -> 35643153 (2.806), 76.2 MB/s , 338.1 MB/s

Mileage varies, depending on file and CPU type. But a general rule is:
x86 benefits less from "long-offset mode" than x64, maybe due to register pressure.
On "entropy", long-mode is _never_ a win for x86.
On my laptop though, it can be, depending on file and compression level
(enwik8 benefits more from "long-mode" than silesia).
---
 lib/decompress/zstd_decompress.c | 43 +++++++++++++++++---------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c
index 6cbce9be..1a1fb250 100644
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -23,7 +23,7 @@
 /*!
 * LEGACY_SUPPORT :
-* if set to 1, ZSTD_decompress() can decode older formats (v0.1+)
+* if set to 1+, ZSTD_decompress() can decode older formats (v0.1+)
 */
 #ifndef ZSTD_LEGACY_SUPPORT
 # define ZSTD_LEGACY_SUPPORT 0
@@ -235,8 +235,8 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
 
 /*-*************************************************************
-* Decompression section
-***************************************************************/
+ * Frame header decoding
+ ***************************************************************/
 
 /*! ZSTD_isFrame() :
  * Tells if the content of `buffer` starts with a valid Frame Identifier.
@@ -258,7 +258,7 @@ unsigned ZSTD_isFrame(const void* buffer, size_t size)
 
 /** ZSTD_frameHeaderSize_internal() :
  * srcSize must be large enough to reach header size fields.
- * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
+ * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless.
 * @return : size of the Frame Header
 *           or an error code, which can be tested with ZSTD_isError() */
 static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format)
@@ -481,6 +481,10 @@ static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t he
 }
 
 
+/*-*************************************************************
+ * Block decoding
+ ***************************************************************/
+
 /*! ZSTD_getcBlockSize() :
 * Provides the size of compressed block from block header `src` */
 size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
@@ -936,13 +940,14 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l
             ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
             ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
             assert(ofBits <= MaxOff);
-            if (MEM_32bits() && longOffsets) {
-                U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1);
+            if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+                U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
                 offset = OF_base[ofCode] + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
-                if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream);
+                BIT_reloadDStream(&seqState->DStream);
                 if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+                assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */
             } else {
-                offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits);   /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+                offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
                 if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
             }
         }
@@ -955,7 +960,7 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l
                 if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
                 seqState->prevOffset[1] = seqState->prevOffset[0];
                 seqState->prevOffset[0] = offset = temp;
-            } else {
+            } else {  /* offset == 0 */
                 offset = seqState->prevOffset[0];
             }
         } else {
@@ -967,16 +972,16 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e l
     }
 
     seq.matchLength = ML_base[mlCode]
-                    + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0);   /* <= 16 bits */
+                    + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0);   /* <= 16 bits */
     if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
         BIT_reloadDStream(&seqState->DStream);
     if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
         BIT_reloadDStream(&seqState->DStream);
-    /* Verify that there is enough bits to read the rest of the data in 64-bit mode. */
+    /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
     ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
 
     seq.litLength = LL_base[llCode]
-                  + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0);    /* <= 16 bits */
+                  + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0);    /* <= 16 bits */
     if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
 
@@ -1364,13 +1369,13 @@ static size_t ZSTD_decompressSequencesLong(
         FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
 
         /* prepare in advance */
-        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNbfParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)));
-    /* windowSize could be any value at this point, since it is only validated
-     * in the streaming API.
-     */
     DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
     if (srcSize >= ZSTD_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);
@@ -1429,7 +1430,9 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
         ip += litCSize;
         srcSize -= litCSize;
     }
-    if (frame && dctx->fParams.windowSize > (1<<23))
+    if ( frame /* windowSize exists */
+      && (dctx->fParams.windowSize > (1<<24))
+      && MEM_64bits() /* x86 benefits less from long mode than x64 */ )
         return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, isLongOffset);
     return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, isLongOffset);
 }

From 0170cf9a7af94d9ffdcdb5cd3ebc35fcd542f2ed Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Mon, 5 Feb 2018 11:46:02 -0800
Subject: [PATCH 2/3] minor : modified ZSTD_preserveUnsortedMark() to be more vectorization friendly

---
 lib/common/fse_decompress.c | 4 ++--
 lib/compress/zstd_lazy.c | 37 +++++++++++++++++++++-----------
 lib/decompress/zstd_decompress.c | 4 ++--
 3 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/lib/common/fse_decompress.c b/lib/common/fse_decompress.c
index 8e3f0035..4c66c3b7 100644
--- a/lib/common/fse_decompress.c
+++ b/lib/common/fse_decompress.c
@@ -139,8 +139,8 @@ size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned
     {   U32 u;
         for (u=0; usmaller:0*, meaning the next candidate will be considered smaller.
+ * to *sorted=>smaller:0*, meaning next candidate will be considered smaller.
 * This could be wrong, and result in data corruption.
+ *
 * On second thought, this corruption might be impossible,
- because unsorted elements are always at the beginning of the list,
- and squashing to zero reduce the list to a single element,
+ because unsorted elements stand at the beginning of the list,
+ and squashing to zero reduces the list to a single element,
 which needs to be sorted anyway.
 * I haven't spent much thoughts into this possible scenario,
- and just felt it was safer to implement ZSTD_preserveUnsortedMark() */
+ and just felt it was safer to implement ZSTD_preserveUnsortedMark()
+ *
+ * `size` : must be a positive multiple of ZSTD_ROWSIZE */
+#define ZSTD_ROWSIZE 16
 void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue)
 {
-    U32 u;
-    for (u=0; udtable;
         return 0;
     case set_repeat:
         if (!flagRepeatTable) return ERROR(corruption_detected);

From de68c2ff10f3800f5aec60abd26e63f00b5f32bc Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Wed, 7 Feb 2018 14:22:35 -0800
Subject: [PATCH 3/3] Merged ZSTD_preserveUnsortedMark() into ZSTD_reduceIndex()

as it's faster, due to one memory scan instead of two
(confirmed by microbenchmark).

Note : as ZSTD_reduceIndex() is rarely invoked,
it does not translate into a visible gain.
Consider it an exercise in auto-vectorization and micro-benchmarking.
---
 lib/compress/zstd_compress.c | 39 ++++++++++++++++---------
 lib/compress/zstd_compress_internal.h | 6 ++++
 lib/compress/zstd_lazy.c | 42 ---------------------------
 3 files changed, 32 insertions(+), 55 deletions(-)

diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 4a825f65..3f7ac849 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -1223,32 +1223,44 @@ size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long
 
 #define ZSTD_ROWSIZE 16
-/*! ZSTD_reduceTable_internal() :
- * reduce table indexes by `reducerValue`
- * presume table size is a multiple of ZSTD_ROWSIZE.
- * Helps auto-vectorization */
-static void ZSTD_reduceTable_internal (U32* const table, int const nbRows, U32 const reducerValue)
+/*! ZSTD_reduceTable() :
+ * reduce table indexes by `reducerValue`, or squash to zero.
+ * PreserveMark preserves "unsorted mark" for btlazy2 strategy.
+ * It must be set to a clear 0/1 value, to remove branch during inlining.
+ * Presume table size is a multiple of ZSTD_ROWSIZE
+ * to help auto-vectorization */
+FORCE_INLINE_TEMPLATE void
+ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark)
 {
+    int const nbRows = (int)size / ZSTD_ROWSIZE;
     int cellNb = 0;
     int rowNb;
+    assert((size & (ZSTD_ROWSIZE-1)) == 0);   /* multiple of ZSTD_ROWSIZE */
+    assert(size < (1U<<31));   /* can be casted to int */
     for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
         int column;
         for (column=0; columnappliedParams.cParams.strategy != ZSTD_fast) {
         U32 const chainSize = (U32)1 << zc->appliedParams.cParams.chainLog;
         if (zc->appliedParams.cParams.strategy == ZSTD_btlazy2)
-            ZSTD_preserveUnsortedMark(ms->chainTable, chainSize, reducerValue);
-        ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
+            ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue);
+        else
+            ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
     }
 
     if (ms->hashLog3) {
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index cf593658..35b153fa 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -32,6 +32,12 @@ extern "C" {
 ***************************************/
 static const U32 g_searchStrength = 8;
 #define HASH_READ_SIZE 8
+#define ZSTD_DUBT_UNSORTED_MARK 1   /* For btlazy2 strategy, index 1 now means "unsorted".
+        It could be confused for a real successor at index "1", if sorted as larger than its predecessor.
+        It's not a big deal though : candidate will just be sorted again.
+        Additionnally, candidate position 1 will be lost.
+        But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
+        The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy */
 
 /*-*************************************
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index 844afc05..8252135d 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -15,48 +15,6 @@
 /*-*************************************
 * Binary Tree search
 ***************************************/
-#define ZSTD_DUBT_UNSORTED_MARK 1   /* note : index 1 will now be confused with "unsorted" if sorted as larger than its predecessor.
-        It's not a big deal though : the candidate will just be considered unsorted, and be sorted again.
-        Additionnally, candidate position 1 will be lost.
-        But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
-        The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be misdhandled after table re-use with a different strategy */
-
-/*! ZSTD_preserveUnsortedMark() :
- * pre-emptively increase value of ZSTD_DUBT_UNSORTED_MARK before ZSTD_reduceTable()
- * so that combined operation preserves its value.
- * Without it, ZSTD_DUBT_UNSORTED_MARK==1 would be squashed to 0.
- * As a consequence, the list of unsorted elements would stop at first element,
- * removing candidates, resulting in a very small loss to compression ratio
- * (since overflow protection with ZSTD_reduceTable() is relatively rare).
- *
- * Another potential risk is that a position will be promoted from *unsorted*
- * to *sorted=>smaller:0*, meaning next candidate will be considered smaller.
- * This could be wrong, and result in data corruption.
- *
- * On second thought, this corruption might be impossible,
- * because unsorted elements stand at the beginning of the list,
- * and squashing to zero reduces the list to a single element,
- * which needs to be sorted anyway.
- * I haven't spent much thoughts into this possible scenario,
- * and just felt it was safer to implement ZSTD_preserveUnsortedMark()
- *
- * `size` : must be a positive multiple of ZSTD_ROWSIZE */
-#define ZSTD_ROWSIZE 16
-void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue)
-{
-    int cellNb = 0;
-    U32 const nbRows = size / ZSTD_ROWSIZE;
-    U32 rowNb;
-    assert((size % ZSTD_ROWSIZE) == 0);
-    for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
-        int column;
-        for (column=0; column
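
A note on the 32-bit long-offset path changed by patch 1: when ofBits reaches STREAM_ACCUMULATOR_MIN_32, the offset field is consumed in two reads, the high (ofBits - extraBits) bits first, then a stream reload, then the remaining extraBits low bits, with the static assert bounding extraBits by LONG_OFFSETS_MAX_EXTRA_BITS_32 so a single reload suffices. The standalone sketch below only illustrates why the recombination `(high << extraBits) + low` reproduces the same value as one wide read; it uses plain integers instead of the real BIT_readBitsFast()/BIT_reloadDStream() stream, and the names `field`, `oneShot` and `twoStep` are illustrative only, not part of the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t const ofBits    = 27;   /* e.g. an offset code needing windowLog=27 bits */
    uint32_t const extraBits = 5;    /* bounded by LONG_OFFSETS_MAX_EXTRA_BITS_32 in the patch */
    uint32_t const field     = 0x5A5A5A5u & ((1u << ofBits) - 1);   /* sample offset bit-field */

    /* one wide read : what a 64-bit accumulator can afford directly */
    uint32_t const oneShot  = field;

    /* two-step read (32-bit mode) : high bits first, then the low extraBits after a reload */
    uint32_t const highPart = field >> extraBits;                /* ofBits - extraBits bits */
    uint32_t const lowPart  = field & ((1u << extraBits) - 1);   /* extraBits bits */
    uint32_t const twoStep  = (highPart << extraBits) + lowPart;

    assert(twoStep == oneShot);
    printf("offset field %u reconstructed identically in two steps\n", twoStep);
    return 0;
}

Splitting the read this way is what lets a 32-bit accumulator handle offsets wider than it can guarantee after a single reload, at the cost of one extra reload on that path only.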
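For the table-reduction rework in patches 2 and 3, the idea is a single pass over the table organised in fixed-width rows of ZSTD_ROWSIZE cells, so the inner loop has a compile-time trip count and auto-vectorizes, while for btlazy2 the "unsorted" mark (index 1 = ZSTD_DUBT_UNSORTED_MARK) must survive the subtraction of reducerValue. The sketch below is a rough approximation of that combined pass, not the committed ZSTD_reduceTable_internal(): `reduceTable_demo` is a made-up name, and `preserveMark` is an ordinary runtime argument here, whereas the patch passes it as a constant 0/1 into a FORCE_INLINE_TEMPLATE function so the branch disappears after inlining.

#include <assert.h>
#include <stdint.h>

typedef uint32_t U32;

#define ZSTD_ROWSIZE            16
#define ZSTD_DUBT_UNSORTED_MARK  1

static void reduceTable_demo(U32* const table, U32 const size,
                             U32 const reducerValue, int const preserveMark)
{
    int const nbRows = (int)(size / ZSTD_ROWSIZE);
    int cellNb = 0;
    int rowNb;
    assert((size % ZSTD_ROWSIZE) == 0);   /* fixed-width rows => vectorizable inner loop */
    for (rowNb = 0; rowNb < nbRows; rowNb++) {
        int column;
        for (column = 0; column < ZSTD_ROWSIZE; column++) {
            /* keep the btlazy2 "unsorted" mark alive across the reduction */
            if (preserveMark && (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK))
                table[cellNb] = ZSTD_DUBT_UNSORTED_MARK + reducerValue;
            /* reduce index by reducerValue, squashing to zero on underflow */
            if (table[cellNb] < reducerValue) table[cellNb] = 0;
            else table[cellNb] -= reducerValue;
            cellNb++;
        }
    }
}

int main(void)
{
    U32 table[ZSTD_ROWSIZE] = { ZSTD_DUBT_UNSORTED_MARK, 1000, 5000, 12345 };  /* remaining cells are zero */
    reduceTable_demo(table, ZSTD_ROWSIZE, 4000, 1 /*preserveMark*/);
    assert(table[0] == ZSTD_DUBT_UNSORTED_MARK);  /* mark preserved */
    assert(table[1] == 0);                        /* below reducerValue : squashed */
    assert(table[2] == 1000);                     /* 5000 - 4000 */
    return 0;
}

Keeping the inner loop free of data-dependent control flow (the mark handling and the clamp are both simple select patterns) is what makes the single scan both vectorizable and cheaper than the previous two passes.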