From 05430b25a860534f5cac2629d97b2d6a34922c89 Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Tue, 14 Dec 2021 02:12:09 -0800
Subject: [PATCH] roll SSE implementation of row_lazy match finder

Mostly for maintenance convenience.
Performance-wise, there is very little change:
slightly faster for slog 3 & 4,
neutral or very slightly negative for slog 5 & 6.
---
 lib/compress/zstd_lazy.c | 80 ++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 41 deletions(-)

diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index 5326ec5f..a585b79d 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -417,7 +417,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
     U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
     U32* const tmpHashTable = hashTable;
     U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
-    U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
+    U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
     U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
     U32 hashIdx;
 
@@ -982,47 +982,45 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
     ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
 }
 
+#if defined(ZSTD_ARCH_X86_SSE2)
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
+{
+    const __m128i comparisonMask = _mm_set1_epi8((char)tag);
+    int matches[4] = {0};
+    int i;
+    assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
+    for (i=0; i<nbChunks; i++) {
+        const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));
+        const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
+        matches[i] = _mm_movemask_epi8(equalMask);
+    }
+    if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);
+    if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
+    assert(nbChunks == 4);
+    return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
+}
+#endif
[...]
                 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
@@ -1670,7 +1668,7 @@ ZSTD_compressBlock_lazy_generic(
         }
         /* store sequence */
 _storeSequence:
-        {   size_t const litLength = start - anchor;
+        {   size_t const litLength = (size_t)(start - anchor);
             ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
             anchor = ip = start + matchLength;
         }
@@ -2003,7 +2001,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 
         /* catch up */
         if (offset) {
-            U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+            U32 const matchIndex = (U32)((size_t)(start-base) - (offset - ZSTD_REP_MOVE));
             const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
             const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
             while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
@@ -2012,7 +2010,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 
         /* store sequence */
 _storeSequence:
-        {   size_t const litLength = start - anchor;
+        {   size_t const litLength = (size_t)(start - anchor);
             ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
             anchor = ip = start + matchLength;
         }
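
Note on the change (not part of the patch): the rolled loop computes the same per-row match bitmask as the three unrolled 16/32/64-entry branches it replaces. Each 16-byte chunk of tags is compared against the splatted query tag (_mm_cmpeq_epi8), compressed to 16 bits (_mm_movemask_epi8), the chunk masks are concatenated, and the result is rotated right by head so the row's circular insertion point lands at bit 0. The standalone sketch below cross-checks that idea against a plain scalar reference; the helper names (getSSEMask, rotateRightN, scalarMask) and the test data are local stand-ins for illustration, not zstd's internal API.

/*
 * Standalone sketch (illustration only): mirrors the rolled SSE2
 * tag-matching loop from the patch and cross-checks it against a
 * scalar reference. Build: cc -O2 -msse2 sse_mask_check.c
 */
#include <assert.h>
#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>
#include <stdio.h>

/* Rotate `value` right by `count` bits within a field of `bits` bits.
 * The row is conceptually circular: rotating by `head` re-bases the
 * match mask so that the row's insertion point becomes bit 0. */
static uint64_t rotateRightN(uint64_t value, uint32_t count, uint32_t bits)
{
    uint64_t const fieldMask = (bits == 64) ? ~0ULL : (1ULL << bits) - 1;
    assert(count < bits);
    if (count == 0) return value;
    return ((value >> count) | (value << (bits - count))) & fieldMask;
}

/* Rolled loop: 1, 2 or 4 chunks of 16 tags, one SSE2 compare per chunk. */
static uint64_t getSSEMask(int nbChunks, const uint8_t* src, uint8_t tag, uint32_t head)
{
    const __m128i comparisonMask = _mm_set1_epi8((char)tag);
    int matches[4] = {0};
    int i;
    assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
    for (i = 0; i < nbChunks; i++) {
        const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));
        const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
        matches[i] = _mm_movemask_epi8(equalMask);   /* 16 match bits per chunk */
    }
    if (nbChunks == 1) return rotateRightN((uint16_t)matches[0], head, 16);
    if (nbChunks == 2) return rotateRightN((uint32_t)matches[1] << 16 | (uint32_t)matches[0], head, 32);
    return rotateRightN(  (uint64_t)matches[3] << 48 | (uint64_t)matches[2] << 32
                        | (uint64_t)matches[1] << 16 | (uint64_t)matches[0], head, 64);
}

/* Scalar reference: bit n of the result = (src[n] == tag), then rotate. */
static uint64_t scalarMask(uint32_t nbEntries, const uint8_t* src, uint8_t tag, uint32_t head)
{
    uint64_t mask = 0;
    uint32_t n;
    for (n = 0; n < nbEntries; n++)
        mask |= (uint64_t)(src[n] == tag) << n;
    return rotateRightN(mask, head, nbEntries);
}

int main(void)
{
    uint8_t row[64];
    uint32_t rowEntries, head, n;
    for (n = 0; n < 64; n++) row[n] = (uint8_t)(n % 7);   /* arbitrary repeating tags */
    for (rowEntries = 16; rowEntries <= 64; rowEntries *= 2)
        for (head = 0; head < rowEntries; head++)
            assert(getSSEMask((int)(rowEntries / 16), row, 3, head)
                   == scalarMask(rowEntries, row, 3, head));
    printf("rolled SSE2 mask == scalar reference for 16/32/64 entries\n");
    return 0;
}

Since the chunk count reaching ZSTD_row_getSSEMask is compile-time constant in each FORCE_INLINE_TEMPLATE-instantiated search variant, the loop can be expected to unroll back to roughly the previous code shape after inlining, which is consistent with the near-neutral performance numbers quoted above.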