From 25bda9053add6218a58d88c5b8119afa63165231 Mon Sep 17 00:00:00 2001 From: TrianglesPCT Date: Fri, 14 May 2021 16:32:04 -0600 Subject: [PATCH 01/11] Add files via upload msvc suport avx2 path --- lib/compress/zstd_lazy.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index 3d523e84..6df85e47 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -873,7 +873,7 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */ -#if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */ +#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) ||defined(__AVX__)) /* SIMD SSE version*/ #include typedef __m128i ZSTD_Vec128; @@ -894,7 +894,7 @@ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) { static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) { return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); } - +#if !defined(__AVX2__) typedef struct { __m128i fst; __m128i snd; @@ -921,6 +921,28 @@ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd); return fstMask | (sndMask << 16); } +#else//AVX2 +typedef struct { + __m256i v; +} ZSTD_Vec256; + +static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) { + ZSTD_Vec256 v; + v.v = _mm256_load_si256((const __m256i*)ptr); + return v; +} + +static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) { + ZSTD_Vec256 v; + v.v = _mm256_set1_epi32(val); + return v; +} + +static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { + return (ZSTD_VecMask)_mm256_movemask_epi8(_mm256_cmpeq_epi8(x.v, y.v)); +} + +#endif #elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */ From 52f44bb365337054d30a8e0edf83dd7c612b4d32 Mon Sep 17 00:00:00 2001 From: TrianglesPCT Date: Fri, 14 May 2021 16:33:07 -0600 Subject: [PATCH 02/11] Add files via upload msvc --- lib/compress/zstd_compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index b7ee2980..27a2798a 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -222,7 +222,7 @@ static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_useR /* Returns row matchfinder usage enum given an initial mode and cParams */ static ZSTD_useRowMatchFinderMode_e ZSTD_resolveRowMatchFinderMode(ZSTD_useRowMatchFinderMode_e mode, const ZSTD_compressionParameters* const cParams) { -#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) || defined(__ARM_NEON)) +#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) || defined(__AVX__) || defined(__ARM_NEON)) int const kHasSIMD128 = 1; #else int const kHasSIMD128 = 0; From 77d54eb3b3116bf9606426730de818f33907aec3 Mon Sep 17 00:00:00 2001 From: TrianglesPCT Date: Fri, 14 May 2021 16:40:32 -0600 Subject: [PATCH 03/11] Add files via upload --- lib/compress/zstd_compress.c | 2 +- lib/compress/zstd_lazy.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 27a2798a..584678f7 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -222,7 +222,7 @@ static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_useR /* Returns row matchfinder usage enum given an initial mode and cParams */ static ZSTD_useRowMatchFinderMode_e ZSTD_resolveRowMatchFinderMode(ZSTD_useRowMatchFinderMode_e mode, const ZSTD_compressionParameters* const cParams) { -#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) || defined(__AVX__) || defined(__ARM_NEON)) +#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) || defined(_M_AMD64) || defined(__ARM_NEON)) int const kHasSIMD128 = 1; #else int const kHasSIMD128 = 0; diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index 6df85e47..50861891 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -873,7 +873,7 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */ -#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) ||defined(__AVX__)) /* SIMD SSE version*/ +#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) ||defined(_M_AMD64)) /* SIMD SSE version*/ #include typedef __m128i ZSTD_Vec128; From 0b9f4bb0ff1f313bea9e9166f693ec64a3a6a43e Mon Sep 17 00:00:00 2001 From: TrianglesPCT Date: Fri, 14 May 2021 16:47:24 -0600 Subject: [PATCH 04/11] Update zstd_lazy.c use 8bit --- lib/compress/zstd_lazy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index 50861891..e445d5a6 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -934,7 +934,7 @@ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) { static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) { ZSTD_Vec256 v; - v.v = _mm256_set1_epi32(val); + v.v = _mm256_set1_epi8(val); return v; } From 69ac124b1209ccf12c8c5969f0d7a2124ccbe554 Mon Sep 17 00:00:00 2001 From: TrianglesPCT Date: Fri, 14 May 2021 16:53:19 -0600 Subject: [PATCH 05/11] Update zstd_lazy.c --- lib/compress/zstd_lazy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index e445d5a6..bccf2cc1 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -934,7 +934,7 @@ static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) { static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) { ZSTD_Vec256 v; - v.v = _mm256_set1_epi8(val); + v.v = _mm256_set1_epi8((char)val); return v; } From 0e071214b5721f3415611d07d33469f8026c3bb0 Mon Sep 17 00:00:00 2001 From: TrianglesPCT Date: Fri, 14 May 2021 17:03:30 -0600 Subject: [PATCH 06/11] Update zstd_lazy.c switch to unaligned load as I don't know if buffer will always be aligned to 32 bytes, and compilers aside from MSVC might actually use aligned loads --- lib/compress/zstd_lazy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index bccf2cc1..008b0a6b 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -928,7 +928,7 @@ typedef struct { static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) { ZSTD_Vec256 v; - v.v = _mm256_load_si256((const __m256i*)ptr); + v.v = _mm256_loadu_si256((const __m256i*)ptr); return v; } From 8f7ea1afeba1f3762e99413424df95b9faf4d2d8 Mon Sep 17 00:00:00 2001 From: TrianglesPCT Date: Fri, 14 May 2021 19:02:34 -0600 Subject: [PATCH 07/11] Update zstd_lazy.c Switch to other comment style --- lib/compress/zstd_lazy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index 008b0a6b..cbe2adae 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -921,7 +921,7 @@ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd); return fstMask | (sndMask << 16); } -#else//AVX2 +#else/* AVX2 */ typedef struct { __m256i v; } ZSTD_Vec256; From a62856bf65f381eb2f99d056005b4b39cb7c8725 Mon Sep 17 00:00:00 2001 From: TrianglesPCT Date: Fri, 14 May 2021 19:10:24 -0600 Subject: [PATCH 08/11] Update zstd_lazy.c Remove the AVX2 part --- lib/compress/zstd_lazy.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index cbe2adae..1add4209 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -894,7 +894,7 @@ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) { static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) { return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); } -#if !defined(__AVX2__) + typedef struct { __m128i fst; __m128i snd; @@ -921,28 +921,6 @@ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd); return fstMask | (sndMask << 16); } -#else/* AVX2 */ -typedef struct { - __m256i v; -} ZSTD_Vec256; - -static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) { - ZSTD_Vec256 v; - v.v = _mm256_loadu_si256((const __m256i*)ptr); - return v; -} - -static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) { - ZSTD_Vec256 v; - v.v = _mm256_set1_epi8((char)val); - return v; -} - -static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { - return (ZSTD_VecMask)_mm256_movemask_epi8(_mm256_cmpeq_epi8(x.v, y.v)); -} - -#endif #elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */ From bb1cdd8c63046e66de63cf76448868bbc1dc6b72 Mon Sep 17 00:00:00 2001 From: TrianglesPCT Date: Fri, 14 May 2021 19:11:28 -0600 Subject: [PATCH 09/11] Update zstd_lazy.c add space --- lib/compress/zstd_lazy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index 1add4209..1f220831 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -873,7 +873,7 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */ -#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) ||defined(_M_AMD64)) /* SIMD SSE version*/ +#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) || defined(_M_AMD64)) /* SIMD SSE version*/ #include typedef __m128i ZSTD_Vec128; From d688ab1e0cfcbe5a894f07bab4033978d99bebd3 Mon Sep 17 00:00:00 2001 From: TrianglesPCT Date: Fri, 14 May 2021 19:18:12 -0600 Subject: [PATCH 10/11] Add files via upload AVX2 --- lib/compress/zstd_lazy.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index 1f220831..29681462 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -873,7 +873,7 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */ -#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) || defined(_M_AMD64)) /* SIMD SSE version*/ +#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) ||defined(_M_AMD64)) /* SIMD SSE version*/ #include typedef __m128i ZSTD_Vec128; @@ -894,7 +894,7 @@ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) { static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) { return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); } - +#if !defined(__AVX2__) typedef struct { __m128i fst; __m128i snd; @@ -921,6 +921,27 @@ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd); return fstMask | (sndMask << 16); } +#else/* AVX2 */ +typedef struct { + __m256i v; +} ZSTD_Vec256; + +static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) { + ZSTD_Vec256 v; + v.v = _mm256_loadu_si256((const __m256i*)ptr); + return v; +} + +static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) { + ZSTD_Vec256 v; + v.v = _mm256_set1_epi8((char)val); + return v; +} + +static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { + return (ZSTD_VecMask)_mm256_movemask_epi8(_mm256_cmpeq_epi8(x.v, y.v)); +} +#endif #elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */ From bee0ef56475eb567b7c801812a1a5f8215b4ffd9 Mon Sep 17 00:00:00 2001 From: TrianglesPCT Date: Fri, 14 May 2021 19:23:13 -0600 Subject: [PATCH 11/11] Update zstd_lazy.c It put the changes back when I tried to make a separate pull request, i don't understand githubs interface at all. --- lib/compress/zstd_lazy.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index 29681462..1f220831 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -873,7 +873,7 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */ -#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) ||defined(_M_AMD64)) /* SIMD SSE version*/ +#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) || defined(_M_AMD64)) /* SIMD SSE version*/ #include typedef __m128i ZSTD_Vec128; @@ -894,7 +894,7 @@ static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) { static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) { return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); } -#if !defined(__AVX2__) + typedef struct { __m128i fst; __m128i snd; @@ -921,27 +921,6 @@ static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd); return fstMask | (sndMask << 16); } -#else/* AVX2 */ -typedef struct { - __m256i v; -} ZSTD_Vec256; - -static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) { - ZSTD_Vec256 v; - v.v = _mm256_loadu_si256((const __m256i*)ptr); - return v; -} - -static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) { - ZSTD_Vec256 v; - v.v = _mm256_set1_epi8((char)val); - return v; -} - -static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { - return (ZSTD_VecMask)_mm256_movemask_epi8(_mm256_cmpeq_epi8(x.v, y.v)); -} -#endif #elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */