SIMDE was introduced for aarch64 support; however, the library itself also provides a non-SIMD fallback, which lets us support other platforms without any code changes. There is another world beyond x86, so we can simply enable SIMDE for processors without SSE2 support.

Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
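To make the fallback concrete, here is a hypothetical caller (illustrative only, not part of this patch; the include path and helper name are made up). The same translation unit builds against native SSE2 on x86, NEON on aarch64, or the plain C loops in this header on any other target:

#include <stdint.h>
#include <string.h>
#include "sse2.h"

/* Adds four packed 32-bit integers through the simde wrappers; on CPUs
 * without SSE2 this compiles down to the portable loop fallback. */
static void add_four_ints(int32_t dst[4], const int32_t x[4], const int32_t y[4])
{
	simde__m128i a = simde_mm_loadu_si128((const simde__m128i *)x);
	simde__m128i b = simde_mm_loadu_si128((const simde__m128i *)y);
	simde__m128i sum = simde_mm_add_epi32(a, b);
	memcpy(dst, &sum, sizeof(sum));
}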
/* Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2017      Evan Nemerson <evan@nemerson.com>
 *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
 *   2015      Brandon Rowlett <browlett@nvidia.com>
 *   2015      Ken Fast <kfast@gdeb.com>
 *   2017      Hasindu Gamaarachchi <hasindu@unsw.edu.au>
 *   2018      Jeff Daily <jeff.daily@amd.com>
 */

#if !defined(SIMDE__SSE2_H)
#if !defined(SIMDE__SSE2_H)
#define SIMDE__SSE2_H
#endif
#include "sse.h"

#if defined(SIMDE_SSE2_NATIVE)
#undef SIMDE_SSE2_NATIVE
#endif
#if defined(SIMDE_SSE2_FORCE_NATIVE)
#define SIMDE_SSE2_NATIVE
#elif defined(__SSE2__) && !defined(SIMDE_SSE2_NO_NATIVE) && \
	!defined(SIMDE_NO_NATIVE)
#define SIMDE_SSE2_NATIVE
#elif defined(__ARM_NEON) && !defined(SIMDE_SSE2_NO_NEON) && \
	!defined(SIMDE_NO_NEON)
#define SIMDE_SSE2_NEON
#endif

#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_SSE_NATIVE)
#if defined(SIMDE_SSE2_FORCE_NATIVE)
#error Native SSE2 support requires native SSE support
#else
#warning Native SSE2 support requires native SSE support, disabling
#undef SIMDE_SSE2_NATIVE
#endif
#elif defined(SIMDE_SSE2_NEON) && !defined(SIMDE_SSE_NEON)
#warning SSE2 NEON support requires SSE NEON support, disabling
#undef SIMDE_SSE2_NEON
#endif

#if defined(SIMDE_SSE2_NATIVE)
#include <emmintrin.h>
#else
#if defined(SIMDE_SSE2_NEON)
#include <arm_neon.h>
#endif
#endif

#include <stdint.h>
#include <limits.h>
#include <string.h>

#define vreinterpretq_m128i_s32(v) \
	(simde__m128i) { .neon_i32 = v }
#define vreinterpretq_m128i_u64(v) \
	(simde__m128i) { .neon_u64 = v }

#define vreinterpretq_s32_m128i(a) a.neon_i32
#define vreinterpretq_u64_m128i(a) a.neon_u64

SIMDE__BEGIN_DECLS

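/* Portable 128-bit vector types.  Each union below exposes the same 16
 * bytes through GCC vector extensions or plain arrays, alongside the
 * native __m128i/__m128d view on SSE2 builds and NEON register views on
 * ARM, so the implementations that follow can use whichever
 * representation is available. */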
typedef SIMDE_ALIGN(16) union {
|
|
#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
|
|
int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
|
|
int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
|
|
int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
|
|
int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
|
|
uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
|
|
uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
|
|
uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
|
|
uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
|
|
#if defined(SIMDE__HAVE_INT128)
|
|
simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__));
|
|
simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__));
|
|
#endif
|
|
simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
|
|
simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
|
|
#else
|
|
int8_t i8[16];
|
|
int16_t i16[8];
|
|
int32_t i32[4];
|
|
int64_t i64[2];
|
|
uint8_t u8[16];
|
|
uint16_t u16[8];
|
|
uint32_t u32[4];
|
|
uint64_t u64[2];
|
|
#if defined(SIMDE__HAVE_INT128)
|
|
simde_int128 i128[1];
|
|
simde_uint128 u128[1];
|
|
#endif
|
|
simde_float32 f32[4];
|
|
simde_float64 f64[2];
|
|
#endif
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
__m128i n;
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int8x16_t neon_i8;
|
|
int16x8_t neon_i16;
|
|
int32x4_t neon_i32;
|
|
int64x2_t neon_i64;
|
|
uint8x16_t neon_u8;
|
|
uint16x8_t neon_u16;
|
|
uint32x4_t neon_u32;
|
|
uint64x2_t neon_u64;
|
|
float32x4_t neon_f32;
|
|
#if defined(SIMDE_ARCH_AMD64)
|
|
float64x2_t neon_f64;
|
|
#endif
|
|
#endif
|
|
} simde__m128i;
|
|
|
|
typedef SIMDE_ALIGN(16) union {
|
|
#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
|
|
int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
|
|
int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
|
|
int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
|
|
int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
|
|
uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
|
|
uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
|
|
uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
|
|
uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
|
|
simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
|
|
simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
|
|
#else
|
|
int8_t i8[16];
|
|
int16_t i16[8];
|
|
int32_t i32[4];
|
|
int64_t i64[2];
|
|
uint8_t u8[16];
|
|
uint16_t u16[8];
|
|
uint32_t u32[4];
|
|
uint64_t u64[2];
|
|
simde_float32 f32[4];
|
|
simde_float64 f64[2];
|
|
#endif
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
__m128d n;
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int8x16_t neon_i8;
|
|
int16x8_t neon_i16;
|
|
int32x4_t neon_i32;
|
|
int64x2_t neon_i64;
|
|
uint8x16_t neon_u8;
|
|
uint16x8_t neon_u16;
|
|
uint32x4_t neon_u32;
|
|
uint64x2_t neon_u64;
|
|
float32x4_t neon_f32;
|
|
#if defined(SIMDE_ARCH_AMD64)
|
|
float64x2_t neon_f64;
|
|
#endif
|
|
#endif
|
|
} simde__m128d;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
HEDLEY_STATIC_ASSERT(sizeof(__m128i) == sizeof(simde__m128i),
|
|
"__m128i size doesn't match simde__m128i size");
|
|
HEDLEY_STATIC_ASSERT(sizeof(__m128d) == sizeof(simde__m128d),
|
|
"__m128d size doesn't match simde__m128d size");
|
|
SIMDE__FUNCTION_ATTRIBUTES simde__m128i SIMDE__M128I_C(__m128i v)
|
|
{
|
|
simde__m128i r;
|
|
r.n = v;
|
|
return r;
|
|
}
|
|
SIMDE__FUNCTION_ATTRIBUTES simde__m128d SIMDE__M128D_C(__m128d v)
|
|
{
|
|
simde__m128d r;
|
|
r.n = v;
|
|
return r;
|
|
}
|
|
#elif defined(SIMDE_SSE_NEON)
|
|
#define SIMDE__M128I_NEON_C(T, expr) \
|
|
(simde__m128i) { .neon_##T = expr }
|
|
#define SIMDE__M128D_NEON_C(T, expr) \
|
|
(simde__m128d) { .neon_##T = expr }
|
|
#endif
|
|
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
|
|
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_add_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_add_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i8, vaddq_s8(a.neon_i8, b.neon_i8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
r.i8[i] = a.i8[i] + b.i8[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_add_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_add_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i16, vaddq_s16(a.neon_i16, b.neon_i16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = a.i16[i] + b.i16[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_add_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_add_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, vaddq_s32(a.neon_i32, b.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i] + b.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_add_epi64(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_add_epi64(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i64, vaddq_s64(a.neon_i64, b.neon_i64));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] + b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_add_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_add_pd(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON) && defined(SIMDE_ARCH_AMD64)
|
|
return SIMDE__M128D_NEON_C(f64, vaddq_f64(a.neon_f64, b.neon_f64));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = a.f64[i] + b.f64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_add_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_add_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.f64[0] = a.f64[0] + b.f64[0];
|
|
r.f64[1] = a.f64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m64 simde_mm_add_si64(simde__m64 a, simde__m64 b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M64_C(_mm_add_si64(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M64_NEON_C(i64, vadd_s64(a.neon_i64, b.neon_i64));
|
|
#else
|
|
simde__m64 r;
|
|
r.i64[0] = a.i64[0] + b.i64[0];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
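/* Saturating adds: the portable fallbacks clamp the result to the type's
 * minimum/maximum instead of wrapping on overflow, mirroring the native
 * saturating-add instructions. */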
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_adds_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_adds_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i8, vqaddq_s8(a.neon_i8, b.neon_i8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) {
|
|
r.i8[i] = INT8_MAX;
|
|
} else if ((((b.i8[i]) < 0) &&
|
|
((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) {
|
|
r.i8[i] = INT8_MIN;
|
|
} else {
|
|
r.i8[i] = (a.i8[i]) + (b.i8[i]);
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_adds_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_adds_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i16, vqaddq_s16(a.neon_i16, b.neon_i16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
if ((((b.i16[i]) > 0) &&
|
|
((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) {
|
|
r.i16[i] = INT16_MAX;
|
|
} else if ((((b.i16[i]) < 0) &&
|
|
((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) {
|
|
r.i16[i] = INT16_MIN;
|
|
} else {
|
|
r.i16[i] = (a.i16[i]) + (b.i16[i]);
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_adds_epu8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_adds_epu8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u8, vqaddq_u8(a.neon_u8, b.neon_u8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
|
|
r.u8[i] = ((UINT8_MAX - a.u8[i]) > b.u8[i])
|
|
? (a.u8[i] + b.u8[i])
|
|
: UINT8_MAX;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_adds_epu16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_adds_epu16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u16, vqaddq_u16(a.neon_u16, b.neon_u16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
|
|
r.u16[i] = ((UINT16_MAX - a.u16[i]) > b.u16[i])
|
|
? (a.u16[i] + b.u16[i])
|
|
: UINT16_MAX;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_and_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_and_pd(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128D_NEON_C(i32, vandq_s32(a.neon_i32, b.neon_i32));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
|
|
r.u64[i] = a.u64[i] & b.u64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_and_si128(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_and_si128(a.n, b.n));
|
|
#elif defined(SIMDE_SSE_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, vandq_s32(b.neon_i32, a.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] & b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_andnot_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_andnot_pd(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128D_NEON_C(i32, vbicq_s32(a.neon_i32, b.neon_i32));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
|
|
r.u64[i] = ~a.u64[i] & b.u64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_andnot_si128(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_andnot_si128(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, vbicq_s32(b.neon_i32, a.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = ~(a.i64[i]) & b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_avg_epu8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_avg_epu8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u8, vrhaddq_u8(b.neon_u8, a.neon_u8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
|
|
r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_avg_epu16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_avg_epu16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u16, vrhaddq_u16(b.neon_u16, a.neon_u16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
|
|
r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
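/* Whole-register byte shifts (_mm_bslli_si128/_mm_bsrli_si128).  The
 * portable fallback shifts the full 128-bit value by imm8 bytes, either
 * as a single 128-bit integer shift when such a type is available or by
 * combining the two 64-bit halves; shifts larger than 15 bytes yield
 * zero. */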
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_bslli_si128(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
|
|
if (HEDLEY_UNLIKELY(imm8 > 15)) {
|
|
r.u64[0] = 0;
|
|
r.u64[1] = 0;
|
|
return r;
|
|
}
|
|
|
|
const int s = imm8 * 8;
|
|
|
|
#if defined(SIMDE__HAVE_INT128)
|
|
r.u128[0] = a.u128[0] << s;
|
|
#else
|
|
if (s < 64) {
|
|
r.u64[0] = (a.u64[0] << s);
|
|
r.u64[1] = (a.u64[1] << s) | (a.u64[0] >> (64 - s));
|
|
} else {
|
|
r.u64[0] = 0;
|
|
r.u64[1] = a.u64[0] << (s - 64);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
#define simde_mm_bslli_si128(a, imm8) SIMDE__M128I_C(_mm_slli_si128(a.n, imm8))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_bslli_si128(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
i8, \
|
|
(((imm8) <= 0) ? ((a).neon_i8) \
|
|
: (((imm8) > 15) ? (vdupq_n_s8(0)) \
|
|
: (vextq_s8(vdupq_n_s8(0), \
|
|
(a).neon_i8, \
|
|
16 - (imm8))))))
|
|
#endif
|
|
#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_bsrli_si128(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
|
|
if (HEDLEY_UNLIKELY(imm8 > 15)) {
|
|
r.u64[0] = 0;
|
|
r.u64[1] = 0;
|
|
return r;
|
|
}
|
|
|
|
const int s = imm8 * 8;
|
|
|
|
#if defined(SIMDE__HAVE_INT128)
|
|
r.u128[0] = a.u128[0] >> s;
|
|
#else
|
|
if (s < 64) {
|
|
r.u64[0] = (a.u64[0] >> s) | (a.u64[1] << (64 - s));
|
|
r.u64[1] = (a.u64[1] >> s);
|
|
} else {
|
|
r.u64[0] = a.u64[1] >> (s - 64);
|
|
r.u64[1] = 0;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
#define simde_mm_bsrli_si128(a, imm8) SIMDE__M128I_C(_mm_srli_si128(a.n, imm8))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_bsrli_si128(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
i8, \
|
|
((imm8) <= 0) \
|
|
? ((a).neon_i8) \
|
|
: (((imm8) > 15) ? (vdupq_n_s8(0)) \
|
|
: (vextq_s8((a).neon_i8, \
|
|
vdupq_n_s8(0), (imm8)))))
|
|
#endif
|
|
#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128(a, imm8)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_clflush(void const *p)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_clflush(p);
|
|
#else
|
|
(void)p;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comieq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comieq_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] == b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comige_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comige_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] >= b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comigt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comigt_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] > b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comile_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comile_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] <= b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comilt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comilt_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] < b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comineq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comineq_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] != b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128 simde_mm_castpd_ps(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128_C(_mm_castpd_ps(a.n));
|
|
#else
|
|
union {
|
|
simde__m128d pd;
|
|
simde__m128 ps;
|
|
} r;
|
|
r.pd = a;
|
|
return r.ps;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_castpd_si128(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_castpd_si128(a.n));
|
|
#else
|
|
union {
|
|
simde__m128d pd;
|
|
simde__m128i si128;
|
|
} r;
|
|
r.pd = a;
|
|
return r.si128;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_castps_pd(simde__m128 a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_castps_pd(a.n));
|
|
#else
|
|
union {
|
|
simde__m128 ps;
|
|
simde__m128d pd;
|
|
} r;
|
|
r.ps = a;
|
|
return r.pd;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_castps_si128(simde__m128 a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_castps_si128(a.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, a.neon_i32);
|
|
#else
|
|
union {
|
|
simde__m128 ps;
|
|
simde__m128i si128;
|
|
} r;
|
|
r.ps = a;
|
|
return r.si128;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_castsi128_pd(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_castsi128_pd(a.n));
|
|
#else
|
|
union {
|
|
simde__m128i si128;
|
|
simde__m128d pd;
|
|
} r;
|
|
r.si128 = a;
|
|
return r.pd;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128 simde_mm_castsi128_ps(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128_C(_mm_castsi128_ps(a.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128_NEON_C(f32, a.neon_f32);
|
|
#else
|
|
union {
|
|
simde__m128i si128;
|
|
simde__m128 ps;
|
|
} r;
|
|
r.si128 = a;
|
|
return r.ps;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpeq_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpeq_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i8, vreinterpretq_s8_u8(vceqq_s8(b.neon_i8, a.neon_i8)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
r.i8[i] = (a.i8[i] == b.i8[i]) ? 0xff : 0x00;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpeq_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpeq_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i16, vreinterpretq_s16_u16(vceqq_s16(b.neon_i16, a.neon_i16)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = (a.i16[i] == b.i16[i]) ? 0xffff : 0x0000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpeq_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpeq_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i32, vreinterpretq_s32_u32(vceqq_s32(b.neon_i32, a.neon_i32)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (a.i32[i] == b.i32[i]) ? 0xffffffff : 0x00000000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpeq_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpeq_pd(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128D_NEON_C(
|
|
i32, vreinterpretq_s32_u32(
|
|
vceqq_s32(vreinterpretq_s32_f32(b.neon_f32),
|
|
vreinterpretq_s32_f32(a.neon_f32))));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] == b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpeq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpeq_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] == b.f64[0]) ? ~UINT64_C(0) : 0;
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpneq_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpneq_pd(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128D_NEON_C(f32,
|
|
vreinterpretq_f32_u16(vmvnq_u16(
|
|
vceqq_s16(b.neon_i16, a.neon_i16))));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] != b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpneq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpneq_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] != b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmplt_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmplt_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i8, vreinterpretq_s8_u8(vcltq_s8(a.neon_i8, b.neon_i8)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
r.i8[i] = (a.i8[i] < b.i8[i]) ? 0xff : 0x00;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmplt_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmplt_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i16, vreinterpretq_s16_u16(vcltq_s16(a.neon_i16, b.neon_i16)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = (a.i16[i] < b.i16[i]) ? 0xffff : 0x0000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmplt_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmplt_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i32, vreinterpretq_s32_u32(vcltq_s32(a.neon_i32, b.neon_i32)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (a.i32[i] < b.i32[i]) ? 0xffffffff : 0x00000000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmplt_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmplt_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] < b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmplt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmplt_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] < b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmple_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmple_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] <= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmple_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmple_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] <= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpgt_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpgt_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i8, vreinterpretq_s8_u8(vcgtq_s8(a.neon_i8, b.neon_i8)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
r.i8[i] = (a.i8[i] > b.i8[i]) ? 0xff : 0x00;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpgt_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpgt_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i16, vreinterpretq_s16_u16(vcgtq_s16(a.neon_i16, b.neon_i16)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = (a.i16[i] > b.i16[i]) ? 0xffff : 0x0000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpgt_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpgt_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i32, vreinterpretq_s32_u32(vcgtq_s32(a.neon_i32, b.neon_i32)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (a.i32[i] > b.i32[i]) ? 0xffffffff : 0x00000000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpgt_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpgt_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] > b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpgt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
return SIMDE__M128D_C(_mm_cmpgt_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] > b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpge_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpge_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] >= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpge_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
return SIMDE__M128D_C(_mm_cmpge_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] >= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnge_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpnge_pd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmplt_pd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnge_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
return SIMDE__M128D_C(_mm_cmpnge_sd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmplt_sd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnlt_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpnlt_pd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmpge_pd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnlt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpnlt_sd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmpge_sd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnle_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpnle_pd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmpgt_pd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnle_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpnle_sd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmpgt_sd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpord_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpord_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (!isnan(a.f64[i]) && !isnan(b.f64[i])) ? ~UINT64_C(0)
|
|
: UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpord_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpord_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (!isnan(a.f64[0]) && !isnan(b.f64[0])) ? ~UINT64_C(0)
|
|
: UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpunord_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpunord_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (isnan(a.f64[i]) || isnan(b.f64[i])) ? ~UINT64_C(0)
|
|
: UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpunord_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpunord_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (isnan(a.f64[0]) || isnan(b.f64[0])) ? ~UINT64_C(0)
|
|
: UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtepi32_pd(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cvtepi32_pd(a.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = (simde_float64)a.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128 simde_mm_cvtepi32_ps(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128_C(_mm_cvtepi32_ps(a.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128_NEON_C(f32, vcvtq_f32_s32(a.neon_i32));
|
|
#else
|
|
simde__m128 r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
|
|
r.f32[i] = (simde_float32)a.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvtpd_epi32(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cvtpd_epi32(a.n));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.i32[i] = (int32_t)a.f64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m64 simde_mm_cvtpd_pi32(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M64_C(_mm_cvtpd_pi32(a.n));
|
|
#else
|
|
simde__m64 r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (int32_t)a.f64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128 simde_mm_cvtpd_ps(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128_C(_mm_cvtpd_ps(a.n));
|
|
#else
|
|
simde__m128 r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
|
|
r.f32[i] = (simde_float32)a.f64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtpi32_pd(simde__m64 a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cvtpi32_pd(a.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = (simde_float64)a.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvtps_epi32(simde__m128 a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cvtps_epi32(a.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
/* The default rounding mode on SSE is 'round to even', which ARMv7
does not support! It is supported on ARMv8 however. */
|
|
#if defined(SIMDE_ARCH_AARCH64)
|
|
return SIMDE__M128I_NEON_C(i32, vcvtnq_s32_f32(a.neon_f32));
|
|
#else
|
|
uint32x4_t signmask = vdupq_n_u32(0x80000000);
|
|
float32x4_t half = vbslq_f32(signmask, a.neon_f32,
|
|
vdupq_n_f32(0.5f)); /* +/- 0.5 */
|
|
int32x4_t r_normal = vcvtq_s32_f32(
|
|
vaddq_f32(a.neon_f32, half)); /* round to integer: [a + 0.5]*/
|
|
int32x4_t r_trunc =
|
|
vcvtq_s32_f32(a.neon_f32); /* truncate to integer: [a] */
|
|
int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */
|
|
int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
|
|
vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
|
|
float32x4_t delta = vsubq_f32(
|
|
a.neon_f32,
|
|
vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
|
|
uint32x4_t is_delta_half =
|
|
vceqq_f32(delta, half); /* delta == +/- 0.5 */
|
|
return SIMDE__M128I_NEON_C(i32,
|
|
vbslq_s32(is_delta_half, r_even, r_normal));
|
|
#endif
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (int32_t)a.f32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtps_pd(simde__m128 a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cvtps_pd(a.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = a.f32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
double simde_mm_cvtsd_f64(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
return _mm_cvtsd_f64(a.n);
|
|
#else
|
|
return a.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_cvtsd_si32(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_cvtsd_si32(a.n);
|
|
#else
|
|
return (int32_t)a.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int64_t simde_mm_cvtsd_si64(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if defined(__PGI)
|
|
return _mm_cvtsd_si64x(a.n);
|
|
#else
|
|
return _mm_cvtsd_si64(a.n);
|
|
#endif
|
|
#else
|
|
return (int64_t)a.f64[0];
|
|
#endif
|
|
}
|
|
#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128 simde_mm_cvtsd_ss(simde__m128 a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128_C(_mm_cvtsd_ss(a.n, b.n));
|
|
#else
|
|
simde__m128 r;
|
|
|
|
r.f32[0] = (simde_float32)b.f64[0];
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 1; i < (sizeof(r) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i];
|
|
}
|
|
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_cvtsi128_si32(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_cvtsi128_si32(a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return vgetq_lane_s32(a.neon_i32, 0);
|
|
#else
|
|
return a.i32[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int64_t simde_mm_cvtsi128_si64(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if defined(__PGI)
|
|
return _mm_cvtsi128_si64x(a.n);
|
|
#else
|
|
return _mm_cvtsi128_si64(a.n);
|
|
#endif
|
|
#else
|
|
return a.i64[0];
|
|
#endif
|
|
}
|
|
#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtsi32_sd(simde__m128d a, int32_t b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cvtsi32_sd(a.n, b));
|
|
#else
|
|
simde__m128d r;
|
|
|
|
r.f64[0] = (simde_float64)b;
|
|
r.i64[1] = a.i64[1];
|
|
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvtsi32_si128(int32_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_cvtsi32_si128(a);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
|
|
#else
|
|
r.i32[0] = a;
|
|
r.i32[1] = 0;
|
|
r.i32[2] = 0;
|
|
r.i32[3] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtsi64_sd(simde__m128d a, int64_t b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if !defined(__PGI)
|
|
r.n = _mm_cvtsi64_sd(a.n, b);
|
|
#else
|
|
r.n = _mm_cvtsi64x_sd(a.n, b);
|
|
#endif
|
|
#else
|
|
r.f64[0] = (simde_float64)b;
|
|
r.f64[1] = a.f64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvtsi64_si128(int64_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if !defined(__PGI)
|
|
r.n = _mm_cvtsi64_si128(a);
|
|
#else
|
|
r.n = _mm_cvtsi64x_si128(a);
|
|
#endif
|
|
#else
|
|
r.i64[0] = a;
|
|
r.i64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtss_sd(simde__m128d a, simde__m128 b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_cvtss_sd(a.n, b.n);
|
|
#else
|
|
r.f64[0] = b.f32[0];
|
|
r.i64[1] = a.i64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvttpd_epi32(simde__m128d a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_cvttpd_epi32(a.n);
|
|
#else
|
|
for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
|
|
r.i32[i] = (int32_t)trunc(a.f64[i]);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m64 simde_mm_cvttpd_pi32(simde__m128d a)
|
|
{
|
|
simde__m64 r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_cvttpd_pi32(a.n);
|
|
#else
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (int32_t)trunc(a.f64[i]);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvttps_epi32(simde__m128 a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_cvttps_epi32(a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vcvtq_s32_f32(a.neon_f32);
|
|
#else
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (int32_t)truncf(a.f32[i]);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_cvttsd_si32(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_cvttsd_si32(a.n);
|
|
#else
|
|
return (int32_t)trunc(a.f64[0]);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int64_t simde_mm_cvttsd_si64(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if !defined(__PGI)
|
|
return _mm_cvttsd_si64(a.n);
|
|
#else
|
|
return _mm_cvttsd_si64x(a.n);
|
|
#endif
|
|
#else
|
|
return (int64_t)trunc(a.f64[0]);
|
|
#endif
|
|
}
|
|
#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_div_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_div_pd(a.n, b.n);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = a.f64[i] / b.f64[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_div_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_div_sd(a.n, b.n);
|
|
#else
|
|
r.f64[0] = a.f64[0] / b.f64[0];
|
|
r.f64[1] = a.f64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_extract_epi16(simde__m128i a, const int imm8)
|
|
{
|
|
return a.u16[imm8 & 7];
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE) && \
|
|
(!defined(SIMDE__REALLY_GCC) || HEDLEY_GCC_VERSION_CHECK(4, 6, 0))
|
|
#define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a.n, imm8)
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_extract_epi16(a, imm8) \
|
|
(vgetq_lane_s16((a).neon_i16, (imm8)) & ((int32_t)UINT32_C(0x0000ffff)))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_insert_epi16(simde__m128i a, int32_t i, const int imm8)
|
|
{
|
|
a.u16[imm8 & 7] = (int16_t)i;
|
|
return a;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
#define simde_mm_insert_epi16(a, i, imm8) \
|
|
SIMDE__M128I_C(_mm_insert_epi16((a).n, (i), (imm8)))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_insert_epi16(a, i, imm8) \
|
|
SIMDE__M128I_NEON_C(i16, vsetq_lane_s16((i), a.neon_i16, (imm8)))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d
|
|
simde_mm_load_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
|
|
{
|
|
simde__m128d r;
|
|
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_load_pd(mem_addr);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_u32 = vld1q_u32((uint32_t const *)mem_addr);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(&r, mem_addr, sizeof(r));
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_load_pd1(simde_float64 const *mem_addr)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_load_pd1(mem_addr);
|
|
#else
|
|
r.f64[0] = *mem_addr;
|
|
r.f64[1] = *mem_addr;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
#define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_load_sd(simde_float64 const *mem_addr)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_load_sd(mem_addr);
|
|
#else
|
|
memcpy(&r, mem_addr, sizeof(simde_float64));
|
|
r.u64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_load_si128(simde__m128i const *mem_addr)
|
|
{
|
|
simde__m128i r;
|
|
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_load_si128(&(mem_addr->n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(&r, mem_addr, sizeof(r));
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_loadh_pd(simde__m128d a, simde_float64 const *mem_addr)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadh_pd(a.n, mem_addr);
|
|
#else
|
|
simde_float64 t;
|
|
memcpy(&t, mem_addr, sizeof(t));
|
|
r.f64[0] = a.f64[0];
|
|
r.f64[1] = t;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_loadl_epi64(simde__m128i const *mem_addr)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadl_epi64(&mem_addr->n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vcombine_s32(vld1_s32((int32_t const *)mem_addr),
|
|
vcreate_s32(0));
|
|
#else
|
|
r.u64[0] = mem_addr->u64[0];
|
|
r.u64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_loadl_pd(simde__m128d a, simde_float64 const *mem_addr)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadl_pd(a.n, mem_addr);
|
|
#else
|
|
memcpy(&r, mem_addr, sizeof(simde_float64));
|
|
r.u64[1] = a.u64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d
|
|
simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
|
|
{
|
|
simde__m128d r;
|
|
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadr_pd(mem_addr);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
r.f64[0] = mem_addr[1];
|
|
r.f64[1] = mem_addr[0];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d
|
|
simde_mm_loadu_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadu_pd(mem_addr);
|
|
#else
|
|
simde_float64 l, h;
|
|
memcpy(&l, &mem_addr[0], sizeof(l));
|
|
memcpy(&h, &mem_addr[1], sizeof(h));
|
|
r.f64[0] = l;
|
|
r.f64[1] = h;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_loadu_si128(simde__m128i const *mem_addr)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadu_si128(&((*mem_addr).n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
|
|
#else
|
|
memcpy(&r, mem_addr, sizeof(r));
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
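/* _mm_madd_epi16: multiply corresponding signed 16-bit elements and
 * horizontally add each pair of adjacent 32-bit products.  The NEON path
 * widens with vmull_s16 and then pairwise-adds the products. */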
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_madd_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_madd_epi16(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int32x4_t pl =
|
|
vmull_s16(vget_low_s16(a.neon_i16), vget_low_s16(b.neon_i16));
|
|
int32x4_t ph =
|
|
vmull_s16(vget_high_s16(a.neon_i16), vget_high_s16(b.neon_i16));
|
|
int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
|
|
int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
|
|
r.neon_i32 = vcombine_s32(rl, rh);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i += 2) {
|
|
r.i32[i / 2] =
|
|
(a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_maskmoveu_si128(simde__m128i a, simde__m128i mask,
|
|
int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)])
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_maskmoveu_si128(a.n, mask.n, (char *)mem_addr);
|
|
#else
|
|
for (size_t i = 0; i < 16; i++) {
|
|
if (mask.u8[i] & 0x80) {
|
|
mem_addr[i] = a.i8[i];
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
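/* _mm_movemask_epi8: gather the most significant bit of each byte into a
 * 16-bit mask.  The NEON path isolates bit 7 of every byte, shifts it to
 * the byte's bit position within its 64-bit half, and folds each half
 * together with pairwise additions before combining the two halves. */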
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_movemask_epi8(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_movemask_epi8(a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
uint8x16_t input = a.neon_u8;
|
|
SIMDE_ALIGN(16)
|
|
static const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
|
|
uint8x8_t mask_and = vdup_n_u8(0x80);
|
|
int8x8_t mask_shift = vld1_s8(xr);
|
|
|
|
uint8x8_t lo = vget_low_u8(input);
|
|
uint8x8_t hi = vget_high_u8(input);
|
|
|
|
lo = vand_u8(lo, mask_and);
|
|
lo = vshl_u8(lo, mask_shift);
|
|
|
|
hi = vand_u8(hi, mask_and);
|
|
hi = vshl_u8(hi, mask_shift);
|
|
|
|
lo = vpadd_u8(lo, lo);
|
|
lo = vpadd_u8(lo, lo);
|
|
lo = vpadd_u8(lo, lo);
|
|
|
|
hi = vpadd_u8(hi, hi);
|
|
hi = vpadd_u8(hi, hi);
|
|
hi = vpadd_u8(hi, hi);
|
|
|
|
return ((hi[0] << 8) | (lo[0] & 0xFF));
|
|
#else
|
|
int32_t r = 0;
|
|
SIMDE__VECTORIZE_REDUCTION(| : r)
|
|
for (size_t i = 0; i < 16; i++) {
|
|
r |= (a.u8[15 - i] >> 7) << (15 - i);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_movemask_pd(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_movemask_pd(a.n);
|
|
#else
|
|
int32_t r = 0;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(a.u64) / sizeof(a.u64[0])); i++) {
|
|
r |= (a.u64[i] >> 63) << i;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m64 simde_mm_movepi64_pi64(simde__m128i a)
|
|
{
|
|
simde__m64 r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_movepi64_pi64(a.n);
|
|
#else
|
|
r.i64[0] = a.i64[0];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_movpi64_epi64(simde__m64 a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_movpi64_epi64(a.n);
|
|
#else
|
|
r.i64[0] = a.i64[0];
|
|
r.i64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_min_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_min_epi16(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i16 = vminq_s16(a.neon_i16, b.neon_i16);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_min_epu8(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_min_epu8(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_u8 = vminq_u8(a.neon_u8, b.neon_u8);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
|
|
r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_min_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_min_pd(a.n, b.n);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = (a.f64[i] < b.f64[i]) ? a.f64[i] : b.f64[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_min_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_min_sd(a.n, b.n);
|
|
#else
|
|
r.f64[0] = (a.f64[0] < b.f64[0]) ? a.f64[0] : b.f64[0];
|
|
r.f64[1] = a.f64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_max_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_max_epi16(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i16 = vmaxq_s16(a.neon_i16, b.neon_i16);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_max_epu8(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_max_epu8(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_u8 = vmaxq_u8(a.neon_u8, b.neon_u8);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
|
|
r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_max_pd(simde__m128d a, simde__m128d b)
{
	simde__m128d r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_max_pd(a.n, b.n);
#else
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
		r.f64[i] = (a.f64[i] > b.f64[i]) ? a.f64[i] : b.f64[i];
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_max_sd(simde__m128d a, simde__m128d b)
{
	simde__m128d r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_max_sd(a.n, b.n);
#else
	r.f64[0] = (a.f64[0] > b.f64[0]) ? a.f64[0] : b.f64[0];
	r.f64[1] = a.f64[1];
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_move_epi64(simde__m128i a)
{
	simde__m128i r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_move_epi64(a.n);
#elif defined(SIMDE_SSE2_NEON)
	r.neon_i64 = vsetq_lane_s64(0, a.neon_i64, 1);
#else
	r.i64[0] = a.i64[0];
	r.i64[1] = 0;
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_move_sd(simde__m128d a, simde__m128d b)
{
	simde__m128d r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_move_sd(a.n, b.n);
#else
	r.f64[0] = b.f64[0];
	r.f64[1] = a.f64[1];
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_mul_epu32(simde__m128i a, simde__m128i b)
{
	simde__m128i r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_mul_epu32(a.n, b.n);
#else
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
		r.u64[i] = ((uint64_t)a.u32[i * 2]) * ((uint64_t)b.u32[i * 2]);
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_x_mm_mul_epi64(simde__m128i a, simde__m128i b)
{
	simde__m128i r;

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
		r.i64[i] = a.i64[i] * b.i64[i];
	}

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_x_mm_mod_epi64(simde__m128i a, simde__m128i b)
{
	simde__m128i r;

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
		r.i64[i] = a.i64[i] % b.i64[i];
	}

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_mul_pd(simde__m128d a, simde__m128d b)
{
	simde__m128d r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_mul_pd(a.n, b.n);
#else
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
		r.f64[i] = a.f64[i] * b.f64[i];
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_mul_sd(simde__m128d a, simde__m128d b)
{
	simde__m128d r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_mul_sd(a.n, b.n);
#else
	r.f64[0] = a.f64[0] * b.f64[0];
	r.f64[1] = a.f64[1];
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_mul_su32(simde__m64 a, simde__m64 b)
{
	simde__m64 r;

#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
	r.n = _mm_mul_su32(a.n, b.n);
#else
	r.u64[0] = ((uint64_t)a.u32[0]) * ((uint64_t)b.u32[0]);
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_mulhi_epi16(simde__m128i a, simde__m128i b)
{
	simde__m128i r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_mulhi_epi16(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
	/* Widen each half with vmull_s16, then de-interleave the 32-bit
	 * products with vuzpq_u16; with little-endian lane order, val[1]
	 * holds the high 16 bits of each product. */
	int16x4_t a3210 = vget_low_s16(a.neon_i16);
	int16x4_t b3210 = vget_low_s16(b.neon_i16);
	int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
	int16x4_t a7654 = vget_high_s16(a.neon_i16);
	int16x4_t b7654 = vget_high_s16(b.neon_i16);
	int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
	uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210),
				    vreinterpretq_u16_s32(ab7654));
	r.neon_u16 = rv.val[1];
#else
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
		r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
					((int32_t)b.i16[i]))) >> 16);
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_mulhi_epu16(simde__m128i a, simde__m128i b)
{
	simde__m128i r;

#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
	r.n = _mm_mulhi_epu16(a.n, b.n);
#else
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
		r.u16[i] = (uint16_t)(
			(((uint32_t)a.u16[i]) * ((uint32_t)b.u16[i])) >> 16);
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_mullo_epi16(simde__m128i a, simde__m128i b)
{
	simde__m128i r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_mullo_epi16(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
	r.neon_i16 = vmulq_s16(a.neon_i16, b.neon_i16);
#else
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
		r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
					((int32_t)b.i16[i]))) & 0xffff);
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_or_pd(simde__m128d a, simde__m128d b)
{
	simde__m128d r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_or_pd(a.n, b.n);
#else
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
		r.i64[i] = a.i64[i] | b.i64[i];
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_or_si128(simde__m128i a, simde__m128i b)
{
	simde__m128i r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_or_si128(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
	r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32);
#else
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
		r.i64[i] = a.i64[i] | b.i64[i];
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_packs_epi16(simde__m128i a, simde__m128i b)
{
	simde__m128i r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_packs_epi16(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
	r.neon_i8 = vcombine_s8(vqmovn_s16(a.neon_i16), vqmovn_s16(b.neon_i16));
#else
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
		r.i8[i] = (a.i16[i] > INT8_MAX)
				  ? INT8_MAX
				  : ((a.i16[i] < INT8_MIN) ? INT8_MIN
							   : ((int8_t)a.i16[i]));
		r.i8[i + 8] = (b.i16[i] > INT8_MAX)
				      ? INT8_MAX
				      : ((b.i16[i] < INT8_MIN) ? INT8_MIN
							       : ((int8_t)b.i16[i]));
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_packs_epi32(simde__m128i a, simde__m128i b)
{
	simde__m128i r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_packs_epi32(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
	r.neon_i16 =
		vcombine_s16(vqmovn_s32(a.neon_i32), vqmovn_s32(b.neon_i32));
#else
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
		r.i16[i] = (a.i32[i] > INT16_MAX)
				   ? INT16_MAX
				   : ((a.i32[i] < INT16_MIN) ? INT16_MIN
							     : ((int16_t)a.i32[i]));
		r.i16[i + 4] = (b.i32[i] > INT16_MAX)
				       ? INT16_MAX
				       : ((b.i32[i] < INT16_MIN) ? INT16_MIN
								 : ((int16_t)b.i32[i]));
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_packus_epi16(simde__m128i a, simde__m128i b)
{
	simde__m128i r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_packus_epi16(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
	r.neon_u8 =
		vcombine_u8(vqmovun_s16(a.neon_i16), vqmovun_s16(b.neon_i16));
#else
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
		r.u8[i] = (a.i16[i] > UINT8_MAX)
				  ? UINT8_MAX
				  : ((a.i16[i] < 0) ? 0 : ((uint8_t)a.i16[i]));
		r.u8[i + 8] = (b.i16[i] > UINT8_MAX)
				      ? UINT8_MAX
				      : ((b.i16[i] < 0) ? 0 : ((uint8_t)b.i16[i]));
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_pause(void)
{
#if defined(SIMDE_SSE2_NATIVE)
	_mm_pause();
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_sad_epu8(simde__m128i a, simde__m128i b)
{
	simde__m128i r;

#if defined(SIMDE_SSE2_NATIVE)
	r.n = _mm_sad_epu8(a.n, b.n);
#else
	/* Each 64-bit lane receives the sum of absolute differences of the
	 * corresponding eight unsigned bytes of a and b. */
	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
		uint16_t tmp = 0;
		SIMDE__VECTORIZE_REDUCTION(+ : tmp)
		for (size_t j = 0; j < ((sizeof(r.u8) / sizeof(r.u8[0])) / 2);
		     j++) {
			const size_t e = j + (i * 8);
			tmp += (a.u8[e] > b.u8[e]) ? (a.u8[e] - b.u8[e])
						   : (b.u8[e] - a.u8[e]);
		}
		r.i64[i] = tmp;
	}
#endif

	return r;
}

SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
|
|
int8_t e11, int8_t e10, int8_t e9, int8_t e8,
|
|
int8_t e7, int8_t e6, int8_t e5, int8_t e4,
|
|
int8_t e3, int8_t e2, int8_t e1, int8_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4,
|
|
e3, e2, e1, e0);
|
|
#else
|
|
r.i8[0] = e0;
|
|
r.i8[1] = e1;
|
|
r.i8[2] = e2;
|
|
r.i8[3] = e3;
|
|
r.i8[4] = e4;
|
|
r.i8[5] = e5;
|
|
r.i8[6] = e6;
|
|
r.i8[7] = e7;
|
|
r.i8[8] = e8;
|
|
r.i8[9] = e9;
|
|
r.i8[10] = e10;
|
|
r.i8[11] = e11;
|
|
r.i8[12] = e12;
|
|
r.i8[13] = e13;
|
|
r.i8[14] = e14;
|
|
r.i8[15] = e15;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
|
|
int16_t e3, int16_t e2, int16_t e1, int16_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
SIMDE_ALIGN(16) int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7};
|
|
r.neon_i16 = vld1q_s16(data);
|
|
#else
|
|
r.i16[0] = e0;
|
|
r.i16[1] = e1;
|
|
r.i16[2] = e2;
|
|
r.i16[3] = e3;
|
|
r.i16[4] = e4;
|
|
r.i16[5] = e5;
|
|
r.i16[6] = e6;
|
|
r.i16[7] = e7;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_epi32(e3, e2, e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
SIMDE_ALIGN(16) int32_t data[4] = {e0, e1, e2, e3};
|
|
r.neon_i32 = vld1q_s32(data);
|
|
#else
|
|
r.i32[0] = e0;
|
|
r.i32[1] = e1;
|
|
r.i32[2] = e2;
|
|
r.i32[3] = e3;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set_epi64(simde__m64 e1, simde__m64 e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_epi64(e1.n, e0.n);
|
|
#else
|
|
r.i64[0] = e0.i64[0];
|
|
r.i64[1] = e1.i64[0];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set_epi64x(int64_t e1, int64_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_epi64x(e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r = SIMDE__M128I_NEON_C(i64,
|
|
vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1)));
|
|
#else
|
|
r.i64[0] = e0;
|
|
r.i64[1] = e1;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_set_epu8(uint8_t e15, uint8_t e14, uint8_t e13,
|
|
uint8_t e12, uint8_t e11, uint8_t e10,
|
|
uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6,
|
|
uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2,
|
|
uint8_t e1, uint8_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
r.u8[0] = e0;
|
|
r.u8[1] = e1;
|
|
r.u8[2] = e2;
|
|
r.u8[3] = e3;
|
|
r.u8[4] = e4;
|
|
r.u8[5] = e5;
|
|
r.u8[6] = e6;
|
|
r.u8[7] = e7;
|
|
r.u8[8] = e8;
|
|
r.u8[9] = e9;
|
|
r.u8[10] = e10;
|
|
r.u8[11] = e11;
|
|
r.u8[12] = e12;
|
|
r.u8[13] = e13;
|
|
r.u8[14] = e14;
|
|
r.u8[15] = e15;
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_set_epu16(uint16_t e7, uint16_t e6, uint16_t e5,
|
|
uint16_t e4, uint16_t e3, uint16_t e2,
|
|
uint16_t e1, uint16_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
r.u16[0] = e0;
|
|
r.u16[1] = e1;
|
|
r.u16[2] = e2;
|
|
r.u16[3] = e3;
|
|
r.u16[4] = e4;
|
|
r.u16[5] = e5;
|
|
r.u16[6] = e6;
|
|
r.u16[7] = e7;
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_set_epu32(uint32_t e3, uint32_t e2, uint32_t e1,
|
|
uint32_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
r.u32[0] = e0;
|
|
r.u32[1] = e1;
|
|
r.u32[2] = e2;
|
|
r.u32[3] = e3;
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_set_epu64x(uint64_t e1, uint64_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
r.u64[0] = e0;
|
|
r.u64[1] = e1;
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_set_pd(simde_float64 e1, simde_float64 e0)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_pd(e1, e0);
|
|
#else
|
|
r.f64[0] = e0;
|
|
r.f64[1] = e1;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_set_pd1(simde_float64 a)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_pd(a);
|
|
#else
|
|
r.f64[0] = a;
|
|
r.f64[1] = a;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_set_sd(simde_float64 a)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_sd(a);
|
|
#else
|
|
r.f64[0] = a;
|
|
r.u64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set1_epi8(int8_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_epi8(a);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i8 = vdupq_n_s8(a);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
r.i8[i] = a;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set1_epi16(int16_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_epi16(a);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i16 = vdupq_n_s16(a);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = a;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set1_epi32(int32_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_epi32(a);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vdupq_n_s32(a);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set1_epi64x(int64_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_epi64x(a);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i64 = vmovq_n_s64(a);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set1_epi64(simde__m64 a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_epi64(a.n);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[0];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_set1_pd(simde_float64 a)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_pd(a);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.f64[i] = a;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_setr_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
|
|
int8_t e11, int8_t e10, int8_t e9, int8_t e8,
|
|
int8_t e7, int8_t e6, int8_t e5, int8_t e4,
|
|
int8_t e3, int8_t e2, int8_t e1, int8_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,
|
|
e4, e3, e2, e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int8_t t[] = {e15, e14, e13, e12, e11, e10, e9, e8,
|
|
e7, e6, e5, e4, e3, e2, e1, e0};
|
|
r.neon_i8 = vld1q_s8(t);
|
|
#else
|
|
r.i8[0] = e15;
|
|
r.i8[1] = e14;
|
|
r.i8[2] = e13;
|
|
r.i8[3] = e12;
|
|
r.i8[4] = e11;
|
|
r.i8[5] = e10;
|
|
r.i8[6] = e9;
|
|
r.i8[7] = e8;
|
|
r.i8[8] = e7;
|
|
r.i8[9] = e6;
|
|
r.i8[10] = e5;
|
|
r.i8[11] = e4;
|
|
r.i8[12] = e3;
|
|
r.i8[13] = e2;
|
|
r.i8[14] = e1;
|
|
r.i8[15] = e0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_setr_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
|
|
int16_t e3, int16_t e2, int16_t e1, int16_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int16_t t[] = {e7, e6, e5, e4, e3, e2, e1, e0};
|
|
r.neon_i16 = vld1q_s16(t);
|
|
#else
|
|
r.i16[0] = e7;
|
|
r.i16[1] = e6;
|
|
r.i16[2] = e5;
|
|
r.i16[3] = e4;
|
|
r.i16[4] = e3;
|
|
r.i16[5] = e2;
|
|
r.i16[6] = e1;
|
|
r.i16[7] = e0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_setr_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setr_epi32(e3, e2, e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int32_t t[] = {e3, e2, e1, e0};
|
|
r.neon_i32 = vld1q_s32(t);
|
|
#else
|
|
r.i32[0] = e3;
|
|
r.i32[1] = e2;
|
|
r.i32[2] = e1;
|
|
r.i32[3] = e0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_setr_epi64(simde__m64 e1, simde__m64 e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setr_epi64(e1.n, e0.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i64 = vcombine_s64(e1.neon_i64, e0.neon_i64);
|
|
#else
|
|
r.i64[0] = e1.i64[0];
|
|
r.i64[1] = e0.i64[0];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_setr_pd(simde_float64 e1, simde_float64 e0)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setr_pd(e1, e0);
|
|
#else
|
|
r.f64[0] = e1;
|
|
r.f64[1] = e0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_setzero_pd(void)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setzero_pd();
|
|
#else
|
|
r.u64[0] = 0;
|
|
r.u64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_setzero_si128(void)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setzero_si128();
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vdupq_n_s32(0);
|
|
#else
|
|
r.u64[0] = 0;
|
|
r.u64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_shuffle_epi32(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
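/* Portable fallback: each 2-bit field of imm8 selects which 32-bit lane of a
 * is copied into the corresponding lane of the result. */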
|
|
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[(imm8 >> (i * 2)) & 3];
|
|
}
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_shuffle_epi32(a, imm8) \
|
|
SIMDE__M128I_C(_mm_shuffle_epi32((a).n, (imm8)))
|
|
#elif defined(SIMDE__SHUFFLE_VECTOR)
|
|
#define simde_mm_shuffle_epi32(a, imm8) \
|
|
({ \
|
|
const simde__m128i simde__tmp_a_ = a; \
|
|
(simde__m128i){.i32 = SIMDE__SHUFFLE_VECTOR( \
|
|
32, 16, (simde__tmp_a_).i32, \
|
|
(simde__tmp_a_).i32, ((imm8)) & 3, \
|
|
((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \
|
|
((imm8) >> 6) & 3)}; \
|
|
})
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_shuffle_pd(simde__m128d a, simde__m128d b, const int imm8)
|
|
{
|
|
simde__m128d r;
|
|
|
|
r.f64[0] = ((imm8 & 1) == 0) ? a.f64[0] : a.f64[1];
|
|
r.f64[1] = ((imm8 & 2) == 0) ? b.f64[0] : b.f64[1];
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
#define simde_mm_shuffle_pd(a, b, imm8) \
|
|
SIMDE__M128D_C(_mm_shuffle_pd((a).n, (b).n, (imm8)))
|
|
#elif defined(SIMDE__SHUFFLE_VECTOR)
|
|
#define simde_mm_shuffle_pd(a, b, imm8) \
|
|
({ \
|
|
(simde__m128d){.f64 = SIMDE__SHUFFLE_VECTOR( \
|
|
64, 16, (a).f64, (b).f64, \
|
|
(((imm8)) & 1), \
|
|
(((imm8) >> 1) & 1) + 2)}; \
|
|
})
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_shufflehi_epi16(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
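/* The low 64 bits of a pass through unchanged; each 2-bit field of imm8
 * selects one of the four high 16-bit lanes of a for the high half. */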
|
|
|
|
r.i64[0] = a.i64[0];
|
|
for (size_t i = 4; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = a.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
|
|
}
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_shufflehi_epi16(a, imm8) \
|
|
SIMDE__M128I_C(_mm_shufflehi_epi16((a).n, (imm8)))
|
|
#elif defined(SIMDE__SHUFFLE_VECTOR)
|
|
#define simde_mm_shufflehi_epi16(a, imm8) \
|
|
({ \
|
|
const simde__m128i simde__tmp_a_ = a; \
|
|
(simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \
|
|
16, 16, (simde__tmp_a_).i16, \
|
|
(simde__tmp_a_).i16, 0, 1, 2, 3, \
|
|
(((imm8)) & 3) + 4, \
|
|
(((imm8) >> 2) & 3) + 4, \
|
|
(((imm8) >> 4) & 3) + 4, \
|
|
(((imm8) >> 6) & 3) + 4)}; \
|
|
})
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_shufflelo_epi16(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
|
|
for (size_t i = 0; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2); i++) {
|
|
r.i16[i] = a.i16[((imm8 >> (i * 2)) & 3)];
|
|
}
|
|
r.i64[1] = a.i64[1];
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_shufflelo_epi16(a, imm8) \
|
|
SIMDE__M128I_C(_mm_shufflelo_epi16((a).n, (imm8)))
|
|
#elif defined(SIMDE__SHUFFLE_VECTOR)
|
|
#define simde_mm_shufflelo_epi16(a, imm8) \
|
|
({ \
|
|
const simde__m128i simde__tmp_a_ = a; \
|
|
(simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \
|
|
16, 16, (simde__tmp_a_).i16, \
|
|
(simde__tmp_a_).i16, (((imm8)) & 3), \
|
|
(((imm8) >> 2) & 3), \
|
|
(((imm8) >> 4) & 3), \
|
|
(((imm8) >> 6) & 3), 4, 5, 6, 7)}; \
|
|
})
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sll_epi16(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sll_epi16(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 15)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
|
|
r.u16[i] = a.u16[i] << s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sll_epi32(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sll_epi32(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 31)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i] << s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sll_epi64(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sll_epi64(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 63)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] << s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_sqrt_pd(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_sqrt_pd(a.n));
|
|
#else
|
|
simde__m128d r;
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = sqrt(a.f64[i]);
|
|
}
|
|
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_sqrt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_sqrt_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.f64[0] = sqrt(b.f64[0]);
|
|
r.f64[1] = a.f64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srl_epi16(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_srl_epi16(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 15)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
|
|
r.u16[i] = a.u16[i] >> s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srl_epi32(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_srl_epi32(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 31)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
|
|
r.u32[i] = a.u32[i] >> s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srl_epi64(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_srl_epi64(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 63)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
|
|
r.u64[i] = a.u64[i] >> s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srai_epi16(simde__m128i a, int imm8)
|
|
{
|
|
simde__m128i r;
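/* Emulate an arithmetic right shift on the unsigned view: m covers the bits
 * vacated by the logical shift and is ORed in only for negative lanes. */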
|
|
|
|
const uint16_t m =
|
|
(uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - imm8));
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.u16[0])); i++) {
|
|
const uint16_t is_neg = ((uint16_t)(
|
|
((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1))));
|
|
r.u16[i] = (a.u16[i] >> imm8) | (m * is_neg);
|
|
}
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_srai_epi16(a, imm8) \
|
|
SIMDE__M128I_C(_mm_srai_epi16((a).n, (imm8)))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srai_epi32(simde__m128i a, int imm8)
|
|
{
|
|
simde__m128i r;
|
|
|
|
const uint32_t m =
|
|
(uint32_t)((~0U) << ((sizeof(int) * CHAR_BIT) - imm8));
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.u32[0])); i++) {
|
|
uint32_t is_neg = ((uint32_t)(
|
|
((a.u32[i]) >> ((sizeof(int32_t) * CHAR_BIT) - 1))));
|
|
r.u32[i] = (a.u32[i] >> imm8) | (m * is_neg);
|
|
}
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_srai_epi32(a, imm8) \
|
|
SIMDE__M128I_C(_mm_srai_epi32((a).n, (imm8)))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_srai_epi32(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
i32, \
|
|
((imm8) <= 0) \
|
|
? (a.neon_i32) \
|
|
: (((imm8) > 31) \
|
|
? (vshrq_n_s32(vshrq_n_s32(a.neon_i32, 16), \
|
|
16)) \
|
|
: (vshrq_n_s32(a.neon_i32, (imm8)))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sra_epi16(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sra_epi16(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
int cnt = (int)count.i64[0];
|
|
|
|
if (cnt > 15 || cnt < 0) {
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
|
|
i++) {
|
|
r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
|
|
}
|
|
} else {
|
|
const uint16_t m = (uint16_t)(
|
|
(~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
|
|
i++) {
|
|
const uint16_t is_neg = a.i16[i] < 0;
|
|
r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
|
|
}
|
|
}
|
|
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sra_epi32(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
|
|
return SIMDE__M128I_C(_mm_sra_epi32(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
const uint64_t cnt = count.u64[0];
|
|
|
|
if (cnt > 31) {
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
|
|
i++) {
|
|
r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
|
|
}
|
|
} else if (cnt == 0) {
|
|
memcpy(&r, &a, sizeof(r));
|
|
} else {
|
|
const uint32_t m = (uint32_t)(
|
|
(~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt));
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
|
|
i++) {
|
|
const uint32_t is_neg = a.i32[i] < 0;
|
|
r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
|
|
}
|
|
}
|
|
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_slli_epi16(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0
|
|
: imm8;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = a.i16[i] << s;
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_slli_epi16(a, imm8) SIMDE__M128I_C(_mm_slli_epi16(a.n, imm8))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_slli_epi16(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
i16, ((imm8) <= 0) \
|
|
? ((a).neon_i16) \
|
|
: (((imm8) > 31) ? (vdupq_n_s16(0)) \
|
|
: (vshlq_n_s16((a).neon_i16, \
|
|
(imm8)))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_slli_epi32(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0
|
|
: imm8;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i] << s;
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_slli_epi32(a, imm8) SIMDE__M128I_C(_mm_slli_epi32(a.n, imm8))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_slli_epi32(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
i32, ((imm8) <= 0) \
|
|
? ((a).neon_i32) \
|
|
: (((imm8) > 31) ? (vdupq_n_s32(0)) \
|
|
: (vshlq_n_s32((a).neon_i32, \
|
|
(imm8)))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_slli_epi64(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const int s = (imm8 > ((int)sizeof(r.i64[0]) * CHAR_BIT) - 1) ? 0
|
|
: imm8;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] << s;
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_slli_epi64(a, imm8) SIMDE__M128I_C(_mm_slli_epi64(a.n, imm8))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srli_epi16(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0
|
|
: imm8;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.u16[i] = a.u16[i] >> s;
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_srli_epi16(a, imm8) SIMDE__M128I_C(_mm_srli_epi16(a.n, imm8))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_srli_epi16(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
u16, ((imm8) <= 0) \
|
|
? ((a).neon_u16) \
|
|
: (((imm8) > 31) ? (vdupq_n_u16(0)) \
|
|
: (vshrq_n_u16((a).neon_u16, \
|
|
(imm8)))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srli_epi32(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0
|
|
: imm8;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.u32[i] = a.u32[i] >> s;
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_srli_epi32(a, imm8) SIMDE__M128I_C(_mm_srli_epi32(a.n, imm8))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_srli_epi32(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
u32, ((imm8) <= 0) \
|
|
? ((a).neon_u32) \
|
|
: (((imm8) > 31) ? (vdupq_n_u32(0)) \
|
|
: (vshrq_n_u32((a).neon_u32, \
|
|
(imm8)))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srli_epi64(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const unsigned char s = imm8 & 255;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
if (s > 63) {
|
|
r.u64[i] = 0;
|
|
} else {
|
|
r.u64[i] = a.u64[i] >> s;
|
|
}
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_srli_epi64(a, imm8) SIMDE__M128I_C(_mm_srli_epi64(a.n, imm8))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_srli_epi64(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
u64, \
|
|
(((imm8)&255) < 0 || ((imm8)&255) > 63) \
|
|
? (vdupq_n_u64(0)) \
|
|
: ((((imm8)&255) == 0) \
|
|
? (a.neon_u64) \
|
|
: (vshrq_n_u64((a).neon_u64, (imm8)&255))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_store_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
|
|
simde__m128d a)
|
|
{
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_store_pd(mem_addr, a.n);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
|
|
simde__m128d a)
|
|
{
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_store1_pd(mem_addr, a.n);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
mem_addr[0] = a.f64[0];
|
|
mem_addr[1] = a.f64[0];
|
|
#endif
|
|
}
|
|
#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(mem_addr, a)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_store_sd(simde_float64 *mem_addr, simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_store_sd(mem_addr, a.n);
|
|
#else
|
|
memcpy(mem_addr, &a, sizeof(a.f64[0]));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_store_si128(simde__m128i *mem_addr, simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_store_si128(&mem_addr->n, a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
vst1q_s32((int32_t *)mem_addr, a.neon_i32);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storeh_pd(simde_float64 *mem_addr, simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storeh_pd(mem_addr, a.n);
|
|
#else
|
|
*mem_addr = a.f64[1];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storel_epi64(simde__m128i *mem_addr, simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storel_epi64(&(mem_addr->n), a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
mem_addr->i64[0] = vgetq_lane_s64(a.neon_i64, 0);
|
|
#else
|
|
mem_addr->i64[0] = a.i64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storel_pd(simde_float64 *mem_addr, simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storel_pd(mem_addr, a.n);
|
|
#else
|
|
*mem_addr = a.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storer_pd(simde_float64 mem_addr[2], simde__m128d a)
|
|
{
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storer_pd(mem_addr, a.n);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
mem_addr[0] = a.f64[1];
|
|
mem_addr[1] = a.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storeu_pd(simde_float64 *mem_addr, simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storeu_pd(mem_addr, a.n);
|
|
#else
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storeu_si128(simde__m128i *mem_addr, simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storeu_si128(&mem_addr->n, a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int32_t v[4];
|
|
vst1q_s32(v, a.neon_i32);
|
|
memcpy(mem_addr, v, sizeof(v));
|
|
#else
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_stream_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
|
|
simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_stream_pd(mem_addr, a.n);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_stream_si128(simde__m128i *mem_addr, simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_stream_si128(&mem_addr->n, a.n);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_stream_si32(int32_t *mem_addr, int32_t a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_stream_si32(mem_addr, a);
|
|
#else
|
|
*mem_addr = a;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_stream_si64(int64_t *mem_addr, int64_t a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if defined(SIMDE__REALLY_GCC) && !HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
|
|
*mem_addr = a;
|
|
#elif defined(__GNUC__)
|
|
_mm_stream_si64((long long *)mem_addr, a);
|
|
#else
|
|
_mm_stream_si64(mem_addr, a);
|
|
#endif
|
|
#else
|
|
*mem_addr = a;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sub_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sub_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i8, vsubq_s8(a.neon_i8, b.neon_i8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
r.i8[i] = a.i8[i] - b.i8[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sub_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sub_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i16, vsubq_s16(a.neon_i16, b.neon_i16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = a.i16[i] - b.i16[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sub_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sub_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, vsubq_s32(a.neon_i32, b.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i] - b.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sub_epi64(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sub_epi64(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i64, vsubq_s64(a.neon_i64, b.neon_i64));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] - b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_sub_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_sub_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = a.f64[i] - b.f64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_sub_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_sub_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.f64[0] = a.f64[0] - b.f64[0];
|
|
r.f64[1] = a.f64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m64 simde_mm_sub_si64(simde__m64 a, simde__m64 b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M64_C(_mm_sub_si64(a.n, b.n));
|
|
#else
|
|
simde__m64 r;
|
|
r.i64[0] = a.i64[0] - b.i64[0];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_subs_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_subs_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i8, vqsubq_s8(a.neon_i8, b.neon_i8));
|
|
#else
|
|
simde__m128i r;
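/* Saturating subtract: clamp to INT8_MIN on negative overflow and to
 * INT8_MAX on positive overflow instead of wrapping. */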
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
|
|
if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) {
|
|
r.i8[i] = INT8_MIN;
|
|
} else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) {
|
|
r.i8[i] = INT8_MAX;
|
|
} else {
|
|
r.i8[i] = (a.i8[i]) - (b.i8[i]);
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_subs_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_subs_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i16, vqsubq_s16(a.neon_i16, b.neon_i16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
|
|
if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) {
|
|
r.i16[i] = INT16_MIN;
|
|
} else if ((b.i16[i]) < 0 &&
|
|
(a.i16[i]) > INT16_MAX + (b.i16[i])) {
|
|
r.i16[i] = INT16_MAX;
|
|
} else {
|
|
r.i16[i] = (a.i16[i]) - (b.i16[i]);
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_subs_epu8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_subs_epu8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u8, vqsubq_u8(a.neon_u8, b.neon_u8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
|
|
const int32_t x = a.u8[i] - b.u8[i];
|
|
if (x < 0) {
|
|
r.u8[i] = 0;
|
|
} else if (x > UINT8_MAX) {
|
|
r.u8[i] = UINT8_MAX;
|
|
} else {
|
|
r.u8[i] = (uint8_t)x;
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_subs_epu16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_subs_epu16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u16, vqsubq_u16(a.neon_u16, b.neon_u16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
|
|
const int32_t x = a.u16[i] - b.u16[i];
|
|
if (x < 0) {
|
|
r.u16[i] = 0;
|
|
} else if (x > UINT16_MAX) {
|
|
r.u16[i] = UINT16_MAX;
|
|
} else {
|
|
r.u16[i] = (uint16_t)x;
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomieq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomieq_sd(a.n, b.n);
|
|
#else
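/* Hold and restore the floating-point environment so the comparison does not
 * leave exception flags set, approximating the quiet (non-signalling)
 * behaviour of ucomisd. */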
|
|
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] == b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomige_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomige_sd(a.n, b.n);
|
|
#else
|
|
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] >= b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomigt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomigt_sd(a.n, b.n);
|
|
#else
|
|
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] > b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomile_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomile_sd(a.n, b.n);
|
|
#else
|
|
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] <= b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomilt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomilt_sd(a.n, b.n);
|
|
#else
|
|
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] < b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomineq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomineq_sd(a.n, b.n);
|
|
#else
|
|
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] != b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_undefined_pd(void)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
|
|
r.n = _mm_undefined_pd();
|
|
#else
|
|
r = simde_mm_setzero_pd();
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_undefined_si128(void)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
|
|
r.n = _mm_undefined_si128();
|
|
#else
|
|
r = simde_mm_setzero_si128();
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_lfence(void)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_lfence();
|
|
#else
|
|
simde_mm_sfence();
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_mfence(void)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_mfence();
|
|
#else
|
|
simde_mm_sfence();
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpackhi_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpackhi_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a.neon_i16));
|
|
int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b.neon_i16));
|
|
int8x8x2_t result = vzip_s8(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i8,
|
|
vcombine_s8(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
|
|
r.i8[(i * 2)] = a.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
|
|
r.i8[(i * 2) + 1] =
|
|
b.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpackhi_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpackhi_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int16x4_t a1 = vget_high_s16(a.neon_i16);
|
|
int16x4_t b1 = vget_high_s16(b.neon_i16);
|
|
int16x4x2_t result = vzip_s16(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i16,
|
|
vcombine_s16(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
|
|
r.i16[(i * 2)] =
|
|
a.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
|
|
r.i16[(i * 2) + 1] =
|
|
b.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpackhi_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpackhi_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int32x2_t a1 = vget_high_s32(a.neon_i32);
|
|
int32x2_t b1 = vget_high_s32(b.neon_i32);
|
|
int32x2x2_t result = vzip_s32(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i32,
|
|
vcombine_s32(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
|
|
r.i32[(i * 2)] =
|
|
a.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
|
|
r.i32[(i * 2) + 1] =
|
|
b.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpackhi_epi64(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpackhi_epi64(a.n, b.n));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
|
|
r.i64[(i * 2)] =
|
|
a.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
|
|
r.i64[(i * 2) + 1] =
|
|
b.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_unpackhi_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_unpackhi_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
|
|
r.f64[(i * 2)] =
|
|
a.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
|
|
r.f64[(i * 2) + 1] =
|
|
b.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpacklo_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpacklo_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a.neon_i16));
|
|
int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b.neon_i16));
|
|
int8x8x2_t result = vzip_s8(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i8,
|
|
vcombine_s8(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
|
|
r.i8[(i * 2)] = a.i8[i];
|
|
r.i8[(i * 2) + 1] = b.i8[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpacklo_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpacklo_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int16x4_t a1 = vget_low_s16(a.neon_i16);
|
|
int16x4_t b1 = vget_low_s16(b.neon_i16);
|
|
int16x4x2_t result = vzip_s16(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i16,
|
|
vcombine_s16(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
|
|
r.i16[(i * 2)] = a.i16[i];
|
|
r.i16[(i * 2) + 1] = b.i16[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpacklo_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpacklo_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int32x2_t a1 = vget_low_s32(a.neon_i32);
|
|
int32x2_t b1 = vget_low_s32(b.neon_i32);
|
|
int32x2x2_t result = vzip_s32(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i32,
|
|
vcombine_s32(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
|
|
r.i32[(i * 2)] = a.i32[i];
|
|
r.i32[(i * 2) + 1] = b.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpacklo_epi64(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpacklo_epi64(a.n, b.n));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
|
|
r.i64[(i * 2)] = a.i64[i];
|
|
r.i64[(i * 2) + 1] = b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_unpacklo_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_unpacklo_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
|
|
r.f64[(i * 2)] = a.f64[i];
|
|
r.f64[(i * 2) + 1] = b.f64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_xor_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_xor_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] ^ b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_xor_si128(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_xor_si128(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, veorq_s32(a.neon_i32, b.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i] ^ b.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_not_si128(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, vmvnq_s32(a.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = ~(a.i32[i]);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__END_DECLS
#endif /* !defined(SIMDE__SSE2_H) */