6366f6ab59
SIMDE was introduced for aarch64 support; however, the library itself supports a non-SIMD fallback, which allows us to provide support for other platforms without code changes. There is another world beyond x86, so we can simply enable SIMDE for processors without SSE2 support. Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
1357 lines · 30 KiB · C
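With the scalar fallback in place, code written against the MMX intrinsics API builds unchanged on x86 (native MMX), aarch64 (NEON), or any SIMD-less target. A minimal sketch of consumer code (the function name and include path are illustrative, not part of this header):

#include "mmx.h"

/* Adds eight bytes at once; every lane of `sum` evaluates to 9 regardless
 * of which backend the preprocessor selected. */
static void add_bytes_example(void)
{
	simde__m64 a = simde_mm_set_pi8(1, 2, 3, 4, 5, 6, 7, 8);
	simde__m64 b = simde_mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
	simde__m64 sum = simde_mm_add_pi8(a, b);
	(void)sum;
	simde_mm_empty(); /* no-op on non-MMX builds */
}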
/* Copyright (c) 2017-2018 Evan Nemerson <evan@nemerson.com>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if !defined(SIMDE__MMX_H)
#if !defined(SIMDE__MMX_H)
#define SIMDE__MMX_H
#endif
#include "simde-common.h"

#if defined(SIMDE_MMX_FORCE_NATIVE)
#define SIMDE_MMX_NATIVE
#elif defined(__MMX__) && !defined(SIMDE_MMX_NO_NATIVE) && \
	!defined(SIMDE_NO_NATIVE)
#define SIMDE_MMX_NATIVE
#elif defined(__ARM_NEON) && !defined(SIMDE_MMX_NO_NEON) && \
	!defined(SIMDE_NO_NEON)
#define SIMDE_MMX_NEON
#endif

#if defined(SIMDE_MMX_NATIVE)
#include <mmintrin.h>
#else
#if defined(SIMDE_MMX_NEON)
#include <arm_neon.h>
#endif
#endif
#include <stdint.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

SIMDE__BEGIN_DECLS
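
/* simde__m64 gives the 8-byte MMX register several views: lane arrays (or
 * GCC vector extensions when SIMDE__ENABLE_GCC_VEC_EXT is set) for the
 * portable path, plus the native __m64 or NEON vectors when those backends
 * are active. */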
typedef union {
#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
	int8_t i8 __attribute__((__vector_size__(8), __may_alias__));
	int16_t i16 __attribute__((__vector_size__(8), __may_alias__));
	int32_t i32 __attribute__((__vector_size__(8), __may_alias__));
	int64_t i64 __attribute__((__vector_size__(8), __may_alias__));
	uint8_t u8 __attribute__((__vector_size__(8), __may_alias__));
	uint16_t u16 __attribute__((__vector_size__(8), __may_alias__));
	uint32_t u32 __attribute__((__vector_size__(8), __may_alias__));
	uint64_t u64 __attribute__((__vector_size__(8), __may_alias__));
	simde_float32 f32 __attribute__((__vector_size__(8), __may_alias__));
#else
	int8_t i8[8];
	int16_t i16[4];
	int32_t i32[2];
	int64_t i64[1];
	uint8_t u8[8];
	uint16_t u16[4];
	uint32_t u32[2];
	uint64_t u64[1];
	simde_float32 f32[2];
#endif

#if defined(SIMDE_MMX_NATIVE)
	__m64 n;
#elif defined(SIMDE_MMX_NEON)
	int8x8_t neon_i8;
	int16x4_t neon_i16;
	int32x2_t neon_i32;
	int64x1_t neon_i64;
	uint8x8_t neon_u8;
	uint16x4_t neon_u16;
	uint32x2_t neon_u32;
	uint64x1_t neon_u64;
	float32x2_t neon_f32;
#endif
} simde__m64;

#if defined(SIMDE_MMX_NATIVE)
HEDLEY_STATIC_ASSERT(sizeof(__m64) == sizeof(simde__m64),
		     "__m64 size doesn't match simde__m64 size");
SIMDE__FUNCTION_ATTRIBUTES simde__m64 SIMDE__M64_C(__m64 v)
{
	simde__m64 r;
	r.n = v;
	return r;
}
#elif defined(SIMDE_MMX_NEON)
#define SIMDE__M64_NEON_C(T, expr) \
	(simde__m64) { .neon_##T = (expr) }
#endif
HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
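
/* Each wrapper below follows one pattern: call the native intrinsic when
 * SIMDE_MMX_NATIVE is defined, otherwise compute the result lane by lane
 * in a scalar loop that SIMDE__VECTORIZE invites the compiler to
 * auto-vectorize. */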

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_add_pi8(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < 8; i++) {
		r.i8[i] = a.i8[i] + b.i8[i];
	}
	return r;
#endif
}
#define simde_m_paddb(a, b) simde_mm_add_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_add_pi16(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		r.i16[i] = a.i16[i] + b.i16[i];
	}
	return r;
#endif
}
#define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_add_pi32(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
		r.i32[i] = a.i32[i] + b.i32[i];
	}
	return r;
#endif
}
#define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
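
/* The saturating adds below clamp instead of wrapping: signed lanes to
 * [INT8_MIN, INT8_MAX] or [INT16_MIN, INT16_MAX], unsigned lanes to
 * [0, UINT8_MAX] or [0, UINT16_MAX]. */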

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_adds_pi8(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (int i = 0; i < 8; i++) {
		if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) {
			r.i8[i] = INT8_MAX;
		} else if ((((b.i8[i]) < 0) &&
			    ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) {
			r.i8[i] = INT8_MIN;
		} else {
			r.i8[i] = (a.i8[i]) + (b.i8[i]);
		}
	}
	return r;
#endif
}
#define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pu8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_adds_pu8(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < 8; i++) {
		const int32_t x = a.u8[i] + b.u8[i];
		if (x < 0)
			r.u8[i] = 0;
		else if (x > UINT8_MAX)
			r.u8[i] = UINT8_MAX;
		else
			r.u8[i] = (uint8_t)x;
	}
	return r;
#endif
}
#define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_adds_pi16(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (int i = 0; i < 4; i++) {
		if ((((b.i16[i]) > 0) &&
		     ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) {
			r.i16[i] = INT16_MAX;
		} else if ((((b.i16[i]) < 0) &&
			    ((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) {
			r.i16[i] = INT16_MIN;
		} else {
			r.i16[i] = (a.i16[i]) + (b.i16[i]);
		}
	}
	return r;
#endif
}
#define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pu16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_adds_pu16(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		const uint32_t x = a.u16[i] + b.u16[i];
		if (x > UINT16_MAX)
			r.u16[i] = UINT16_MAX;
		else
			r.u16[i] = (uint16_t)x;
	}
	return r;
#endif
}
#define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_and_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_and_si64(a.n, b.n));
#else
	simde__m64 r;
	r.i64[0] = a.i64[0] & b.i64[0];
	return r;
#endif
}
#define simde_m_pand(a, b) simde_mm_and_si64(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_andnot_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_andnot_si64(a.n, b.n));
#else
	simde__m64 r;
	r.i64[0] = ~(a.i64[0]) & b.i64[0];
	return r;
#endif
}
#define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b)
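
/* Comparisons return a mask per lane: all ones where the predicate holds,
 * all zeros where it does not. */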

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpeq_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpeq_pi8(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (int i = 0; i < 8; i++) {
		r.i8[i] = (a.i8[i] == b.i8[i]) * 0xff;
	}
	return r;
#endif
}
#define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpeq_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpeq_pi16(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (int i = 0; i < 4; i++) {
		r.i16[i] = (a.i16[i] == b.i16[i]) * 0xffff;
	}
	return r;
#endif
}
#define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpeq_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpeq_pi32(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (int i = 0; i < 2; i++) {
		r.i32[i] = (a.i32[i] == b.i32[i]) * 0xffffffff;
	}
	return r;
#endif
}
#define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpgt_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpgt_pi8(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (int i = 0; i < 8; i++) {
		r.i8[i] = (a.i8[i] > b.i8[i]) * 0xff;
	}
	return r;
#endif
}
#define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpgt_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpgt_pi16(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (int i = 0; i < 4; i++) {
		r.i16[i] = (a.i16[i] > b.i16[i]) * 0xffff;
	}
	return r;
#endif
}
#define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpgt_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpgt_pi32(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (int i = 0; i < 2; i++) {
		r.i32[i] = (a.i32[i] > b.i32[i]) * 0xffffffff;
	}
	return r;
#endif
}
#define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
int64_t simde_mm_cvtm64_si64(simde__m64 a)
{
#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
	return _mm_cvtm64_si64(a.n);
#else
	return a.i64[0];
#endif
}
#define simde_m_to_int64(a) simde_mm_cvtm64_si64(a)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cvtsi32_si64(int32_t a)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cvtsi32_si64(a));
#else
	simde__m64 r;
	r.i32[0] = a;
	r.i32[1] = 0;
	return r;
#endif
}
#define simde_m_from_int(a) simde_mm_cvtsi32_si64(a)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cvtsi64_m64(int64_t a)
{
#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
	return SIMDE__M64_C(_mm_cvtsi64_m64(a));
#else
	simde__m64 r;
	r.i64[0] = a;
	return r;
#endif
}
#define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a)

SIMDE__FUNCTION_ATTRIBUTES
int32_t simde_mm_cvtsi64_si32(simde__m64 a)
{
#if defined(SIMDE_MMX_NATIVE)
	return _mm_cvtsi64_si32(a.n);
#else
	return a.i32[0];
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_empty(void)
{
#if defined(SIMDE_MMX_NATIVE)
	_mm_empty();
#else
#endif
}
#define simde_m_empty() simde_mm_empty()
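
/* madd multiplies 16-bit lanes pairwise and sums adjacent products into
 * two 32-bit lanes; mulhi keeps the high 16 bits of each 32-bit product,
 * mullo the low 16 bits. */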

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_madd_pi16(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (int i = 0; i < 4; i += 2) {
		r.i32[i / 2] =
			(a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]);
	}
	return r;
#endif
}
#define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_mulhi_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_mulhi_pi16(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (int i = 0; i < 4; i++) {
		r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) >> 16);
	}
	return r;
#endif
}
#define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_mullo_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_mullo_pi16(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (int i = 0; i < 4; i++) {
		r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) & 0xffff);
	}
	return r;
#endif
}
#define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_or_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_or_si64(a.n, b.n));
#else
	simde__m64 r;
	r.i64[0] = a.i64[0] | b.i64[0];
	return r;
#endif
}
#define simde_m_por(a, b) simde_mm_or_si64(a, b)
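
/* The pack functions narrow each lane to half width with saturation and
 * concatenate: lanes from a fill the low half of the result, lanes from b
 * the high half. */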

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_packs_pi16(a.n, b.n));
#else
	simde__m64 r;

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		if (a.i16[i] < INT8_MIN) {
			r.i8[i] = INT8_MIN;
		} else if (a.i16[i] > INT8_MAX) {
			r.i8[i] = INT8_MAX;
		} else {
			r.i8[i] = (int8_t)a.i16[i];
		}
	}

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		if (b.i16[i] < INT8_MIN) {
			r.i8[i + 4] = INT8_MIN;
		} else if (b.i16[i] > INT8_MAX) {
			r.i8[i + 4] = INT8_MAX;
		} else {
			r.i8[i + 4] = (int8_t)b.i16[i];
		}
	}

	return r;
#endif
}
#define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_packs_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_packs_pi32(a.n, b.n));
#else
	simde__m64 r;

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(a.i32[0])); i++) {
		if (a.i32[i] < INT16_MIN) {
			r.i16[i] = INT16_MIN;
		} else if (a.i32[i] > INT16_MAX) {
			r.i16[i] = INT16_MAX;
		} else {
			r.i16[i] = (int16_t)a.i32[i];
		}
	}

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(b.i32[0])); i++) {
		if (b.i32[i] < INT16_MIN) {
			r.i16[i + 2] = INT16_MIN;
		} else if (b.i32[i] > INT16_MAX) {
			r.i16[i + 2] = INT16_MAX;
		} else {
			r.i16[i + 2] = (int16_t)b.i32[i];
		}
	}

	return r;
#endif
}
#define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_packs_pu16(a.n, b.n));
#else
	simde__m64 r;

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		if (a.i16[i] > UINT8_MAX) {
			r.u8[i] = UINT8_MAX;
		} else if (a.i16[i] < 0) {
			r.u8[i] = 0;
		} else {
			r.u8[i] = (uint8_t)a.i16[i];
		}
	}

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		if (b.i16[i] > UINT8_MAX) {
			r.u8[i + 4] = UINT8_MAX;
		} else if (b.i16[i] < 0) {
			r.u8[i + 4] = 0;
		} else {
			r.u8[i + 4] = (uint8_t)b.i16[i];
		}
	}

	return r;
#endif
}
#define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
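
/* Argument order matches the native _mm_set_* intrinsics: set_pi8(e7, ...,
 * e0) stores e0 in the lowest lane, while the setr variants take elements
 * in memory order. */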

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
			    int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0));
#else
	simde__m64 r;
	r.i8[0] = e0;
	r.i8[1] = e1;
	r.i8[2] = e2;
	r.i8[3] = e3;
	r.i8[4] = e4;
	r.i8[5] = e5;
	r.i8[6] = e6;
	r.i8[7] = e7;
	return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
			      uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi8((int8_t)e7, (int8_t)e6, (int8_t)e5,
					(int8_t)e4, (int8_t)e3, (int8_t)e2,
					(int8_t)e1, (int8_t)e0));
#else
	simde__m64 r;
	r.u8[0] = e0;
	r.u8[1] = e1;
	r.u8[2] = e2;
	r.u8[3] = e3;
	r.u8[4] = e4;
	r.u8[5] = e5;
	r.u8[6] = e6;
	r.u8[7] = e7;
	return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi16(e3, e2, e1, e0));
#else
	simde__m64 r;
	r.i16[0] = e0;
	r.i16[1] = e1;
	r.i16[2] = e2;
	r.i16[3] = e3;
	return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_pu16(uint16_t e3, uint16_t e2, uint16_t e1,
			       uint16_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi16((int16_t)e3, (int16_t)e2, (int16_t)e1,
					 (int16_t)e0));
#else
	simde__m64 r;
	r.u16[0] = e0;
	r.u16[1] = e1;
	r.u16[2] = e2;
	r.u16[3] = e3;
	return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_pu32(uint32_t e1, uint32_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi32((int32_t)e1, (int32_t)e0));
#else
	simde__m64 r;
	r.u32[0] = e0;
	r.u32[1] = e1;
	return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set_pi32(int32_t e1, int32_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi32(e1, e0));
#else
	simde__m64 r;
	r.i32[0] = e0;
	r.i32[1] = e1;
	return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set1_pi8(int8_t a)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set1_pi8(a));
#else
	return simde_mm_set_pi8(a, a, a, a, a, a, a, a);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set1_pi16(int16_t a)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set1_pi16(a));
#else
	return simde_mm_set_pi16(a, a, a, a);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set1_pi32(int32_t a)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set1_pi32(a));
#else
	return simde_mm_set_pi32(a, a);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
			     int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0));
#else
	return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_setr_pi16(e3, e2, e1, e0));
#else
	return simde_mm_set_pi16(e0, e1, e2, e3);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setr_pi32(int32_t e1, int32_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_setr_pi32(e1, e0));
#else
	return simde_mm_set_pi32(e0, e1);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setzero_si64(void)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_setzero_si64());
#else
	return simde_mm_set_pi32(0, 0);
#endif
}
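
/* Per MMX semantics, the register-count shifts (sll/srl) yield zero when
 * the count exceeds the lane width; the fallbacks check the count first,
 * since an oversized shift is undefined behavior in C. */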

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sll_pi16(a.n, count.n));
#else
	simde__m64 r;

	if (HEDLEY_UNLIKELY(count.u64[0] > 15)) {
		memset(&r, 0, sizeof(r));
		return r;
	}

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
		r.u16[i] = a.u16[i] << count.u64[0];
	}
	return r;
#endif
}
#define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sll_pi32(a.n, count.n));
#else
	simde__m64 r;

	if (HEDLEY_UNLIKELY(count.u64[0] > 31)) {
		memset(&r, 0, sizeof(r));
		return r;
	}

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
		r.u32[i] = a.u32[i] << count.u64[0];
	}
	return r;
#endif
}
#define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_slli_pi16(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_slli_pi16(a.n, count));
#else
	simde__m64 r;

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
		r.u16[i] = a.u16[i] << count;
	}

	return r;
#endif
}
#define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_slli_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_slli_pi32(a.n, count));
#else
	simde__m64 r;

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
		r.u32[i] = a.u32[i] << count;
	}

	return r;
#endif
}
#define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_slli_si64(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_slli_si64(a.n, count));
#else
	simde__m64 r;
	r.u64[0] = a.u64[0] << count;
	return r;
#endif
}
#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sll_si64(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sll_si64(a.n, count.n));
#else
	simde__m64 r;

	if (HEDLEY_UNLIKELY(count.u64[0] > 63)) {
		memset(&r, 0, sizeof(r));
		return r;
	}

	r.u64[0] = a.u64[0] << count.u64[0];

	return r;
#endif
}
#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_srl_pi16(a.n, count.n));
#else
	simde__m64 r;

	if (HEDLEY_UNLIKELY(count.u64[0] > 15)) {
		memset(&r, 0, sizeof(r));
		return r;
	}

	SIMDE__VECTORIZE
	for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) {
		r.u16[i] = a.u16[i] >> count.u64[0];
	}
	return r;
#endif
}
#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_pi32(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_srl_pi32(a.n, count.n));
#else
	simde__m64 r;

	if (HEDLEY_UNLIKELY(count.u64[0] > 31)) {
		memset(&r, 0, sizeof(r));
		return r;
	}

	SIMDE__VECTORIZE
	for (size_t i = 0; i < sizeof(r.u32) / sizeof(r.u32[0]); i++) {
		r.u32[i] = a.u32[i] >> count.u64[0];
	}
	return r;
#endif
}
#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_pi16(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_srli_pi16(a.n, count));
#else
	simde__m64 r;

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) {
		r.u16[i] = a.u16[i] >> count;
	}

	return r;
#endif
}
#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_srli_pi32(a.n, count));
#else
	simde__m64 r;

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
		r.u32[i] = a.u32[i] >> count;
	}

	return r;
#endif
}
#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_si64(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_srli_si64(a.n, count));
#else
	simde__m64 r;
	r.u64[0] = a.u64[0] >> count;
	return r;
#endif
}
#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_si64(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_srl_si64(a.n, count.n));
#else
	simde__m64 r;

	if (HEDLEY_UNLIKELY(count.u64[0] > 63)) {
		memset(&r, 0, sizeof(r));
		return r;
	}

	r.u64[0] = a.u64[0] >> count.u64[0];
	return r;
#endif
}
#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
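
/* Right-shifting a negative value is implementation-defined in C, so the
 * arithmetic shifts below shift the unsigned view and OR in a sign mask
 * (m) for lanes that started negative. */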

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srai_pi16(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_srai_pi16(a.n, count));
#else
	simde__m64 r;

	const uint16_t m =
		(uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - count));

	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		const uint16_t is_neg = ((uint16_t)(
			((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1))));
		r.u16[i] = (a.u16[i] >> count) | (m * is_neg);
	}

	return r;
#endif
}
#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srai_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_srai_pi32(a.n, count));
#else
	simde__m64 r;

	const uint32_t m =
		(uint32_t)((~0U) << ((sizeof(int32_t) * CHAR_BIT) - count));
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
		const uint32_t is_neg = ((uint32_t)(
			((a.u32[i]) >> ((sizeof(int32_t) * CHAR_BIT) - 1))));
		r.u32[i] = (a.u32[i] >> count) | (m * is_neg);
	}

	return r;
#endif
}
#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sra_pi16(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sra_pi16(a.n, count.n));
#else
	simde__m64 r;
	int cnt = (int)count.i64[0];

	if (cnt > 15 || cnt < 0) {
		for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
		     i++) {
			r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
		}
	} else {
		const uint16_t m = (uint16_t)(
			(~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
		for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
		     i++) {
			const uint16_t is_neg = a.i16[i] < 0;
			r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
		}
	}

	return r;
#endif
}
#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sra_pi32(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sra_pi32(a.n, count.n));
#else
	simde__m64 r;
	const uint64_t cnt = count.u64[0];

	if (cnt > 31) {
		for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
		     i++) {
			r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
		}
	} else if (cnt == 0) {
		memcpy(&r, &a, sizeof(r));
	} else {
		const uint32_t m = (uint32_t)(
			(~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt));
		for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
		     i++) {
			const uint32_t is_neg = a.i32[i] < 0;
			r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
		}
	}

	return r;
#endif
}
#define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sub_pi8(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < 8; i++) {
		r.i8[i] = a.i8[i] - b.i8[i];
	}
	return r;
#endif
}
#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sub_pi16(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		r.i16[i] = a.i16[i] - b.i16[i];
	}
	return r;
#endif
}
#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sub_pi32(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
		r.i32[i] = a.i32[i] - b.i32[i];
	}
	return r;
#endif
}
#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_subs_pi8(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8); i++) {
		if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) {
			r.i8[i] = INT8_MIN;
		} else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) {
			r.i8[i] = INT8_MAX;
		} else {
			r.i8[i] = (a.i8[i]) - (b.i8[i]);
		}
	}
	return r;
#endif
}
#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_subs_pu8(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8); i++) {
		const int32_t x = a.u8[i] - b.u8[i];
		if (x < 0) {
			r.u8[i] = 0;
		} else if (x > UINT8_MAX) {
			r.u8[i] = UINT8_MAX;
		} else {
			r.u8[i] = (uint8_t)x;
		}
	}
	return r;
#endif
}
#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_subs_pi16(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) {
			r.i16[i] = INT16_MIN;
		} else if ((b.i16[i]) < 0 &&
			   (a.i16[i]) > INT16_MAX + (b.i16[i])) {
			r.i16[i] = INT16_MAX;
		} else {
			r.i16[i] = (a.i16[i]) - (b.i16[i]);
		}
	}
	return r;
#endif
}
#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_subs_pu16(a.n, b.n));
#else
	simde__m64 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) {
		const int x = a.u16[i] - b.u16[i];
		if (x < 0) {
			r.u16[i] = 0;
		} else if (x > UINT16_MAX) {
			r.u16[i] = UINT16_MAX;
		} else {
			r.u16[i] = (uint16_t)x;
		}
	}
	return r;
#endif
}
#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
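
/* The unpack functions interleave the high or low halves of a and b, e.g.
 * unpacklo_pi8(a, b) = { a0, b0, a1, b1, a2, b2, a3, b3 }. */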

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpackhi_pi8(a.n, b.n));
#else
	simde__m64 r;
	r.i8[0] = a.i8[4];
	r.i8[1] = b.i8[4];
	r.i8[2] = a.i8[5];
	r.i8[3] = b.i8[5];
	r.i8[4] = a.i8[6];
	r.i8[5] = b.i8[6];
	r.i8[6] = a.i8[7];
	r.i8[7] = b.i8[7];
	return r;
#endif
}
#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpackhi_pi16(a.n, b.n));
#else
	simde__m64 r;
	r.i16[0] = a.i16[2];
	r.i16[1] = b.i16[2];
	r.i16[2] = a.i16[3];
	r.i16[3] = b.i16[3];
	return r;
#endif
}
#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpackhi_pi32(a.n, b.n));
#else
	simde__m64 r;
	r.i32[0] = a.i32[1];
	r.i32[1] = b.i32[1];
	return r;
#endif
}
#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpacklo_pi8(a.n, b.n));
#else
	simde__m64 r;
	r.i8[0] = a.i8[0];
	r.i8[1] = b.i8[0];
	r.i8[2] = a.i8[1];
	r.i8[3] = b.i8[1];
	r.i8[4] = a.i8[2];
	r.i8[5] = b.i8[2];
	r.i8[6] = a.i8[3];
	r.i8[7] = b.i8[3];
	return r;
#endif
}
#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpacklo_pi16(a.n, b.n));
#else
	simde__m64 r;
	r.i16[0] = a.i16[0];
	r.i16[1] = b.i16[0];
	r.i16[2] = a.i16[1];
	r.i16[3] = b.i16[1];
	return r;
#endif
}
#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpacklo_pi32(a.n, b.n));
#else
	simde__m64 r;
	r.i32[0] = a.i32[0];
	r.i32[1] = b.i32[0];
	return r;
#endif
}
#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_xor_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_xor_si64(a.n, b.n));
#else
	simde__m64 r;
	r.i64[0] = a.i64[0] ^ b.i64[0];
	return r;
#endif
}
#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)

SIMDE__FUNCTION_ATTRIBUTES
int32_t simde_m_to_int(simde__m64 a)
{
#if defined(SIMDE_MMX_NATIVE)
	return _m_to_int(a.n);
#else
	return a.i32[0];
#endif
}

SIMDE__END_DECLS

#endif /* !defined(SIMDE__MMX_H) */