update C headers to clang 6.0.0rc3
parent
1ba6e1641a
commit
4955c4b8f9
|
@ -541,10 +541,12 @@ set(ZIG_C_HEADER_FILES
|
|||
"adxintrin.h"
|
||||
"altivec.h"
|
||||
"ammintrin.h"
|
||||
"arm64intr.h"
|
||||
"arm_acle.h"
|
||||
"arm_neon.h"
|
||||
"armintr.h"
|
||||
"avx2intrin.h"
|
||||
"avx512bitalgintrin.h"
|
||||
"avx512bwintrin.h"
|
||||
"avx512cdintrin.h"
|
||||
"avx512dqintrin.h"
|
||||
|
@ -553,17 +555,25 @@ set(ZIG_C_HEADER_FILES
|
|||
"avx512ifmaintrin.h"
|
||||
"avx512ifmavlintrin.h"
|
||||
"avx512pfintrin.h"
|
||||
"avx512vbmi2intrin.h"
|
||||
"avx512vbmiintrin.h"
|
||||
"avx512vbmivlintrin.h"
|
||||
"avx512vlbitalgintrin.h"
|
||||
"avx512vlbwintrin.h"
|
||||
"avx512vlcdintrin.h"
|
||||
"avx512vldqintrin.h"
|
||||
"avx512vlintrin.h"
|
||||
"avx512vlvbmi2intrin.h"
|
||||
"avx512vlvnniintrin.h"
|
||||
"avx512vnniintrin.h"
|
||||
"avx512vpopcntdqintrin.h"
|
||||
"avx512vpopcntdqvlintrin.h"
|
||||
"avxintrin.h"
|
||||
"bmi2intrin.h"
|
||||
"bmiintrin.h"
|
||||
"cetintrin.h"
|
||||
"clflushoptintrin.h"
|
||||
"clwbintrin.h"
|
||||
"clzerointrin.h"
|
||||
"cpuid.h"
|
||||
"cuda_wrappers/algorithm"
|
||||
|
@ -575,6 +585,7 @@ set(ZIG_C_HEADER_FILES
|
|||
"fma4intrin.h"
|
||||
"fmaintrin.h"
|
||||
"fxsrintrin.h"
|
||||
"gfniintrin.h"
|
||||
"htmintrin.h"
|
||||
"htmxlintrin.h"
|
||||
"ia32intrin.h"
|
||||
|
@ -614,8 +625,10 @@ set(ZIG_C_HEADER_FILES
|
|||
"tmmintrin.h"
|
||||
"unwind.h"
|
||||
"vadefs.h"
|
||||
"vaesintrin.h"
|
||||
"varargs.h"
|
||||
"vecintrin.h"
|
||||
"vpclmulqdqintrin.h"
|
||||
"wmmintrin.h"
|
||||
"x86intrin.h"
|
||||
"xmmintrin.h"
|
||||
|
|
|
@ -131,15 +131,6 @@ __DEVICE__ float ldexp(float __arg, int __exp) {
|
|||
__DEVICE__ float log(float __x) { return ::logf(__x); }
|
||||
__DEVICE__ float log10(float __x) { return ::log10f(__x); }
|
||||
__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
|
||||
__DEVICE__ float nexttoward(float __from, double __to) {
|
||||
return __builtin_nexttowardf(__from, __to);
|
||||
}
|
||||
__DEVICE__ double nexttoward(double __from, double __to) {
|
||||
return __builtin_nexttoward(__from, __to);
|
||||
}
|
||||
__DEVICE__ float nexttowardf(float __from, double __to) {
|
||||
return __builtin_nexttowardf(__from, __to);
|
||||
}
|
||||
__DEVICE__ float pow(float __base, float __exp) {
|
||||
return ::powf(__base, __exp);
|
||||
}
|
||||
|
@ -157,6 +148,10 @@ __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
|
|||
__DEVICE__ float tan(float __x) { return ::tanf(__x); }
|
||||
__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
|
||||
|
||||
// Notably missing above is nexttoward. We omit it because
|
||||
// libdevice doesn't provide an implementation, and we don't want to be in the
|
||||
// business of implementing tricky libm functions in this header.
|
||||
|
||||
// Now we've defined everything we promised we'd define in
|
||||
// __clang_cuda_math_forward_declares.h. We need to do two additional things to
|
||||
// fix up our math functions.
|
||||
|
@ -295,13 +290,6 @@ ldexp(__T __x, int __exp) {
|
|||
return std::ldexp((double)__x, __exp);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
|
||||
double>::type
|
||||
nexttoward(__T __from, double __to) {
|
||||
return std::nexttoward((double)__from, __to);
|
||||
}
|
||||
|
||||
template <typename __T1, typename __T2>
|
||||
__DEVICE__ typename __clang_cuda_enable_if<
|
||||
std::numeric_limits<__T1>::is_specialized &&
|
||||
|
@ -388,7 +376,6 @@ using ::lrint;
|
|||
using ::lround;
|
||||
using ::nearbyint;
|
||||
using ::nextafter;
|
||||
using ::nexttoward;
|
||||
using ::pow;
|
||||
using ::remainder;
|
||||
using ::remquo;
|
||||
|
@ -456,8 +443,6 @@ using ::lroundf;
|
|||
using ::modff;
|
||||
using ::nearbyintf;
|
||||
using ::nextafterf;
|
||||
using ::nexttowardf;
|
||||
using ::nexttowardf;
|
||||
using ::powf;
|
||||
using ::remainderf;
|
||||
using ::remquof;
|
||||
|
|
|
@ -34,23 +34,24 @@
|
|||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
||||
|
||||
#pragma push_macro("__MAKE_SHUFFLES")
|
||||
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \
|
||||
inline __device__ int __FnName(int __val, int __offset, \
|
||||
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
|
||||
__Type) \
|
||||
inline __device__ int __FnName(int __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return __IntIntrinsic(__val, __offset, \
|
||||
((warpSize - __width) << 8) | (__Mask)); \
|
||||
} \
|
||||
inline __device__ float __FnName(float __val, int __offset, \
|
||||
inline __device__ float __FnName(float __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return __FloatIntrinsic(__val, __offset, \
|
||||
((warpSize - __width) << 8) | (__Mask)); \
|
||||
} \
|
||||
inline __device__ unsigned int __FnName(unsigned int __val, int __offset, \
|
||||
inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return static_cast<unsigned int>( \
|
||||
::__FnName(static_cast<int>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ long long __FnName(long long __val, int __offset, \
|
||||
inline __device__ long long __FnName(long long __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
struct __Bits { \
|
||||
int __a, __b; \
|
||||
|
@ -65,12 +66,29 @@
|
|||
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
|
||||
return __ret; \
|
||||
} \
|
||||
inline __device__ long __FnName(long __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
_Static_assert(sizeof(long) == sizeof(long long) || \
|
||||
sizeof(long) == sizeof(int)); \
|
||||
if (sizeof(long) == sizeof(long long)) { \
|
||||
return static_cast<long>( \
|
||||
::__FnName(static_cast<long long>(__val), __offset, __width)); \
|
||||
} else if (sizeof(long) == sizeof(int)) { \
|
||||
return static_cast<long>( \
|
||||
::__FnName(static_cast<int>(__val), __offset, __width)); \
|
||||
} \
|
||||
} \
|
||||
inline __device__ unsigned long __FnName( \
|
||||
unsigned long __val, __Type __offset, int __width = warpSize) { \
|
||||
return static_cast<unsigned long>( \
|
||||
::__FnName(static_cast<long>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ unsigned long long __FnName( \
|
||||
unsigned long long __val, int __offset, int __width = warpSize) { \
|
||||
unsigned long long __val, __Type __offset, int __width = warpSize) { \
|
||||
return static_cast<unsigned long long>(::__FnName( \
|
||||
static_cast<unsigned long long>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ double __FnName(double __val, int __offset, \
|
||||
inline __device__ double __FnName(double __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
long long __tmp; \
|
||||
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
|
||||
|
@ -81,13 +99,15 @@
|
|||
return __ret; \
|
||||
}
|
||||
|
||||
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
|
||||
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
|
||||
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
|
||||
// maxLane.
|
||||
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
|
||||
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
|
||||
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
|
||||
|
||||
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
|
||||
unsigned int);
|
||||
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
|
||||
unsigned int);
|
||||
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
|
||||
int);
|
||||
#pragma pop_macro("__MAKE_SHUFFLES")
|
||||
|
||||
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
||||
|
@ -97,25 +117,26 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
|
|||
// __shfl_sync_* variants available in CUDA-9
|
||||
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
|
||||
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
|
||||
__Mask) \
|
||||
inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \
|
||||
int __width = warpSize) { \
|
||||
__Mask, __Type) \
|
||||
inline __device__ int __FnName(unsigned int __mask, int __val, \
|
||||
__Type __offset, int __width = warpSize) { \
|
||||
return __IntIntrinsic(__mask, __val, __offset, \
|
||||
((warpSize - __width) << 8) | (__Mask)); \
|
||||
} \
|
||||
inline __device__ float __FnName(unsigned int __mask, float __val, \
|
||||
int __offset, int __width = warpSize) { \
|
||||
__Type __offset, int __width = warpSize) { \
|
||||
return __FloatIntrinsic(__mask, __val, __offset, \
|
||||
((warpSize - __width) << 8) | (__Mask)); \
|
||||
} \
|
||||
inline __device__ unsigned int __FnName(unsigned int __mask, \
|
||||
unsigned int __val, int __offset, \
|
||||
unsigned int __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return static_cast<unsigned int>( \
|
||||
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ long long __FnName(unsigned int __mask, long long __val, \
|
||||
int __offset, int __width = warpSize) { \
|
||||
__Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
struct __Bits { \
|
||||
int __a, __b; \
|
||||
}; \
|
||||
|
@ -130,13 +151,31 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
|
|||
return __ret; \
|
||||
} \
|
||||
inline __device__ unsigned long long __FnName( \
|
||||
unsigned int __mask, unsigned long long __val, int __offset, \
|
||||
unsigned int __mask, unsigned long long __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return static_cast<unsigned long long>(::__FnName( \
|
||||
__mask, static_cast<unsigned long long>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ long __FnName(unsigned int __mask, long __val, \
|
||||
__Type __offset, int __width = warpSize) { \
|
||||
_Static_assert(sizeof(long) == sizeof(long long) || \
|
||||
sizeof(long) == sizeof(int)); \
|
||||
if (sizeof(long) == sizeof(long long)) { \
|
||||
return static_cast<long>(::__FnName( \
|
||||
__mask, static_cast<long long>(__val), __offset, __width)); \
|
||||
} else if (sizeof(long) == sizeof(int)) { \
|
||||
return static_cast<long>( \
|
||||
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
|
||||
} \
|
||||
} \
|
||||
inline __device__ unsigned long __FnName( \
|
||||
unsigned int __mask, unsigned long __val, __Type __offset, \
|
||||
int __width = warpSize) { \
|
||||
return static_cast<unsigned long>( \
|
||||
::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
|
||||
} \
|
||||
inline __device__ double __FnName(unsigned int __mask, double __val, \
|
||||
int __offset, int __width = warpSize) { \
|
||||
__Type __offset, int __width = warpSize) { \
|
||||
long long __tmp; \
|
||||
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
|
||||
memcpy(&__tmp, &__val, sizeof(__val)); \
|
||||
|
@ -146,15 +185,15 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
|
|||
return __ret; \
|
||||
}
|
||||
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
|
||||
__nvvm_shfl_sync_idx_f32, 0x1f);
|
||||
__nvvm_shfl_sync_idx_f32, 0x1f, int);
|
||||
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
|
||||
// maxLane.
|
||||
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
|
||||
__nvvm_shfl_sync_up_f32, 0);
|
||||
__nvvm_shfl_sync_up_f32, 0, unsigned int);
|
||||
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
|
||||
__nvvm_shfl_sync_down_f32, 0x1f);
|
||||
__nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
|
||||
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
|
||||
__nvvm_shfl_sync_bfly_f32, 0x1f);
|
||||
__nvvm_shfl_sync_bfly_f32, 0x1f, int);
|
||||
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
|
||||
|
||||
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
|
||||
|
@ -188,6 +227,10 @@ inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
|
|||
|
||||
inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
|
||||
|
||||
inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
|
||||
return __nvvm_fns(mask, base, offset);
|
||||
}
|
||||
|
||||
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
||||
|
||||
// Define __match* builtins CUDA-9 headers expect to see.
|
||||
|
|
|
@ -149,9 +149,6 @@ __DEVICE__ double nearbyint(double);
|
|||
__DEVICE__ float nearbyint(float);
|
||||
__DEVICE__ double nextafter(double, double);
|
||||
__DEVICE__ float nextafter(float, float);
|
||||
__DEVICE__ double nexttoward(double, double);
|
||||
__DEVICE__ float nexttoward(float, double);
|
||||
__DEVICE__ float nexttowardf(float, double);
|
||||
__DEVICE__ double pow(double, double);
|
||||
__DEVICE__ double pow(double, int);
|
||||
__DEVICE__ float pow(float, float);
|
||||
|
@ -185,6 +182,10 @@ __DEVICE__ float tgamma(float);
|
|||
__DEVICE__ double trunc(double);
|
||||
__DEVICE__ float trunc(float);
|
||||
|
||||
// Notably missing above is nexttoward, which we don't define on
|
||||
// the device side because libdevice doesn't give us an implementation, and we
|
||||
// don't want to be in the business of writing one ourselves.
|
||||
|
||||
// We need to define these overloads in exactly the namespace our standard
|
||||
// library uses (including the right inline namespace), otherwise they won't be
|
||||
// picked up by other functions in the standard library (e.g. functions in
|
||||
|
@ -255,7 +256,6 @@ using ::nan;
|
|||
using ::nanf;
|
||||
using ::nearbyint;
|
||||
using ::nextafter;
|
||||
using ::nexttoward;
|
||||
using ::pow;
|
||||
using ::remainder;
|
||||
using ::remquo;
|
||||
|
|
|
@ -270,12 +270,18 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
|
|||
// include guard from math.h wrapper from libstdc++. We have to undo the header
|
||||
// guard temporarily to get the definitions we need.
|
||||
#pragma push_macro("_GLIBCXX_MATH_H")
|
||||
#pragma push_macro("_LIBCPP_VERSION")
|
||||
#if CUDA_VERSION >= 9000
|
||||
#undef _GLIBCXX_MATH_H
|
||||
// We also need to undo another guard that checks for libc++ 3.8+
|
||||
#ifdef _LIBCPP_VERSION
|
||||
#define _LIBCPP_VERSION 3700
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "math_functions.hpp"
|
||||
#pragma pop_macro("_GLIBCXX_MATH_H")
|
||||
#pragma pop_macro("_LIBCPP_VERSION")
|
||||
#pragma pop_macro("__GNUC__")
|
||||
#pragma pop_macro("signbit")
|
||||
|
||||
|
|
8464
c_headers/arm_neon.h
8464
c_headers/arm_neon.h
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,97 @@
|
|||
/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512BITALGINTRIN_H
|
||||
#define __AVX512BITALGINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg")))
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_popcnt_epi16(__m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
|
||||
(__v32hi) _mm512_popcnt_epi16(__B),
|
||||
(__v32hi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
|
||||
{
|
||||
return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_hi(),
|
||||
__U,
|
||||
__B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_popcnt_epi8(__m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
|
||||
(__v64qi) _mm512_popcnt_epi8(__B),
|
||||
(__v64qi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
|
||||
{
|
||||
return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_qi(),
|
||||
__U,
|
||||
__B);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
|
||||
(__v64qi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
|
||||
{
|
||||
return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
|
||||
__A,
|
||||
__B);
|
||||
}
|
||||
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -56,293 +56,145 @@ _mm512_setzero_hi(void) {
|
|||
|
||||
/* Integer compare */
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
|
||||
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
|
||||
(__v64qi)(__m512i)(b), (int)(p), \
|
||||
(__mmask64)-1); })
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
|
||||
__u);
|
||||
}
|
||||
#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
|
||||
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
|
||||
(__v64qi)(__m512i)(b), (int)(p), \
|
||||
(__mmask64)(m)); })
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
|
||||
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
|
||||
(__v64qi)(__m512i)(b), (int)(p), \
|
||||
(__mmask64)-1); })
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
|
||||
__u);
|
||||
}
|
||||
#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
|
||||
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
|
||||
(__v64qi)(__m512i)(b), (int)(p), \
|
||||
(__mmask64)(m)); })
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
|
||||
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
|
||||
(__v32hi)(__m512i)(b), (int)(p), \
|
||||
(__mmask32)-1); })
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
|
||||
__u);
|
||||
}
|
||||
#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
|
||||
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
|
||||
(__v32hi)(__m512i)(b), (int)(p), \
|
||||
(__mmask32)(m)); })
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
|
||||
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
|
||||
(__v32hi)(__m512i)(b), (int)(p), \
|
||||
(__mmask32)-1); })
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
|
||||
__u);
|
||||
}
|
||||
#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
|
||||
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
|
||||
(__v32hi)(__m512i)(b), (int)(p), \
|
||||
(__mmask32)(m)); })
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
#define _mm512_cmpeq_epi8_mask(A, B) \
|
||||
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
|
||||
#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
|
||||
#define _mm512_cmpge_epi8_mask(A, B) \
|
||||
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
|
||||
#define _mm512_mask_cmpge_epi8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
|
||||
#define _mm512_cmpgt_epi8_mask(A, B) \
|
||||
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
|
||||
#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
|
||||
#define _mm512_cmple_epi8_mask(A, B) \
|
||||
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
|
||||
#define _mm512_mask_cmple_epi8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
|
||||
#define _mm512_cmplt_epi8_mask(A, B) \
|
||||
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
|
||||
#define _mm512_mask_cmplt_epi8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
|
||||
#define _mm512_cmpneq_epi8_mask(A, B) \
|
||||
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
|
||||
#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
|
||||
__u);
|
||||
}
|
||||
#define _mm512_cmpeq_epu8_mask(A, B) \
|
||||
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
|
||||
#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
|
||||
#define _mm512_cmpge_epu8_mask(A, B) \
|
||||
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
|
||||
#define _mm512_mask_cmpge_epu8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
|
||||
#define _mm512_cmpgt_epu8_mask(A, B) \
|
||||
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
|
||||
#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
|
||||
#define _mm512_cmple_epu8_mask(A, B) \
|
||||
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
|
||||
#define _mm512_mask_cmple_epu8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
|
||||
#define _mm512_cmplt_epu8_mask(A, B) \
|
||||
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
|
||||
#define _mm512_mask_cmplt_epu8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
|
||||
#define _mm512_cmpneq_epu8_mask(A, B) \
|
||||
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
|
||||
#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
#define _mm512_cmpeq_epi16_mask(A, B) \
|
||||
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
|
||||
#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
|
||||
#define _mm512_cmpge_epi16_mask(A, B) \
|
||||
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
|
||||
#define _mm512_mask_cmpge_epi16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
|
||||
#define _mm512_cmpgt_epi16_mask(A, B) \
|
||||
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
|
||||
#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
|
||||
#define _mm512_cmple_epi16_mask(A, B) \
|
||||
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
|
||||
#define _mm512_mask_cmple_epi16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
|
||||
#define _mm512_cmplt_epi16_mask(A, B) \
|
||||
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
|
||||
#define _mm512_mask_cmplt_epi16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
|
||||
#define _mm512_cmpneq_epi16_mask(A, B) \
|
||||
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
|
||||
#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
|
||||
(__mmask64)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
|
||||
__u);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
|
||||
(__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
|
||||
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
|
||||
__u);
|
||||
}
|
||||
#define _mm512_cmpeq_epu16_mask(A, B) \
|
||||
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
|
||||
#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
|
||||
#define _mm512_cmpge_epu16_mask(A, B) \
|
||||
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
|
||||
#define _mm512_mask_cmpge_epu16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
|
||||
#define _mm512_cmpgt_epu16_mask(A, B) \
|
||||
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
|
||||
#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
|
||||
#define _mm512_cmple_epu16_mask(A, B) \
|
||||
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
|
||||
#define _mm512_mask_cmple_epu16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
|
||||
#define _mm512_cmplt_epu16_mask(A, B) \
|
||||
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
|
||||
#define _mm512_mask_cmplt_epu16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
|
||||
#define _mm512_cmpneq_epu16_mask(A, B) \
|
||||
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
|
||||
#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \
|
||||
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_add_epi8 (__m512i __A, __m512i __B) {
|
||||
|
@ -1541,46 +1393,6 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
|
|||
}
|
||||
|
||||
|
||||
#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
|
||||
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
|
||||
(__v64qi)(__m512i)(b), (int)(p), \
|
||||
(__mmask64)-1); })
|
||||
|
||||
#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
|
||||
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
|
||||
(__v64qi)(__m512i)(b), (int)(p), \
|
||||
(__mmask64)(m)); })
|
||||
|
||||
#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
|
||||
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
|
||||
(__v64qi)(__m512i)(b), (int)(p), \
|
||||
(__mmask64)-1); })
|
||||
|
||||
#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
|
||||
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
|
||||
(__v64qi)(__m512i)(b), (int)(p), \
|
||||
(__mmask64)(m)); })
|
||||
|
||||
#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
|
||||
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
|
||||
(__v32hi)(__m512i)(b), (int)(p), \
|
||||
(__mmask32)-1); })
|
||||
|
||||
#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
|
||||
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
|
||||
(__v32hi)(__m512i)(b), (int)(p), \
|
||||
(__mmask32)(m)); })
|
||||
|
||||
#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
|
||||
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
|
||||
(__v32hi)(__m512i)(b), (int)(p), \
|
||||
(__mmask32)-1); })
|
||||
|
||||
#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
|
||||
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
|
||||
(__v32hi)(__m512i)(b), (int)(p), \
|
||||
(__mmask32)(m)); })
|
||||
|
||||
#define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \
|
||||
(__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
|
||||
(__v32hi)_mm512_undefined_epi32(), \
|
||||
|
@ -2042,15 +1854,13 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
|
|||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
|
||||
{
|
||||
return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
|
||||
(__mmask64) __B);
|
||||
return (__mmask64) (( __A & 0xFFFFFFFF) | ( __B << 32));
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
|
||||
(__mmask32) __B);
|
||||
return (__mmask32) (( __A & 0xFFFF) | ( __B << 16));
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
|
@ -2105,61 +1915,56 @@ _mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
|
|||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_test_epi8_mask (__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
|
||||
(__v64qi) __B,
|
||||
(__mmask64) -1);
|
||||
return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B),
|
||||
_mm512_setzero_qi());
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
|
||||
(__v64qi) __B, __U);
|
||||
return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
|
||||
_mm512_setzero_qi());
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_test_epi16_mask (__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
(__mmask32) -1);
|
||||
return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B),
|
||||
_mm512_setzero_qi());
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
|
||||
(__v32hi) __B, __U);
|
||||
return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
|
||||
_mm512_setzero_qi());
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
|
||||
(__v64qi) __B,
|
||||
(__mmask64) -1);
|
||||
return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_qi());
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
|
||||
(__v64qi) __B, __U);
|
||||
return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
|
||||
_mm512_setzero_qi());
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
(__mmask32) -1);
|
||||
return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B),
|
||||
_mm512_setzero_qi());
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
|
||||
(__v32hi) __B, __U);
|
||||
return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
|
||||
_mm512_setzero_qi());
|
||||
}
|
||||
|
||||
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||
|
|
|
@ -130,13 +130,14 @@ _mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
|
|||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcastmb_epi64 (__mmask8 __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcastmb512 (__A);
|
||||
return (__m512i) _mm512_set1_epi64((long long) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcastmw_epi32 (__mmask16 __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcastmw512 (__A);
|
||||
return (__m512i) _mm512_set1_epi32((int) __A);
|
||||
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
|
|
@ -8787,7 +8787,7 @@ _mm512_kortestz (__mmask16 __A, __mmask16 __B)
|
|||
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
|
||||
_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
|
||||
{
|
||||
return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
|
||||
return (__mmask16) (( __A & 0xFF) | ( __B << 8));
|
||||
}
|
||||
|
||||
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
|
||||
|
|
|
@ -0,0 +1,391 @@
|
|||
/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512VBMI2INTRIN_H
|
||||
#define __AVX512VBMI2INTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2")))
|
||||
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
|
||||
(__v32hi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
|
||||
(__v32hi) _mm512_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
|
||||
(__v64qi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
|
||||
(__v64qi) _mm512_setzero_qi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
|
||||
{
|
||||
__builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
|
||||
{
|
||||
__builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
|
||||
(__v32hi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
|
||||
(__v32hi) _mm512_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
|
||||
(__v64qi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
|
||||
(__v64qi) _mm512_setzero_qi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
|
||||
(__v32hi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
|
||||
(__v32hi) _mm512_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
|
||||
(__v64qi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
|
||||
(__v64qi) _mm512_setzero_qi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
#define _mm512_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
|
||||
(__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \
|
||||
(__v8di)(B), \
|
||||
(int)(I), \
|
||||
(__v8di)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
|
||||
_mm512_mask_shldi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm512_shldi_epi64(A, B, I) \
|
||||
_mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm512_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
|
||||
(__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \
|
||||
(__v16si)(B), \
|
||||
(int)(I), \
|
||||
(__v16si)(S), \
|
||||
(__mmask16)(U)); })
|
||||
|
||||
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
|
||||
_mm512_mask_shldi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm512_shldi_epi32(A, B, I) \
|
||||
_mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm512_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
|
||||
(__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \
|
||||
(__v32hi)(B), \
|
||||
(int)(I), \
|
||||
(__v32hi)(S), \
|
||||
(__mmask32)(U)); })
|
||||
|
||||
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
|
||||
_mm512_mask_shldi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm512_shldi_epi16(A, B, I) \
|
||||
_mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
|
||||
(__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \
|
||||
(__v8di)(B), \
|
||||
(int)(I), \
|
||||
(__v8di)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
|
||||
_mm512_mask_shrdi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm512_shrdi_epi64(A, B, I) \
|
||||
_mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
|
||||
(__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \
|
||||
(__v16si)(B), \
|
||||
(int)(I), \
|
||||
(__v16si)(S), \
|
||||
(__mmask16)(U)); })
|
||||
|
||||
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
|
||||
_mm512_mask_shrdi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm512_shrdi_epi32(A, B, I) \
|
||||
_mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
|
||||
(__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \
|
||||
(__v32hi)(B), \
|
||||
(int)(I), \
|
||||
(__v32hi)(S), \
|
||||
(__mmask32)(U)); })
|
||||
|
||||
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
|
||||
_mm512_mask_shrdi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm512_shrdi_epi16(A, B, I) \
|
||||
_mm512_mask_shrdi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
|
||||
(__v8di) __A,
|
||||
(__v8di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshldvq512_maskz ((__v8di) __S,
|
||||
(__v8di) __A,
|
||||
(__v8di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_shldv_epi64(__m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
|
||||
(__v8di) __A,
|
||||
(__v8di) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_shldv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshldvd512_maskz ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_shldv_epi32(__m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_shldv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
|
||||
(__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshldvw512_maskz ((__v32hi) __S,
|
||||
(__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_shldv_epi16(__m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
|
||||
(__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
(__mmask32) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_shrdv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
|
||||
(__v8di) __A,
|
||||
(__v8di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshrdvq512_maskz ((__v8di) __S,
|
||||
(__v8di) __A,
|
||||
(__v8di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_shrdv_epi64(__m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
|
||||
(__v8di) __A,
|
||||
(__v8di) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_shrdv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshrdvd512_maskz ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_shrdv_epi32(__m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_shrdv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
|
||||
(__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshrdvw512_maskz ((__v32hi) __S,
|
||||
(__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_shrdv_epi16(__m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
|
||||
(__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
(__mmask32) -1);
|
||||
}
|
||||
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,157 @@
|
|||
/*===------------- avx512vlbitalgintrin.h - BITALG intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512VLBITALGINTRIN_H
|
||||
#define __AVX512VLBITALGINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg")))
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_popcnt_epi16(__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
|
||||
(__v16hi) _mm256_popcnt_epi16(__B),
|
||||
(__v16hi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
|
||||
{
|
||||
return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
|
||||
__U,
|
||||
__B);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_popcnt_epi16(__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
|
||||
(__v8hi) _mm128_popcnt_epi16(__B),
|
||||
(__v8hi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
|
||||
{
|
||||
return _mm128_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
|
||||
__U,
|
||||
__B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_popcnt_epi8(__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
|
||||
(__v32qi) _mm256_popcnt_epi8(__B),
|
||||
(__v32qi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
|
||||
{
|
||||
return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
|
||||
__U,
|
||||
__B);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_popcnt_epi8(__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
|
||||
(__v16qi) _mm128_popcnt_epi8(__B),
|
||||
(__v16qi) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
|
||||
{
|
||||
return _mm128_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
|
||||
__U,
|
||||
__B);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_bitshuffle_epi32_mask(__mmask32 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
|
||||
(__v32qi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
|
||||
_mm256_bitshuffle_epi32_mask(__m256i __A, __m256i __B)
|
||||
{
|
||||
return _mm256_mask_bitshuffle_epi32_mask((__mmask32) -1,
|
||||
__A,
|
||||
__B);
|
||||
}
|
||||
|
||||
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_bitshuffle_epi16_mask(__mmask16 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
|
||||
(__v16qi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
|
||||
_mm128_bitshuffle_epi16_mask(__m128i __A, __m128i __B)
|
||||
{
|
||||
return _mm128_mask_bitshuffle_epi16_mask((__mmask16) -1,
|
||||
__A,
|
||||
__B);
|
||||
}
|
||||
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
|
@ -33,26 +33,26 @@
|
|||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_broadcastmb_epi64 (__mmask8 __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_broadcastmb128 (__A);
|
||||
{
|
||||
return (__m128i) _mm_set1_epi64x((long long) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_broadcastmb_epi64 (__mmask8 __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_broadcastmb256 (__A);
|
||||
return (__m256i) _mm256_set1_epi64x((long long)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_broadcastmw_epi32 (__mmask16 __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_broadcastmw128 (__A);
|
||||
return (__m128i) _mm_set1_epi32((int)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_broadcastmw_epi32 (__mmask16 __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_broadcastmw256 (__A);
|
||||
return (__m256i) _mm256_set1_epi32((int)__A);
|
||||
}
|
||||
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,748 @@
|
|||
/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512VLVBMI2INTRIN_H
|
||||
#define __AVX512VLVBMI2INTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2")))
|
||||
|
||||
static __inline __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_setzero_hi(void) {
|
||||
return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
|
||||
(__v8hi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_compress_epi16(__mmask8 __U, __m128i __D)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
|
||||
(__v8hi) _mm128_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
|
||||
(__v16qi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_compress_epi8(__mmask16 __U, __m128i __D)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
|
||||
(__v16qi) _mm128_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
|
||||
{
|
||||
__builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
|
||||
{
|
||||
__builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
|
||||
(__v8hi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_expand_epi16(__mmask8 __U, __m128i __D)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
|
||||
(__v8hi) _mm128_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
|
||||
(__v16qi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_expand_epi8(__mmask16 __U, __m128i __D)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
|
||||
(__v16qi) _mm128_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
|
||||
(__v8hi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
|
||||
(__v8hi) _mm128_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
|
||||
(__v16qi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
|
||||
(__v16qi) _mm128_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_setzero_hi(void) {
|
||||
return (__m256i)(__v16hi){ 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
|
||||
(__v16hi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
|
||||
(__v16hi) _mm256_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
|
||||
(__v32qi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
|
||||
(__v32qi) _mm256_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
|
||||
{
|
||||
__builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
|
||||
{
|
||||
__builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
|
||||
(__v16hi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
|
||||
(__v16hi) _mm256_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
|
||||
(__v32qi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
|
||||
(__v32qi) _mm256_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
|
||||
(__v16hi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
|
||||
(__v16hi) _mm256_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
|
||||
(__v32qi) __S,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
|
||||
(__v32qi) _mm256_setzero_hi(),
|
||||
__U);
|
||||
}
|
||||
|
||||
#define _mm256_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \
|
||||
(__v4di)(B), \
|
||||
(int)(I), \
|
||||
(__v4di)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm256_maskz_shldi_epi64(U, A, B, I) \
|
||||
_mm256_mask_shldi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm256_shldi_epi64(A, B, I) \
|
||||
_mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm128_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \
|
||||
(__v2di)(B), \
|
||||
(int)(I), \
|
||||
(__v2di)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm128_maskz_shldi_epi64(U, A, B, I) \
|
||||
_mm128_mask_shldi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm128_shldi_epi64(A, B, I) \
|
||||
_mm128_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm256_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \
|
||||
(__v8si)(B), \
|
||||
(int)(I), \
|
||||
(__v8si)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm256_maskz_shldi_epi32(U, A, B, I) \
|
||||
_mm256_mask_shldi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm256_shldi_epi32(A, B, I) \
|
||||
_mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm128_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \
|
||||
(__v4si)(B), \
|
||||
(int)(I), \
|
||||
(__v4si)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm128_maskz_shldi_epi32(U, A, B, I) \
|
||||
_mm128_mask_shldi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm128_shldi_epi32(A, B, I) \
|
||||
_mm128_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm256_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \
|
||||
(__v16hi)(B), \
|
||||
(int)(I), \
|
||||
(__v16hi)(S), \
|
||||
(__mmask16)(U)); })
|
||||
|
||||
#define _mm256_maskz_shldi_epi16(U, A, B, I) \
|
||||
_mm256_mask_shldi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm256_shldi_epi16(A, B, I) \
|
||||
_mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm128_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \
|
||||
(__v8hi)(B), \
|
||||
(int)(I), \
|
||||
(__v8hi)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm128_maskz_shldi_epi16(U, A, B, I) \
|
||||
_mm128_mask_shldi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm128_shldi_epi16(A, B, I) \
|
||||
_mm128_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \
|
||||
(__v4di)(B), \
|
||||
(int)(I), \
|
||||
(__v4di)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
|
||||
_mm256_mask_shrdi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm256_shrdi_epi64(A, B, I) \
|
||||
_mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm128_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \
|
||||
(__v2di)(B), \
|
||||
(int)(I), \
|
||||
(__v2di)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm128_maskz_shrdi_epi64(U, A, B, I) \
|
||||
_mm128_mask_shrdi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm128_shrdi_epi64(A, B, I) \
|
||||
_mm128_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \
|
||||
(__v8si)(B), \
|
||||
(int)(I), \
|
||||
(__v8si)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
|
||||
_mm256_mask_shrdi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm256_shrdi_epi32(A, B, I) \
|
||||
_mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm128_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \
|
||||
(__v4si)(B), \
|
||||
(int)(I), \
|
||||
(__v4si)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm128_maskz_shrdi_epi32(U, A, B, I) \
|
||||
_mm128_mask_shrdi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm128_shrdi_epi32(A, B, I) \
|
||||
_mm128_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \
|
||||
(__v16hi)(B), \
|
||||
(int)(I), \
|
||||
(__v16hi)(S), \
|
||||
(__mmask16)(U)); })
|
||||
|
||||
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
|
||||
_mm256_mask_shrdi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm256_shrdi_epi16(A, B, I) \
|
||||
_mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
#define _mm128_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \
|
||||
(__v8hi)(B), \
|
||||
(int)(I), \
|
||||
(__v8hi)(S), \
|
||||
(__mmask8)(U)); })
|
||||
|
||||
#define _mm128_maskz_shrdi_epi16(U, A, B, I) \
|
||||
_mm128_mask_shrdi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
|
||||
|
||||
#define _mm128_shrdi_epi16(A, B, I) \
|
||||
_mm128_mask_shrdi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
|
||||
(__v4di) __A,
|
||||
(__v4di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S,
|
||||
(__v4di) __A,
|
||||
(__v4di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
|
||||
(__v4di) __A,
|
||||
(__v4di) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
|
||||
(__v2di) __A,
|
||||
(__v2di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S,
|
||||
(__v2di) __A,
|
||||
(__v2di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
|
||||
(__v2di) __A,
|
||||
(__v2di) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
|
||||
(__v16hi) __A,
|
||||
(__v16hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S,
|
||||
(__v16hi) __A,
|
||||
(__v16hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
|
||||
(__v16hi) __A,
|
||||
(__v16hi) __B,
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
|
||||
(__v8hi) __A,
|
||||
(__v8hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S,
|
||||
(__v8hi) __A,
|
||||
(__v8hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
|
||||
(__v8hi) __A,
|
||||
(__v8hi) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
|
||||
(__v4di) __A,
|
||||
(__v4di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S,
|
||||
(__v4di) __A,
|
||||
(__v4di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
|
||||
(__v4di) __A,
|
||||
(__v4di) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
|
||||
(__v2di) __A,
|
||||
(__v2di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S,
|
||||
(__v2di) __A,
|
||||
(__v2di) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
|
||||
(__v2di) __A,
|
||||
(__v2di) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
|
||||
(__v16hi) __A,
|
||||
(__v16hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S,
|
||||
(__v16hi) __A,
|
||||
(__v16hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
|
||||
(__v16hi) __A,
|
||||
(__v16hi) __B,
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
|
||||
(__v8hi) __A,
|
||||
(__v8hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S,
|
||||
(__v8hi) __A,
|
||||
(__v8hi) __B,
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
|
||||
(__v8hi) __A,
|
||||
(__v8hi) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -0,0 +1,254 @@
|
|||
/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512VLVNNIINTRIN_H
|
||||
#define __AVX512VLVNNIINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni")))
|
||||
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpbusd256_maskz ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpbusds256_maskz ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpwssd256_maskz ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpwssds256_maskz ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
|
||||
(__v8si) __A,
|
||||
(__v8si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpbusd128_maskz ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpbusds128_maskz ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpwssd128_maskz ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpwssds128_maskz ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm128_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
|
||||
(__v4si) __A,
|
||||
(__v4si) __B,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -0,0 +1,146 @@
|
|||
/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512VNNIINTRIN_H
|
||||
#define __AVX512VNNIINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni")))
|
||||
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpbusd512_maskz ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpbusds512_maskz ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpwssd512_maskz ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpwssds512_maskz ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
|
||||
(__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -0,0 +1,99 @@
|
|||
/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics
|
||||
*------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error \
|
||||
"Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512VPOPCNTDQVLINTRIN_H
|
||||
#define __AVX512VPOPCNTDQVLINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl")))
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi64(__m128i __A) {
|
||||
return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
|
||||
return (__m128i)__builtin_ia32_selectq_128(
|
||||
(__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
|
||||
return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi32(__m128i __A) {
|
||||
return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
|
||||
return (__m128i)__builtin_ia32_selectd_128(
|
||||
(__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
|
||||
return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi64(__m256i __A) {
|
||||
return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
|
||||
return (__m256i)__builtin_ia32_selectq_256(
|
||||
(__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
|
||||
return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi32(__m256i __A) {
|
||||
return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
|
||||
return (__m256i)__builtin_ia32_selectd_256(
|
||||
(__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
|
||||
return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -0,0 +1,93 @@
|
|||
/*===---- cetintrin.h - CET intrinsic ------------------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __CETINTRIN_H
|
||||
#define __CETINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("shstk")))
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
|
||||
__builtin_ia32_incsspd(__a);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
|
||||
__builtin_ia32_incsspq(__a);
|
||||
}
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
|
||||
return __builtin_ia32_rdsspd(__a);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
|
||||
return __builtin_ia32_rdsspq(__a);
|
||||
}
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
|
||||
__builtin_ia32_saveprevssp();
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
|
||||
__builtin_ia32_rstorssp(__p);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
|
||||
__builtin_ia32_wrssd(__a, __p);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
|
||||
__builtin_ia32_wrssq(__a, __p);
|
||||
}
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
|
||||
__builtin_ia32_wrussd(__a, __p);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
|
||||
__builtin_ia32_wrussq(__a, __p);
|
||||
}
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() {
|
||||
__builtin_ia32_setssbsy();
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
|
||||
__builtin_ia32_clrssbsy(__p);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __CETINTRIN_H */
|
|
@ -173,16 +173,24 @@
|
|||
#define bit_AVX512VL 0x80000000
|
||||
|
||||
/* Features in %ecx for leaf 7 sub-leaf 0 */
|
||||
#define bit_PREFTCHWT1 0x00000001
|
||||
#define bit_AVX512VBMI 0x00000002
|
||||
#define bit_PKU 0x00000004
|
||||
#define bit_OSPKE 0x00000010
|
||||
#define bit_PREFTCHWT1 0x00000001
|
||||
#define bit_AVX512VBMI 0x00000002
|
||||
#define bit_PKU 0x00000004
|
||||
#define bit_OSPKE 0x00000010
|
||||
#define bit_AVX512VBMI2 0x00000040
|
||||
#define bit_SHSTK 0x00000080
|
||||
#define bit_GFNI 0x00000100
|
||||
#define bit_VAES 0x00000200
|
||||
#define bit_VPCLMULQDQ 0x00000400
|
||||
#define bit_AVX512VNNI 0x00000800
|
||||
#define bit_AVX512BITALG 0x00001000
|
||||
#define bit_AVX512VPOPCNTDQ 0x00004000
|
||||
#define bit_RDPID 0x00400000
|
||||
#define bit_RDPID 0x00400000
|
||||
|
||||
/* Features in %edx for leaf 7 sub-leaf 0 */
|
||||
#define bit_AVX5124VNNIW 0x00000004
|
||||
#define bit_AVX5124FMAPS 0x00000008
|
||||
#define bit_IBT 0x00100000
|
||||
|
||||
/* Features in %eax for leaf 13 sub-leaf 1 */
|
||||
#define bit_XSAVEOPT 0x00000001
|
||||
|
@ -192,6 +200,7 @@
|
|||
/* Features in %ecx for leaf 0x80000001 */
|
||||
#define bit_LAHF_LM 0x00000001
|
||||
#define bit_ABM 0x00000020
|
||||
#define bit_LZCNT bit_ABM /* for gcc compat */
|
||||
#define bit_SSE4a 0x00000040
|
||||
#define bit_PRFCHW 0x00000100
|
||||
#define bit_XOP 0x00000800
|
||||
|
|
|
@ -80,7 +80,7 @@ min(const __T &__a, const __T &__b, __Cmp __cmp) {
|
|||
template <class __T>
|
||||
inline __device__ const __T &
|
||||
min(const __T &__a, const __T &__b) {
|
||||
return __a < __b ? __b : __a;
|
||||
return __a < __b ? __a : __b;
|
||||
}
|
||||
|
||||
#ifdef _LIBCPP_END_NAMESPACE_STD
|
||||
|
|
|
@ -217,8 +217,8 @@ _mm_div_pd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Calculates the square root of the lower double-precision value of
|
||||
/// the second operand and returns it in the lower 64 bits of the result.
|
||||
/// The upper 64 bits of the result are copied from the upper double-
|
||||
/// precision value of the first operand.
|
||||
/// The upper 64 bits of the result are copied from the upper
|
||||
/// double-precision value of the first operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -260,8 +260,8 @@ _mm_sqrt_pd(__m128d __a)
|
|||
|
||||
/// \brief Compares lower 64-bit double-precision values of both operands, and
|
||||
/// returns the lesser of the pair of values in the lower 64-bits of the
|
||||
/// result. The upper 64 bits of the result are copied from the upper double-
|
||||
/// precision value of the first operand.
|
||||
/// result. The upper 64 bits of the result are copied from the upper
|
||||
/// double-precision value of the first operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -304,8 +304,8 @@ _mm_min_pd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Compares lower 64-bit double-precision values of both operands, and
|
||||
/// returns the greater of the pair of values in the lower 64-bits of the
|
||||
/// result. The upper 64 bits of the result are copied from the upper double-
|
||||
/// precision value of the first operand.
|
||||
/// result. The upper 64 bits of the result are copied from the upper
|
||||
/// double-precision value of the first operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -983,8 +983,10 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b)
|
|||
}
|
||||
|
||||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] for equality. The
|
||||
/// comparison yields 0 for false, 1 for true.
|
||||
/// the two 128-bit floating-point vectors of [2 x double] for equality.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -996,7 +998,8 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b)
|
|||
/// \param __b
|
||||
/// A 128-bit vector of [2 x double]. The lower double-precision value is
|
||||
/// compared to the lower double-precision value of \a __a.
|
||||
/// \returns An integer containing the comparison results.
|
||||
/// \returns An integer containing the comparison results. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm_comieq_sd(__m128d __a, __m128d __b)
|
||||
{
|
||||
|
@ -1008,7 +1011,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b)
|
|||
/// the value in the first parameter is less than the corresponding value in
|
||||
/// the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true.
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1020,7 +1024,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b)
|
|||
/// \param __b
|
||||
/// A 128-bit vector of [2 x double]. The lower double-precision value is
|
||||
/// compared to the lower double-precision value of \a __a.
|
||||
/// \returns An integer containing the comparison results.
|
||||
/// \returns An integer containing the comparison results. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm_comilt_sd(__m128d __a, __m128d __b)
|
||||
{
|
||||
|
@ -1032,7 +1037,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b)
|
|||
/// the value in the first parameter is less than or equal to the
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true.
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1044,7 +1050,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b)
|
|||
/// \param __b
|
||||
/// A 128-bit vector of [2 x double]. The lower double-precision value is
|
||||
/// compared to the lower double-precision value of \a __a.
|
||||
/// \returns An integer containing the comparison results.
|
||||
/// \returns An integer containing the comparison results. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm_comile_sd(__m128d __a, __m128d __b)
|
||||
{
|
||||
|
@ -1056,7 +1063,8 @@ _mm_comile_sd(__m128d __a, __m128d __b)
|
|||
/// the value in the first parameter is greater than the corresponding value
|
||||
/// in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true.
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1068,7 +1076,8 @@ _mm_comile_sd(__m128d __a, __m128d __b)
|
|||
/// \param __b
|
||||
/// A 128-bit vector of [2 x double]. The lower double-precision value is
|
||||
/// compared to the lower double-precision value of \a __a.
|
||||
/// \returns An integer containing the comparison results.
|
||||
/// \returns An integer containing the comparison results. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm_comigt_sd(__m128d __a, __m128d __b)
|
||||
{
|
||||
|
@ -1080,7 +1089,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b)
|
|||
/// the value in the first parameter is greater than or equal to the
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true.
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1092,7 +1102,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b)
|
|||
/// \param __b
|
||||
/// A 128-bit vector of [2 x double]. The lower double-precision value is
|
||||
/// compared to the lower double-precision value of \a __a.
|
||||
/// \returns An integer containing the comparison results.
|
||||
/// \returns An integer containing the comparison results. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm_comige_sd(__m128d __a, __m128d __b)
|
||||
{
|
||||
|
@ -1104,7 +1115,8 @@ _mm_comige_sd(__m128d __a, __m128d __b)
|
|||
/// the value in the first parameter is unequal to the corresponding value in
|
||||
/// the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true.
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two
|
||||
/// lower double-precision values is NaN, 1 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1116,7 +1128,8 @@ _mm_comige_sd(__m128d __a, __m128d __b)
|
|||
/// \param __b
|
||||
/// A 128-bit vector of [2 x double]. The lower double-precision value is
|
||||
/// compared to the lower double-precision value of \a __a.
|
||||
/// \returns An integer containing the comparison results.
|
||||
/// \returns An integer containing the comparison results. If either of the two
|
||||
/// lower double-precision values is NaN, 1 is returned.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm_comineq_sd(__m128d __a, __m128d __b)
|
||||
{
|
||||
|
@ -1127,7 +1140,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b)
|
|||
/// the two 128-bit floating-point vectors of [2 x double] for equality. The
|
||||
/// comparison yields 0 for false, 1 for true.
|
||||
///
|
||||
/// If either of the two lower double-precision values is NaN, 1 is returned.
|
||||
/// If either of the two lower double-precision values is NaN, 0 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1140,7 +1153,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b)
|
|||
/// A 128-bit vector of [2 x double]. The lower double-precision value is
|
||||
/// compared to the lower double-precision value of \a __a.
|
||||
/// \returns An integer containing the comparison results. If either of the two
|
||||
/// lower double-precision values is NaN, 1 is returned.
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm_ucomieq_sd(__m128d __a, __m128d __b)
|
||||
{
|
||||
|
@ -1153,7 +1166,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b)
|
|||
/// the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two lower
|
||||
/// double-precision values is NaN, 1 is returned.
|
||||
/// double-precision values is NaN, 0 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1166,7 +1179,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b)
|
|||
/// A 128-bit vector of [2 x double]. The lower double-precision value is
|
||||
/// compared to the lower double-precision value of \a __a.
|
||||
/// \returns An integer containing the comparison results. If either of the two
|
||||
/// lower double-precision values is NaN, 1 is returned.
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm_ucomilt_sd(__m128d __a, __m128d __b)
|
||||
{
|
||||
|
@ -1179,7 +1192,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b)
|
|||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two lower
|
||||
/// double-precision values is NaN, 1 is returned.
|
||||
/// double-precision values is NaN, 0 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1192,7 +1205,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b)
|
|||
/// A 128-bit vector of [2 x double]. The lower double-precision value is
|
||||
/// compared to the lower double-precision value of \a __a.
|
||||
/// \returns An integer containing the comparison results. If either of the two
|
||||
/// lower double-precision values is NaN, 1 is returned.
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm_ucomile_sd(__m128d __a, __m128d __b)
|
||||
{
|
||||
|
@ -1257,7 +1270,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b)
|
|||
/// the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two lower
|
||||
/// double-precision values is NaN, 0 is returned.
|
||||
/// double-precision values is NaN, 1 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1270,7 +1283,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b)
|
|||
/// A 128-bit vector of [2 x double]. The lower double-precision value is
|
||||
/// compared to the lower double-precision value of \a __a.
|
||||
/// \returns An integer containing the comparison result. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
/// lower double-precision values is NaN, 1 is returned.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm_ucomineq_sd(__m128d __a, __m128d __b)
|
||||
{
|
||||
|
@ -1935,14 +1948,15 @@ _mm_store_pd(double *__dp, __m128d __a)
|
|||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c>VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
|
||||
/// This intrinsic corresponds to the
|
||||
/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
|
||||
///
|
||||
/// \param __dp
|
||||
/// A pointer to a memory location that can store two double-precision
|
||||
/// values.
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
|
||||
/// of the values in \a dp.
|
||||
/// of the values in \a __dp.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_store1_pd(double *__dp, __m128d __a)
|
||||
{
|
||||
|
@ -1950,18 +1964,20 @@ _mm_store1_pd(double *__dp, __m128d __a)
|
|||
_mm_store_pd(__dp, __a);
|
||||
}
|
||||
|
||||
/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory
|
||||
/// location.
|
||||
/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
|
||||
/// the upper and lower 64 bits of a memory location.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
|
||||
/// This intrinsic corresponds to the
|
||||
/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
|
||||
///
|
||||
/// \param __dp
|
||||
/// A pointer to a 128-bit memory location. The address of the memory
|
||||
/// location has to be 16-byte aligned.
|
||||
/// A pointer to a memory location that can store two double-precision
|
||||
/// values.
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [2 x double] containing the values to be stored.
|
||||
/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
|
||||
/// of the values in \a __dp.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_store_pd1(double *__dp, __m128d __a)
|
||||
{
|
||||
|
@ -3846,8 +3862,7 @@ _mm_set1_epi8(char __b)
|
|||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
|
||||
/// instruction.
|
||||
/// This intrinsic does not correspond to a specific instruction.
|
||||
///
|
||||
/// \param __q0
|
||||
/// A 64-bit integral value used to initialize the lower 64 bits of the
|
||||
|
@ -4018,7 +4033,7 @@ _mm_storeu_si128(__m128i *__p, __m128i __b)
|
|||
/// specified unaligned memory location. When a mask bit is 1, the
|
||||
/// corresponding byte is written, otherwise it is not written.
|
||||
///
|
||||
/// To minimize caching, the date is flagged as non-temporal (unlikely to be
|
||||
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
|
||||
/// used again soon). Exception and trap behavior for elements not selected
|
||||
/// for storage to memory are implementation dependent.
|
||||
///
|
||||
|
@ -4532,8 +4547,8 @@ _mm_unpackhi_epi32(__m128i __a, __m128i __b)
|
|||
return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
|
||||
}
|
||||
|
||||
/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
|
||||
/// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
|
||||
/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of
|
||||
/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -4665,7 +4680,7 @@ _mm_unpacklo_epi64(__m128i __a, __m128i __b)
|
|||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic has no corresponding instruction.
|
||||
/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit integer vector operand. The lower 64 bits are moved to the
|
||||
|
@ -4682,7 +4697,7 @@ _mm_movepi64_pi64(__m128i __a)
|
|||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction.
|
||||
/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit value.
|
||||
|
@ -4712,8 +4727,8 @@ _mm_move_epi64(__m128i __a)
|
|||
return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
|
||||
}
|
||||
|
||||
/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
|
||||
/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
|
||||
/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of
|
||||
/// [2 x double] and interleaves them into a 128-bit vector of [2 x
|
||||
/// double].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
@ -4733,7 +4748,7 @@ _mm_unpackhi_pd(__m128d __a, __m128d __b)
|
|||
return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
|
||||
}
|
||||
|
||||
/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
|
||||
/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors
|
||||
/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
|
||||
/// double].
|
||||
///
|
||||
|
@ -4792,9 +4807,9 @@ _mm_movemask_pd(__m128d __a)
|
|||
/// A 128-bit vector of [2 x double].
|
||||
/// \param i
|
||||
/// An 8-bit immediate value. The least significant two bits specify which
|
||||
/// elements to copy from a and b: \n
|
||||
/// Bit[0] = 0: lower element of a copied to lower element of result. \n
|
||||
/// Bit[0] = 1: upper element of a copied to lower element of result. \n
|
||||
/// elements to copy from \a a and \a b: \n
|
||||
/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
|
||||
/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
|
||||
/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
|
||||
/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
|
||||
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
|
||||
|
|
|
@ -60,73 +60,73 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
|
|||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
|
@ -144,13 +144,13 @@ _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
|
|||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
|
@ -168,37 +168,37 @@ _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
|
|||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
|
@ -216,13 +216,13 @@ _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
|
|||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
|
|
@ -46,85 +46,85 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
|
|||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
|
@ -142,13 +142,13 @@ _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
|
|||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
|
@ -166,37 +166,37 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
|
|||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
|
@ -214,13 +214,13 @@ _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
|
|||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
|
|
@ -0,0 +1,202 @@
|
|||
/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __GFNIINTRIN_H
|
||||
#define __GFNIINTRIN_H
|
||||
|
||||
|
||||
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), \
|
||||
(char)(I)); })
|
||||
|
||||
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
|
||||
(__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
|
||||
(__v16qi)(__m128i)(S)); })
|
||||
|
||||
|
||||
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
|
||||
(__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
|
||||
U, A, B, I); })
|
||||
|
||||
|
||||
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
|
||||
(__v32qi)(__m256i)(B), \
|
||||
(char)(I)); })
|
||||
|
||||
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
|
||||
(__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
|
||||
(__v32qi)(__m256i)(S)); })
|
||||
|
||||
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
|
||||
(__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
|
||||
U, A, B, I); })
|
||||
|
||||
|
||||
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
|
||||
(__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
|
||||
(__v64qi)(__m512i)(B), \
|
||||
(char)(I)); })
|
||||
|
||||
#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
|
||||
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
|
||||
(__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
|
||||
(__v64qi)(__m512i)(S)); })
|
||||
|
||||
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
|
||||
(__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_qi(), \
|
||||
U, A, B, I); })
|
||||
|
||||
#define _mm_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), \
|
||||
(char)(I)); })
|
||||
|
||||
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
|
||||
(__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
|
||||
(__v16qi)(__m128i)(S)); })
|
||||
|
||||
|
||||
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
|
||||
(__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \
|
||||
U, A, B, I); })
|
||||
|
||||
|
||||
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
|
||||
(__v32qi)(__m256i)(B), \
|
||||
(char)(I)); })
|
||||
|
||||
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
|
||||
(__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
|
||||
(__v32qi)(__m256i)(S)); })
|
||||
|
||||
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
|
||||
(__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
|
||||
U, A, B, I); })
|
||||
|
||||
|
||||
#define _mm512_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
|
||||
(__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
|
||||
(__v64qi)(__m512i)(B), \
|
||||
(char)(I)); })
|
||||
|
||||
#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
|
||||
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
|
||||
(__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \
|
||||
(__v64qi)(__m512i)(S)); })
|
||||
|
||||
#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
|
||||
(__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_qi(), \
|
||||
U, A, B, I); })
|
||||
|
||||
/* Default attributes for simple form (no masking). */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni")))
|
||||
|
||||
/* Default attributes for ZMM forms. */
|
||||
#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni")))
|
||||
|
||||
/* Default attributes for VLX forms. */
|
||||
#define __DEFAULT_FN_ATTRS_VL __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni")))
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
|
||||
(__v16qi) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
|
||||
_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_selectb_128(__U,
|
||||
(__v16qi) _mm_gf2p8mul_epi8(__A, __B),
|
||||
(__v16qi) __S);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
|
||||
_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
|
||||
{
|
||||
return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
|
||||
__U, __A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
|
||||
(__v32qi) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
|
||||
_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_selectb_256(__U,
|
||||
(__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
|
||||
(__v32qi) __S);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
|
||||
_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
|
||||
{
|
||||
return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
|
||||
__U, __A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
|
||||
(__v64qi) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_selectb_512(__U,
|
||||
(__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
|
||||
(__v64qi) __S);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
|
||||
{
|
||||
return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_qi(),
|
||||
__U, __A, __B);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
#undef __DEFAULT_FN_ATTRS_F
|
||||
#undef __DEFAULT_FN_ATTRS_VL
|
||||
|
||||
#endif // __GFNIINTRIN_H
|
||||
|
|
@ -118,6 +118,10 @@ _mm256_cvtph_ps(__m128i __a)
|
|||
}
|
||||
#endif /* __AVX2__ */
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__)
|
||||
#include <vpclmulqdqintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
|
||||
#include <bmiintrin.h>
|
||||
#endif
|
||||
|
@ -146,6 +150,10 @@ _mm256_cvtph_ps(__m128i __a)
|
|||
#include <avx512bwintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BITALG__)
|
||||
#include <avx512bitalgintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
|
||||
#include <avx512cdintrin.h>
|
||||
#endif
|
||||
|
@ -154,10 +162,29 @@ _mm256_cvtph_ps(__m128i __a)
|
|||
#include <avx512vpopcntdqintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
|
||||
#include <avx512vpopcntdqvlintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VNNI__)
|
||||
#include <avx512vnniintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512VNNI__))
|
||||
#include <avx512vlvnniintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
|
||||
#include <avx512dqintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512BITALG__))
|
||||
#include <avx512vlbitalgintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512BW__))
|
||||
#include <avx512vlbwintrin.h>
|
||||
|
@ -195,6 +222,15 @@ _mm256_cvtph_ps(__m128i __a)
|
|||
#include <avx512vbmivlintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI2__)
|
||||
#include <avx512vbmi2intrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AVX512VBMI2__) && defined(__AVX512VL__))
|
||||
#include <avx512vlvbmi2intrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
|
||||
#include <avx512pfintrin.h>
|
||||
#endif
|
||||
|
@ -203,6 +239,14 @@ _mm256_cvtph_ps(__m128i __a)
|
|||
#include <pkuintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VAES__)
|
||||
#include <vaesintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__GFNI__)
|
||||
#include <gfniintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
|
||||
_rdrand16_step(unsigned short *__p)
|
||||
|
@ -319,6 +363,10 @@ _writegsbase_u64(unsigned long long __V)
|
|||
#include <xsavesintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHSTK__)
|
||||
#include <cetintrin.h>
|
||||
#endif
|
||||
|
||||
/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
|
||||
* whereas others are also available at all times. */
|
||||
#include <adxintrin.h>
|
||||
|
|
|
@ -15886,6 +15886,313 @@ double __ovld __conv sub_group_scan_inclusive_max(double x);
|
|||
|
||||
#endif //cl_khr_subgroups cl_intel_subgroups
|
||||
|
||||
#if defined(cl_intel_subgroups)
|
||||
// Intel-Specific Sub Group Functions
|
||||
float __ovld __conv intel_sub_group_shuffle( float x, uint c );
|
||||
float2 __ovld __conv intel_sub_group_shuffle( float2 x, uint c );
|
||||
float3 __ovld __conv intel_sub_group_shuffle( float3 x, uint c );
|
||||
float4 __ovld __conv intel_sub_group_shuffle( float4 x, uint c );
|
||||
float8 __ovld __conv intel_sub_group_shuffle( float8 x, uint c );
|
||||
float16 __ovld __conv intel_sub_group_shuffle( float16 x, uint c );
|
||||
|
||||
int __ovld __conv intel_sub_group_shuffle( int x, uint c );
|
||||
int2 __ovld __conv intel_sub_group_shuffle( int2 x, uint c );
|
||||
int3 __ovld __conv intel_sub_group_shuffle( int3 x, uint c );
|
||||
int4 __ovld __conv intel_sub_group_shuffle( int4 x, uint c );
|
||||
int8 __ovld __conv intel_sub_group_shuffle( int8 x, uint c );
|
||||
int16 __ovld __conv intel_sub_group_shuffle( int16 x, uint c );
|
||||
|
||||
uint __ovld __conv intel_sub_group_shuffle( uint x, uint c );
|
||||
uint2 __ovld __conv intel_sub_group_shuffle( uint2 x, uint c );
|
||||
uint3 __ovld __conv intel_sub_group_shuffle( uint3 x, uint c );
|
||||
uint4 __ovld __conv intel_sub_group_shuffle( uint4 x, uint c );
|
||||
uint8 __ovld __conv intel_sub_group_shuffle( uint8 x, uint c );
|
||||
uint16 __ovld __conv intel_sub_group_shuffle( uint16 x, uint c );
|
||||
|
||||
long __ovld __conv intel_sub_group_shuffle( long x, uint c );
|
||||
ulong __ovld __conv intel_sub_group_shuffle( ulong x, uint c );
|
||||
|
||||
float __ovld __conv intel_sub_group_shuffle_down( float cur, float next, uint c );
|
||||
float2 __ovld __conv intel_sub_group_shuffle_down( float2 cur, float2 next, uint c );
|
||||
float3 __ovld __conv intel_sub_group_shuffle_down( float3 cur, float3 next, uint c );
|
||||
float4 __ovld __conv intel_sub_group_shuffle_down( float4 cur, float4 next, uint c );
|
||||
float8 __ovld __conv intel_sub_group_shuffle_down( float8 cur, float8 next, uint c );
|
||||
float16 __ovld __conv intel_sub_group_shuffle_down( float16 cur, float16 next, uint c );
|
||||
|
||||
int __ovld __conv intel_sub_group_shuffle_down( int cur, int next, uint c );
|
||||
int2 __ovld __conv intel_sub_group_shuffle_down( int2 cur, int2 next, uint c );
|
||||
int3 __ovld __conv intel_sub_group_shuffle_down( int3 cur, int3 next, uint c );
|
||||
int4 __ovld __conv intel_sub_group_shuffle_down( int4 cur, int4 next, uint c );
|
||||
int8 __ovld __conv intel_sub_group_shuffle_down( int8 cur, int8 next, uint c );
|
||||
int16 __ovld __conv intel_sub_group_shuffle_down( int16 cur, int16 next, uint c );
|
||||
|
||||
uint __ovld __conv intel_sub_group_shuffle_down( uint cur, uint next, uint c );
|
||||
uint2 __ovld __conv intel_sub_group_shuffle_down( uint2 cur, uint2 next, uint c );
|
||||
uint3 __ovld __conv intel_sub_group_shuffle_down( uint3 cur, uint3 next, uint c );
|
||||
uint4 __ovld __conv intel_sub_group_shuffle_down( uint4 cur, uint4 next, uint c );
|
||||
uint8 __ovld __conv intel_sub_group_shuffle_down( uint8 cur, uint8 next, uint c );
|
||||
uint16 __ovld __conv intel_sub_group_shuffle_down( uint16 cur, uint16 next, uint c );
|
||||
|
||||
long __ovld __conv intel_sub_group_shuffle_down( long prev, long cur, uint c );
|
||||
ulong __ovld __conv intel_sub_group_shuffle_down( ulong prev, ulong cur, uint c );
|
||||
|
||||
float __ovld __conv intel_sub_group_shuffle_up( float prev, float cur, uint c );
|
||||
float2 __ovld __conv intel_sub_group_shuffle_up( float2 prev, float2 cur, uint c );
|
||||
float3 __ovld __conv intel_sub_group_shuffle_up( float3 prev, float3 cur, uint c );
|
||||
float4 __ovld __conv intel_sub_group_shuffle_up( float4 prev, float4 cur, uint c );
|
||||
float8 __ovld __conv intel_sub_group_shuffle_up( float8 prev, float8 cur, uint c );
|
||||
float16 __ovld __conv intel_sub_group_shuffle_up( float16 prev, float16 cur, uint c );
|
||||
|
||||
int __ovld __conv intel_sub_group_shuffle_up( int prev, int cur, uint c );
|
||||
int2 __ovld __conv intel_sub_group_shuffle_up( int2 prev, int2 cur, uint c );
|
||||
int3 __ovld __conv intel_sub_group_shuffle_up( int3 prev, int3 cur, uint c );
|
||||
int4 __ovld __conv intel_sub_group_shuffle_up( int4 prev, int4 cur, uint c );
|
||||
int8 __ovld __conv intel_sub_group_shuffle_up( int8 prev, int8 cur, uint c );
|
||||
int16 __ovld __conv intel_sub_group_shuffle_up( int16 prev, int16 cur, uint c );
|
||||
|
||||
uint __ovld __conv intel_sub_group_shuffle_up( uint prev, uint cur, uint c );
|
||||
uint2 __ovld __conv intel_sub_group_shuffle_up( uint2 prev, uint2 cur, uint c );
|
||||
uint3 __ovld __conv intel_sub_group_shuffle_up( uint3 prev, uint3 cur, uint c );
|
||||
uint4 __ovld __conv intel_sub_group_shuffle_up( uint4 prev, uint4 cur, uint c );
|
||||
uint8 __ovld __conv intel_sub_group_shuffle_up( uint8 prev, uint8 cur, uint c );
|
||||
uint16 __ovld __conv intel_sub_group_shuffle_up( uint16 prev, uint16 cur, uint c );
|
||||
|
||||
long __ovld __conv intel_sub_group_shuffle_up( long prev, long cur, uint c );
|
||||
ulong __ovld __conv intel_sub_group_shuffle_up( ulong prev, ulong cur, uint c );
|
||||
|
||||
float __ovld __conv intel_sub_group_shuffle_xor( float x, uint c );
|
||||
float2 __ovld __conv intel_sub_group_shuffle_xor( float2 x, uint c );
|
||||
float3 __ovld __conv intel_sub_group_shuffle_xor( float3 x, uint c );
|
||||
float4 __ovld __conv intel_sub_group_shuffle_xor( float4 x, uint c );
|
||||
float8 __ovld __conv intel_sub_group_shuffle_xor( float8 x, uint c );
|
||||
float16 __ovld __conv intel_sub_group_shuffle_xor( float16 x, uint c );
|
||||
|
||||
int __ovld __conv intel_sub_group_shuffle_xor( int x, uint c );
|
||||
int2 __ovld __conv intel_sub_group_shuffle_xor( int2 x, uint c );
|
||||
int3 __ovld __conv intel_sub_group_shuffle_xor( int3 x, uint c );
|
||||
int4 __ovld __conv intel_sub_group_shuffle_xor( int4 x, uint c );
|
||||
int8 __ovld __conv intel_sub_group_shuffle_xor( int8 x, uint c );
|
||||
int16 __ovld __conv intel_sub_group_shuffle_xor( int16 x, uint c );
|
||||
|
||||
uint __ovld __conv intel_sub_group_shuffle_xor( uint x, uint c );
|
||||
uint2 __ovld __conv intel_sub_group_shuffle_xor( uint2 x, uint c );
|
||||
uint3 __ovld __conv intel_sub_group_shuffle_xor( uint3 x, uint c );
|
||||
uint4 __ovld __conv intel_sub_group_shuffle_xor( uint4 x, uint c );
|
||||
uint8 __ovld __conv intel_sub_group_shuffle_xor( uint8 x, uint c );
|
||||
uint16 __ovld __conv intel_sub_group_shuffle_xor( uint16 x, uint c );
|
||||
|
||||
long __ovld __conv intel_sub_group_shuffle_xor( long x, uint c );
|
||||
ulong __ovld __conv intel_sub_group_shuffle_xor( ulong x, uint c );
|
||||
|
||||
uint __ovld __conv intel_sub_group_block_read( read_only image2d_t image, int2 coord );
|
||||
uint2 __ovld __conv intel_sub_group_block_read2( read_only image2d_t image, int2 coord );
|
||||
uint4 __ovld __conv intel_sub_group_block_read4( read_only image2d_t image, int2 coord );
|
||||
uint8 __ovld __conv intel_sub_group_block_read8( read_only image2d_t image, int2 coord );
|
||||
|
||||
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
uint __ovld __conv intel_sub_group_block_read(read_write image2d_t image, int2 coord);
|
||||
uint2 __ovld __conv intel_sub_group_block_read2(read_write image2d_t image, int2 coord);
|
||||
uint4 __ovld __conv intel_sub_group_block_read4(read_write image2d_t image, int2 coord);
|
||||
uint8 __ovld __conv intel_sub_group_block_read8(read_write image2d_t image, int2 coord);
|
||||
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
uint __ovld __conv intel_sub_group_block_read( const __global uint* p );
|
||||
uint2 __ovld __conv intel_sub_group_block_read2( const __global uint* p );
|
||||
uint4 __ovld __conv intel_sub_group_block_read4( const __global uint* p );
|
||||
uint8 __ovld __conv intel_sub_group_block_read8( const __global uint* p );
|
||||
|
||||
void __ovld __conv intel_sub_group_block_write(write_only image2d_t image, int2 coord, uint data);
|
||||
void __ovld __conv intel_sub_group_block_write2(write_only image2d_t image, int2 coord, uint2 data);
|
||||
void __ovld __conv intel_sub_group_block_write4(write_only image2d_t image, int2 coord, uint4 data);
|
||||
void __ovld __conv intel_sub_group_block_write8(write_only image2d_t image, int2 coord, uint8 data);
|
||||
|
||||
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
void __ovld __conv intel_sub_group_block_write(read_write image2d_t image, int2 coord, uint data);
|
||||
void __ovld __conv intel_sub_group_block_write2(read_write image2d_t image, int2 coord, uint2 data);
|
||||
void __ovld __conv intel_sub_group_block_write4(read_write image2d_t image, int2 coord, uint4 data);
|
||||
void __ovld __conv intel_sub_group_block_write8(read_write image2d_t image, int2 coord, uint8 data);
|
||||
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
void __ovld __conv intel_sub_group_block_write( __global uint* p, uint data );
|
||||
void __ovld __conv intel_sub_group_block_write2( __global uint* p, uint2 data );
|
||||
void __ovld __conv intel_sub_group_block_write4( __global uint* p, uint4 data );
|
||||
void __ovld __conv intel_sub_group_block_write8( __global uint* p, uint8 data );
|
||||
|
||||
#ifdef cl_khr_fp16
|
||||
half __ovld __conv intel_sub_group_shuffle( half x, uint c );
|
||||
half __ovld __conv intel_sub_group_shuffle_down( half prev, half cur, uint c );
|
||||
half __ovld __conv intel_sub_group_shuffle_up( half prev, half cur, uint c );
|
||||
half __ovld __conv intel_sub_group_shuffle_xor( half x, uint c );
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_fp64)
|
||||
double __ovld __conv intel_sub_group_shuffle( double x, uint c );
|
||||
double __ovld __conv intel_sub_group_shuffle_down( double prev, double cur, uint c );
|
||||
double __ovld __conv intel_sub_group_shuffle_up( double prev, double cur, uint c );
|
||||
double __ovld __conv intel_sub_group_shuffle_xor( double x, uint c );
|
||||
#endif
|
||||
|
||||
#endif //cl_intel_subgroups
|
||||
|
||||
#if defined(cl_intel_subgroups_short)
|
||||
short __ovld __conv intel_sub_group_broadcast( short x, uint sub_group_local_id );
|
||||
short2 __ovld __conv intel_sub_group_broadcast( short2 x, uint sub_group_local_id );
|
||||
short3 __ovld __conv intel_sub_group_broadcast( short3 x, uint sub_group_local_id );
|
||||
short4 __ovld __conv intel_sub_group_broadcast( short4 x, uint sub_group_local_id );
|
||||
short8 __ovld __conv intel_sub_group_broadcast( short8 x, uint sub_group_local_id );
|
||||
|
||||
ushort __ovld __conv intel_sub_group_broadcast( ushort x, uint sub_group_local_id );
|
||||
ushort2 __ovld __conv intel_sub_group_broadcast( ushort2 x, uint sub_group_local_id );
|
||||
ushort3 __ovld __conv intel_sub_group_broadcast( ushort3 x, uint sub_group_local_id );
|
||||
ushort4 __ovld __conv intel_sub_group_broadcast( ushort4 x, uint sub_group_local_id );
|
||||
ushort8 __ovld __conv intel_sub_group_broadcast( ushort8 x, uint sub_group_local_id );
|
||||
|
||||
short __ovld __conv intel_sub_group_shuffle( short x, uint c );
|
||||
short2 __ovld __conv intel_sub_group_shuffle( short2 x, uint c );
|
||||
short3 __ovld __conv intel_sub_group_shuffle( short3 x, uint c );
|
||||
short4 __ovld __conv intel_sub_group_shuffle( short4 x, uint c );
|
||||
short8 __ovld __conv intel_sub_group_shuffle( short8 x, uint c );
|
||||
short16 __ovld __conv intel_sub_group_shuffle( short16 x, uint c);
|
||||
|
||||
ushort __ovld __conv intel_sub_group_shuffle( ushort x, uint c );
|
||||
ushort2 __ovld __conv intel_sub_group_shuffle( ushort2 x, uint c );
|
||||
ushort3 __ovld __conv intel_sub_group_shuffle( ushort3 x, uint c );
|
||||
ushort4 __ovld __conv intel_sub_group_shuffle( ushort4 x, uint c );
|
||||
ushort8 __ovld __conv intel_sub_group_shuffle( ushort8 x, uint c );
|
||||
ushort16 __ovld __conv intel_sub_group_shuffle( ushort16 x, uint c );
|
||||
|
||||
short __ovld __conv intel_sub_group_shuffle_down( short cur, short next, uint c );
|
||||
short2 __ovld __conv intel_sub_group_shuffle_down( short2 cur, short2 next, uint c );
|
||||
short3 __ovld __conv intel_sub_group_shuffle_down( short3 cur, short3 next, uint c );
|
||||
short4 __ovld __conv intel_sub_group_shuffle_down( short4 cur, short4 next, uint c );
|
||||
short8 __ovld __conv intel_sub_group_shuffle_down( short8 cur, short8 next, uint c );
|
||||
short16 __ovld __conv intel_sub_group_shuffle_down( short16 cur, short16 next, uint c );
|
||||
|
||||
ushort __ovld __conv intel_sub_group_shuffle_down( ushort cur, ushort next, uint c );
|
||||
ushort2 __ovld __conv intel_sub_group_shuffle_down( ushort2 cur, ushort2 next, uint c );
|
||||
ushort3 __ovld __conv intel_sub_group_shuffle_down( ushort3 cur, ushort3 next, uint c );
|
||||
ushort4 __ovld __conv intel_sub_group_shuffle_down( ushort4 cur, ushort4 next, uint c );
|
||||
ushort8 __ovld __conv intel_sub_group_shuffle_down( ushort8 cur, ushort8 next, uint c );
|
||||
ushort16 __ovld __conv intel_sub_group_shuffle_down( ushort16 cur, ushort16 next, uint c );
|
||||
|
||||
short __ovld __conv intel_sub_group_shuffle_up( short cur, short next, uint c );
|
||||
short2 __ovld __conv intel_sub_group_shuffle_up( short2 cur, short2 next, uint c );
|
||||
short3 __ovld __conv intel_sub_group_shuffle_up( short3 cur, short3 next, uint c );
|
||||
short4 __ovld __conv intel_sub_group_shuffle_up( short4 cur, short4 next, uint c );
|
||||
short8 __ovld __conv intel_sub_group_shuffle_up( short8 cur, short8 next, uint c );
|
||||
short16 __ovld __conv intel_sub_group_shuffle_up( short16 cur, short16 next, uint c );
|
||||
|
||||
ushort __ovld __conv intel_sub_group_shuffle_up( ushort cur, ushort next, uint c );
|
||||
ushort2 __ovld __conv intel_sub_group_shuffle_up( ushort2 cur, ushort2 next, uint c );
|
||||
ushort3 __ovld __conv intel_sub_group_shuffle_up( ushort3 cur, ushort3 next, uint c );
|
||||
ushort4 __ovld __conv intel_sub_group_shuffle_up( ushort4 cur, ushort4 next, uint c );
|
||||
ushort8 __ovld __conv intel_sub_group_shuffle_up( ushort8 cur, ushort8 next, uint c );
|
||||
ushort16 __ovld __conv intel_sub_group_shuffle_up( ushort16 cur, ushort16 next, uint c );
|
||||
|
||||
short __ovld __conv intel_sub_group_shuffle_xor( short x, uint c );
|
||||
short2 __ovld __conv intel_sub_group_shuffle_xor( short2 x, uint c );
|
||||
short3 __ovld __conv intel_sub_group_shuffle_xor( short3 x, uint c );
|
||||
short4 __ovld __conv intel_sub_group_shuffle_xor( short4 x, uint c );
|
||||
short8 __ovld __conv intel_sub_group_shuffle_xor( short8 x, uint c );
|
||||
short16 __ovld __conv intel_sub_group_shuffle_xor( short16 x, uint c );
|
||||
|
||||
ushort __ovld __conv intel_sub_group_shuffle_xor( ushort x, uint c );
|
||||
ushort2 __ovld __conv intel_sub_group_shuffle_xor( ushort2 x, uint c );
|
||||
ushort3 __ovld __conv intel_sub_group_shuffle_xor( ushort3 x, uint c );
|
||||
ushort4 __ovld __conv intel_sub_group_shuffle_xor( ushort4 x, uint c );
|
||||
ushort8 __ovld __conv intel_sub_group_shuffle_xor( ushort8 x, uint c );
|
||||
ushort16 __ovld __conv intel_sub_group_shuffle_xor( ushort16 x, uint c );
|
||||
|
||||
short __ovld __conv intel_sub_group_reduce_add( short x );
|
||||
ushort __ovld __conv intel_sub_group_reduce_add( ushort x );
|
||||
short __ovld __conv intel_sub_group_reduce_min( short x );
|
||||
ushort __ovld __conv intel_sub_group_reduce_min( ushort x );
|
||||
short __ovld __conv intel_sub_group_reduce_max( short x );
|
||||
ushort __ovld __conv intel_sub_group_reduce_max( ushort x );
|
||||
|
||||
short __ovld __conv intel_sub_group_scan_exclusive_add( short x );
|
||||
ushort __ovld __conv intel_sub_group_scan_exclusive_add( ushort x );
|
||||
short __ovld __conv intel_sub_group_scan_exclusive_min( short x );
|
||||
ushort __ovld __conv intel_sub_group_scan_exclusive_min( ushort x );
|
||||
short __ovld __conv intel_sub_group_scan_exclusive_max( short x );
|
||||
ushort __ovld __conv intel_sub_group_scan_exclusive_max( ushort x );
|
||||
|
||||
short __ovld __conv intel_sub_group_scan_inclusive_add( short x );
|
||||
ushort __ovld __conv intel_sub_group_scan_inclusive_add( ushort x );
|
||||
short __ovld __conv intel_sub_group_scan_inclusive_min( short x );
|
||||
ushort __ovld __conv intel_sub_group_scan_inclusive_min( ushort x );
|
||||
short __ovld __conv intel_sub_group_scan_inclusive_max( short x );
|
||||
ushort __ovld __conv intel_sub_group_scan_inclusive_max( ushort x );
|
||||
|
||||
uint __ovld __conv intel_sub_group_block_read_ui( read_only image2d_t image, int2 byte_coord );
|
||||
uint2 __ovld __conv intel_sub_group_block_read_ui2( read_only image2d_t image, int2 byte_coord );
|
||||
uint4 __ovld __conv intel_sub_group_block_read_ui4( read_only image2d_t image, int2 byte_coord );
|
||||
uint8 __ovld __conv intel_sub_group_block_read_ui8( read_only image2d_t image, int2 byte_coord );
|
||||
|
||||
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
uint __ovld __conv intel_sub_group_block_read_ui( read_write image2d_t image, int2 byte_coord );
|
||||
uint2 __ovld __conv intel_sub_group_block_read_ui2( read_write image2d_t image, int2 byte_coord );
|
||||
uint4 __ovld __conv intel_sub_group_block_read_ui4( read_write image2d_t image, int2 byte_coord );
|
||||
uint8 __ovld __conv intel_sub_group_block_read_ui8( read_write image2d_t image, int2 byte_coord );
|
||||
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
uint __ovld __conv intel_sub_group_block_read_ui( const __global uint* p );
|
||||
uint2 __ovld __conv intel_sub_group_block_read_ui2( const __global uint* p );
|
||||
uint4 __ovld __conv intel_sub_group_block_read_ui4( const __global uint* p );
|
||||
uint8 __ovld __conv intel_sub_group_block_read_ui8( const __global uint* p );
|
||||
|
||||
void __ovld __conv intel_sub_group_block_write_ui( read_only image2d_t image, int2 byte_coord, uint data );
|
||||
void __ovld __conv intel_sub_group_block_write_ui2( read_only image2d_t image, int2 byte_coord, uint2 data );
|
||||
void __ovld __conv intel_sub_group_block_write_ui4( read_only image2d_t image, int2 byte_coord, uint4 data );
|
||||
void __ovld __conv intel_sub_group_block_write_ui8( read_only image2d_t image, int2 byte_coord, uint8 data );
|
||||
|
||||
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
void __ovld __conv intel_sub_group_block_write_ui( read_write image2d_t image, int2 byte_coord, uint data );
|
||||
void __ovld __conv intel_sub_group_block_write_ui2( read_write image2d_t image, int2 byte_coord, uint2 data );
|
||||
void __ovld __conv intel_sub_group_block_write_ui4( read_write image2d_t image, int2 byte_coord, uint4 data );
|
||||
void __ovld __conv intel_sub_group_block_write_ui8( read_write image2d_t image, int2 byte_coord, uint8 data );
|
||||
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
void __ovld __conv intel_sub_group_block_write_ui( __global uint* p, uint data );
|
||||
void __ovld __conv intel_sub_group_block_write_ui2( __global uint* p, uint2 data );
|
||||
void __ovld __conv intel_sub_group_block_write_ui4( __global uint* p, uint4 data );
|
||||
void __ovld __conv intel_sub_group_block_write_ui8( __global uint* p, uint8 data );
|
||||
|
||||
ushort __ovld __conv intel_sub_group_block_read_us( read_only image2d_t image, int2 coord );
|
||||
ushort2 __ovld __conv intel_sub_group_block_read_us2( read_only image2d_t image, int2 coord );
|
||||
ushort4 __ovld __conv intel_sub_group_block_read_us4( read_only image2d_t image, int2 coord );
|
||||
ushort8 __ovld __conv intel_sub_group_block_read_us8( read_only image2d_t image, int2 coord );
|
||||
|
||||
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
ushort __ovld __conv intel_sub_group_block_read_us(read_write image2d_t image, int2 coord);
|
||||
ushort2 __ovld __conv intel_sub_group_block_read_us2(read_write image2d_t image, int2 coord);
|
||||
ushort4 __ovld __conv intel_sub_group_block_read_us4(read_write image2d_t image, int2 coord);
|
||||
ushort8 __ovld __conv intel_sub_group_block_read_us8(read_write image2d_t image, int2 coord);
|
||||
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
ushort __ovld __conv intel_sub_group_block_read_us( const __global ushort* p );
|
||||
ushort2 __ovld __conv intel_sub_group_block_read_us2( const __global ushort* p );
|
||||
ushort4 __ovld __conv intel_sub_group_block_read_us4( const __global ushort* p );
|
||||
ushort8 __ovld __conv intel_sub_group_block_read_us8( const __global ushort* p );
|
||||
|
||||
void __ovld __conv intel_sub_group_block_write_us(write_only image2d_t image, int2 coord, ushort data);
|
||||
void __ovld __conv intel_sub_group_block_write_us2(write_only image2d_t image, int2 coord, ushort2 data);
|
||||
void __ovld __conv intel_sub_group_block_write_us4(write_only image2d_t image, int2 coord, ushort4 data);
|
||||
void __ovld __conv intel_sub_group_block_write_us8(write_only image2d_t image, int2 coord, ushort8 data);
|
||||
|
||||
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
void __ovld __conv intel_sub_group_block_write_us(read_write image2d_t image, int2 coord, ushort data);
|
||||
void __ovld __conv intel_sub_group_block_write_us2(read_write image2d_t image, int2 coord, ushort2 data);
|
||||
void __ovld __conv intel_sub_group_block_write_us4(read_write image2d_t image, int2 coord, ushort4 data);
|
||||
void __ovld __conv intel_sub_group_block_write_us8(read_write image2d_t image, int2 coord, ushort8 data);
|
||||
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
void __ovld __conv intel_sub_group_block_write_us( __global ushort* p, ushort data );
|
||||
void __ovld __conv intel_sub_group_block_write_us2( __global ushort* p, ushort2 data );
|
||||
void __ovld __conv intel_sub_group_block_write_us4( __global ushort* p, ushort4 data );
|
||||
void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, ushort8 data );
|
||||
#endif // cl_intel_subgroups_short
|
||||
|
||||
#ifdef cl_amd_media_ops
|
||||
uint __ovld amd_bitalign(uint a, uint b, uint c);
|
||||
uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c);
|
||||
|
|
|
@ -115,8 +115,8 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
|
|||
return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
|
||||
}
|
||||
|
||||
/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
|
||||
/// vector of [4 x float] to float values stored in a 128-bit vector of
|
||||
/// \brief Moves and duplicates odd-indexed values from a 128-bit vector
|
||||
/// of [4 x float] to float values stored in a 128-bit vector of
|
||||
/// [4 x float].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
@ -137,7 +137,7 @@ _mm_movehdup_ps(__m128 __a)
|
|||
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
|
||||
}
|
||||
|
||||
/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
|
||||
/// \brief Duplicates even-indexed values from a 128-bit vector of
|
||||
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
|
|
@ -648,7 +648,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
|
|||
/// input vectors are used as an input for dot product; otherwise that input
|
||||
/// is treated as zero. Bits [1:0] determine which elements of the result
|
||||
/// will receive a copy of the final dot product, with bit [0] corresponding
|
||||
/// to the lowest element and bit [3] corresponding to the highest element of
|
||||
/// to the lowest element and bit [1] corresponding to the highest element of
|
||||
/// each [2 x double] vector. If a bit is set, the dot product is returned in
|
||||
/// the corresponding element; otherwise that element is set to zero.
|
||||
#define _mm_dp_pd(X, Y, M) __extension__ ({\
|
||||
|
@ -866,8 +866,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||
/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
|
||||
/// Bits[3:0]: If any of these bits are set, the corresponding result
|
||||
/// element is cleared.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the copied single-
|
||||
/// precision floating point elements from the operands.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the copied
|
||||
/// single-precision floating point elements from the operands.
|
||||
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
|
||||
|
||||
/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
|
||||
|
|
|
@ -32,12 +32,15 @@
|
|||
#define true 1
|
||||
#define false 0
|
||||
#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
/* Define _Bool, bool, false, true as a GNU extension. */
|
||||
/* Define _Bool as a GNU extension. */
|
||||
#define _Bool bool
|
||||
#if __cplusplus < 201103L
|
||||
/* For C++98, define bool, false, true as a GNU extension. */
|
||||
#define bool bool
|
||||
#define false false
|
||||
#define true true
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define __bool_true_false_are_defined 1
|
||||
|
||||
|
|
|
@ -0,0 +1,98 @@
|
|||
/*===------------------ vaesintrin.h - VAES intrinsics ---------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <vaesintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __VAESINTRIN_H
|
||||
#define __VAESINTRIN_H
|
||||
|
||||
/* Default attributes for YMM forms. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes")))
|
||||
|
||||
/* Default attributes for ZMM forms. */
|
||||
#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes")))
|
||||
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_aesenc_epi128(__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_aesenc256((__v4di) __A,
|
||||
(__v4di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_aesenc_epi128(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
|
||||
(__v8di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_aesdec_epi128(__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_aesdec256((__v4di) __A,
|
||||
(__v4di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_aesdec_epi128(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
|
||||
(__v8di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_aesenclast_epi128(__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A,
|
||||
(__v4di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_aesenclast_epi128(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
|
||||
(__v8di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A,
|
||||
(__v4di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
|
||||
(__v8di) __B);
|
||||
}
|
||||
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
#undef __DEFAULT_FN_ATTRS_F
|
||||
|
||||
#endif
|
|
@ -0,0 +1,42 @@
|
|||
/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __VPCLMULQDQINTRIN_H
|
||||
#define __VPCLMULQDQINTRIN_H
|
||||
|
||||
#define _mm256_clmulepi64_epi128(A, B, I) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
|
||||
(__v4di)(__m256i)(B), \
|
||||
(char)(I)); })
|
||||
|
||||
#define _mm512_clmulepi64_epi128(A, B, I) __extension__ ({ \
|
||||
(__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
|
||||
(__v8di)(__m512i)(B), \
|
||||
(char)(I)); })
|
||||
|
||||
#endif // __VPCLMULQDQINTRIN_H
|
||||
|
|
@ -2035,9 +2035,11 @@ _mm_storer_ps(float *__p, __m128 __a)
|
|||
_mm_store_ps(__p, __a);
|
||||
}
|
||||
|
||||
#define _MM_HINT_T0 3
|
||||
#define _MM_HINT_T1 2
|
||||
#define _MM_HINT_T2 1
|
||||
#define _MM_HINT_ET0 7
|
||||
#define _MM_HINT_ET1 6
|
||||
#define _MM_HINT_T0 3
|
||||
#define _MM_HINT_T1 2
|
||||
#define _MM_HINT_T2 1
|
||||
#define _MM_HINT_NTA 0
|
||||
|
||||
#ifndef _MSC_VER
|
||||
|
@ -2068,7 +2070,8 @@ _mm_storer_ps(float *__p, __m128 __a)
|
|||
/// be generated. \n
|
||||
/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
|
||||
/// be generated.
|
||||
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
|
||||
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), \
|
||||
((sel) >> 2) & 1, (sel) & 0x3))
|
||||
#endif
|
||||
|
||||
/// \brief Stores a 64-bit integer in the specified aligned memory location. To
|
||||
|
|
Loading…
Reference in New Issue