update C headers to clang 6.0.0rc3

master
Andrew Kelley 2018-02-23 13:15:16 -05:00
parent 1ba6e1641a
commit 4955c4b8f9
34 changed files with 9573 additions and 4424 deletions

@@ -541,10 +541,12 @@ set(ZIG_C_HEADER_FILES
"adxintrin.h"
"altivec.h"
"ammintrin.h"
"arm64intr.h"
"arm_acle.h"
"arm_neon.h"
"armintr.h"
"avx2intrin.h"
"avx512bitalgintrin.h"
"avx512bwintrin.h"
"avx512cdintrin.h"
"avx512dqintrin.h"
@@ -553,17 +555,25 @@ set(ZIG_C_HEADER_FILES
"avx512ifmaintrin.h"
"avx512ifmavlintrin.h"
"avx512pfintrin.h"
"avx512vbmi2intrin.h"
"avx512vbmiintrin.h"
"avx512vbmivlintrin.h"
"avx512vlbitalgintrin.h"
"avx512vlbwintrin.h"
"avx512vlcdintrin.h"
"avx512vldqintrin.h"
"avx512vlintrin.h"
"avx512vlvbmi2intrin.h"
"avx512vlvnniintrin.h"
"avx512vnniintrin.h"
"avx512vpopcntdqintrin.h"
"avx512vpopcntdqvlintrin.h"
"avxintrin.h"
"bmi2intrin.h"
"bmiintrin.h"
"cetintrin.h"
"clflushoptintrin.h"
"clwbintrin.h"
"clzerointrin.h"
"cpuid.h"
"cuda_wrappers/algorithm"
@@ -575,6 +585,7 @@ set(ZIG_C_HEADER_FILES
"fma4intrin.h"
"fmaintrin.h"
"fxsrintrin.h"
"gfniintrin.h"
"htmintrin.h"
"htmxlintrin.h"
"ia32intrin.h"
@@ -614,8 +625,10 @@ set(ZIG_C_HEADER_FILES
"tmmintrin.h"
"unwind.h"
"vadefs.h"
"vaesintrin.h"
"varargs.h"
"vecintrin.h"
"vpclmulqdqintrin.h"
"wmmintrin.h"
"x86intrin.h"
"xmmintrin.h"

@@ -131,15 +131,6 @@ __DEVICE__ float ldexp(float __arg, int __exp) {
__DEVICE__ float log(float __x) { return ::logf(__x); }
__DEVICE__ float log10(float __x) { return ::log10f(__x); }
__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
__DEVICE__ float nexttoward(float __from, double __to) {
return __builtin_nexttowardf(__from, __to);
}
__DEVICE__ double nexttoward(double __from, double __to) {
return __builtin_nexttoward(__from, __to);
}
__DEVICE__ float nexttowardf(float __from, double __to) {
return __builtin_nexttowardf(__from, __to);
}
__DEVICE__ float pow(float __base, float __exp) {
return ::powf(__base, __exp);
}
@@ -157,6 +148,10 @@ __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
__DEVICE__ float tan(float __x) { return ::tanf(__x); }
__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
// Notably missing above is nexttoward. We omit it because
// libdevice doesn't provide an implementation, and we don't want to be in the
// business of implementing tricky libm functions in this header.
// Now we've defined everything we promised we'd define in
// __clang_cuda_math_forward_declares.h. We need to do two additional things to
// fix up our math functions.
@@ -295,13 +290,6 @@ ldexp(__T __x, int __exp) {
return std::ldexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
nexttoward(__T __from, double __to) {
return std::nexttoward((double)__from, __to);
}
template <typename __T1, typename __T2>
__DEVICE__ typename __clang_cuda_enable_if<
std::numeric_limits<__T1>::is_specialized &&
@@ -388,7 +376,6 @@ using ::lrint;
using ::lround;
using ::nearbyint;
using ::nextafter;
using ::nexttoward;
using ::pow;
using ::remainder;
using ::remquo;
@@ -456,8 +443,6 @@ using ::lroundf;
using ::modff;
using ::nearbyintf;
using ::nextafterf;
using ::nexttowardf;
using ::nexttowardf;
using ::powf;
using ::remainderf;
using ::remquof;

@@ -34,23 +34,24 @@
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \
inline __device__ int __FnName(int __val, int __offset, \
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
__Type) \
inline __device__ int __FnName(int __val, __Type __offset, \
int __width = warpSize) { \
return __IntIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(float __val, int __offset, \
inline __device__ float __FnName(float __val, __Type __offset, \
int __width = warpSize) { \
return __FloatIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __val, int __offset, \
inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(long long __val, int __offset, \
inline __device__ long long __FnName(long long __val, __Type __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
@@ -65,12 +66,29 @@
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \
} \
inline __device__ long __FnName(long __val, __Type __offset, \
int __width = warpSize) { \
_Static_assert(sizeof(long) == sizeof(long long) || \
sizeof(long) == sizeof(int)); \
if (sizeof(long) == sizeof(long long)) { \
return static_cast<long>( \
::__FnName(static_cast<long long>(__val), __offset, __width)); \
} else if (sizeof(long) == sizeof(int)) { \
return static_cast<long>( \
::__FnName(static_cast<int>(__val), __offset, __width)); \
} \
} \
inline __device__ unsigned long __FnName( \
unsigned long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long>( \
::__FnName(static_cast<long>(__val), __offset, __width)); \
} \
inline __device__ unsigned long long __FnName( \
unsigned long long __val, int __offset, int __width = warpSize) { \
unsigned long long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \
static_cast<unsigned long long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(double __val, int __offset, \
inline __device__ double __FnName(double __val, __Type __offset, \
int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
@@ -81,13 +99,15 @@
return __ret; \
}
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
int);
#pragma pop_macro("__MAKE_SHUFFLES")
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
@@ -97,25 +117,26 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
// __shfl_sync_* variants available in CUDA-9
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
__Mask) \
inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \
int __width = warpSize) { \
__Mask, __Type) \
inline __device__ int __FnName(unsigned int __mask, int __val, \
__Type __offset, int __width = warpSize) { \
return __IntIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(unsigned int __mask, float __val, \
int __offset, int __width = warpSize) { \
__Type __offset, int __width = warpSize) { \
return __FloatIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __mask, \
unsigned int __val, int __offset, \
unsigned int __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(unsigned int __mask, long long __val, \
int __offset, int __width = warpSize) { \
__Type __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
@@ -130,13 +151,31 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
return __ret; \
} \
inline __device__ unsigned long long __FnName( \
unsigned int __mask, unsigned long long __val, int __offset, \
unsigned int __mask, unsigned long long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \
__mask, static_cast<unsigned long long>(__val), __offset, __width)); \
} \
inline __device__ long __FnName(unsigned int __mask, long __val, \
__Type __offset, int __width = warpSize) { \
_Static_assert(sizeof(long) == sizeof(long long) || \
sizeof(long) == sizeof(int)); \
if (sizeof(long) == sizeof(long long)) { \
return static_cast<long>(::__FnName( \
__mask, static_cast<long long>(__val), __offset, __width)); \
} else if (sizeof(long) == sizeof(int)) { \
return static_cast<long>( \
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
} \
} \
inline __device__ unsigned long __FnName( \
unsigned int __mask, unsigned long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long>( \
::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(unsigned int __mask, double __val, \
int __offset, int __width = warpSize) { \
__Type __offset, int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
memcpy(&__tmp, &__val, sizeof(__val)); \
@@ -146,15 +185,15 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
return __ret; \
}
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
__nvvm_shfl_sync_idx_f32, 0x1f);
__nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
__nvvm_shfl_sync_up_f32, 0);
__nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
__nvvm_shfl_sync_down_f32, 0x1f);
__nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
__nvvm_shfl_sync_bfly_f32, 0x1f);
__nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
@@ -188,6 +227,10 @@ inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
return __nvvm_fns(mask, base, offset);
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
// Define __match* builtins CUDA-9 headers expect to see.

@@ -149,9 +149,6 @@ __DEVICE__ double nearbyint(double);
__DEVICE__ float nearbyint(float);
__DEVICE__ double nextafter(double, double);
__DEVICE__ float nextafter(float, float);
__DEVICE__ double nexttoward(double, double);
__DEVICE__ float nexttoward(float, double);
__DEVICE__ float nexttowardf(float, double);
__DEVICE__ double pow(double, double);
__DEVICE__ double pow(double, int);
__DEVICE__ float pow(float, float);
@@ -185,6 +182,10 @@ __DEVICE__ float tgamma(float);
__DEVICE__ double trunc(double);
__DEVICE__ float trunc(float);
// Notably missing above is nexttoward, which we don't define on
// the device side because libdevice doesn't give us an implementation, and we
// don't want to be in the business of writing one ourselves.
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
@@ -255,7 +256,6 @@ using ::nan;
using ::nanf;
using ::nearbyint;
using ::nextafter;
using ::nexttoward;
using ::pow;
using ::remainder;
using ::remquo;

@@ -270,12 +270,18 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
// include guard from math.h wrapper from libstdc++. We have to undo the header
// guard temporarily to get the definitions we need.
#pragma push_macro("_GLIBCXX_MATH_H")
#pragma push_macro("_LIBCPP_VERSION")
#if CUDA_VERSION >= 9000
#undef _GLIBCXX_MATH_H
// We also need to undo another guard that checks for libc++ 3.8+
#ifdef _LIBCPP_VERSION
#define _LIBCPP_VERSION 3700
#endif
#endif
#include "math_functions.hpp"
#pragma pop_macro("_GLIBCXX_MATH_H")
#pragma pop_macro("_LIBCPP_VERSION")
#pragma pop_macro("__GNUC__")
#pragma pop_macro("signbit")

File diff suppressed because it is too large

@@ -0,0 +1,97 @@
/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512BITALGINTRIN_H
#define __AVX512BITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi16(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
(__v32hi) _mm512_popcnt_epi16(__B),
(__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_hi(),
__U,
__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi8(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
(__v64qi) _mm512_popcnt_epi8(__B),
(__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_qi(),
__U,
__B);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
(__v64qi) __B,
__U);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
{
return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
__A,
__B);
}
#undef __DEFAULT_FN_ATTRS
#endif
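For orientation: each intrinsic in this new header computes an independent population count per 8- or 16-bit lane, and the mask/maskz wrappers select per lane between that result and a passthrough or zero value. A scalar model of one 16-bit lane (illustration only; popcnt16 is a hypothetical helper, not part of the header):

#include <assert.h>
#include <stdint.h>

static uint16_t popcnt16(uint16_t x) {
  uint16_t n = 0;
  while (x) { x &= (uint16_t)(x - 1); n++; }  // clear the lowest set bit
  return n;
}

int main(void) {
  assert(popcnt16(0x00FF) == 8);  // what one lane of vpopcntw computes
  assert(popcnt16(0xA5A5) == 8);
  // _mm512_mask_popcnt_epi16 keeps __A's lane where the mask bit is 0
  // and takes the popcount where it is 1; the maskz form zeroes instead.
  return 0;
}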

@@ -56,293 +56,145 @@ _mm512_setzero_hi(void) {
/* Integer compare */
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
(__mmask64)-1);
}
#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1); })
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
__u);
}
#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m)); })
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
(__mmask64)-1);
}
#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1); })
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
__u);
}
#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m)); })
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
(__mmask32)-1);
}
#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1); })
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
__u);
}
#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m)); })
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
(__mmask32)-1);
}
#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1); })
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
__u);
}
#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m)); })
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
(__mmask64)-1);
}
#define _mm512_cmpeq_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
__u);
}
#define _mm512_cmpeq_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
(__mmask64)-1);
}
#define _mm512_cmpeq_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
__u);
}
#define _mm512_cmpeq_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
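The rewrite above folds the removed inline functions into the _mm512_cmp_*_mask macros: the hard-coded immediates they passed (0, 1, 2, 4, 5, 6) are exactly the named _MM_CMPINT_* predicates, so e.g. _mm512_cmpge_epi8_mask still lowers to the same __builtin_ia32_cmpb512_mask(..., 5, ...) call as before. A quick check of that correspondence:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* EQ=0, LT=1, LE=2, NE=4, GE(NLT)=5, GT(NLE)=6 per avx512fintrin.h */
  printf("EQ=%d LT=%d LE=%d NE=%d GE=%d GT=%d\n",
         _MM_CMPINT_EQ, _MM_CMPINT_LT, _MM_CMPINT_LE,
         _MM_CMPINT_NE, _MM_CMPINT_GE, _MM_CMPINT_GT);
  return 0;
}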
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_add_epi8 (__m512i __A, __m512i __B) {
@@ -1541,46 +1393,6 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
}
#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1); })
#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m)); })
#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1); })
#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m)); })
#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1); })
#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m)); })
#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1); })
#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m)); })
#define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \
(__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
(__v32hi)_mm512_undefined_epi32(), \
@@ -2042,15 +1854,13 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
{
return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
(__mmask64) __B);
return (__mmask64) (( __A & 0xFFFFFFFF) | ( __B << 32));
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
{
return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
(__mmask32) __B);
return (__mmask32) (( __A & 0xFFFF) | ( __B << 16));
}
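The kunpack rewrite above (and the analogous _mm512_kunpackb change further down) concatenates two mask registers: the low half of the result comes from __A, the high half from __B. A scalar check of the 64-bit case, as a sketch:

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint64_t a = 0x00000000AAAAAAAAull;  /* only the low 32 bits are used */
  uint64_t b = 0x00000000BBBBBBBBull;
  uint64_t r = (a & 0xFFFFFFFFull) | (b << 32);  /* kunpackd */
  assert(r == 0xBBBBBBBBAAAAAAAAull);
  return 0;
}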
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -2105,61 +1915,56 @@ _mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_test_epi8_mask (__m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
(__v64qi) __B,
(__mmask64) -1);
return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
(__v64qi) __B, __U);
return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_test_epi16_mask (__m512i __A, __m512i __B)
{
return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
(__v32hi) __B,
(__mmask32) -1);
return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
{
return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
(__v32hi) __B, __U);
return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
(__v64qi) __B,
(__mmask64) -1);
return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_qi());
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
(__v64qi) __B, __U);
return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
{
return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
(__v32hi) __B,
(__mmask32) -1);
return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
{
return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
(__v32hi) __B, __U);
return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
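The ptestm/ptestnm builtins above are replaced by an AND plus a compare against zero: test sets a mask bit where (a & b) != 0 in that lane, testn where it is zero. A four-lane scalar model (illustration only):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t a[4] = {0x0F, 0x00, 0xF0, 0xFF};
  uint8_t b[4] = {0xF0, 0xFF, 0xF0, 0x01};
  unsigned test = 0, testn = 0;
  for (int i = 0; i < 4; i++) {
    if (a[i] & b[i]) test  |= 1u << i;  /* cmpneq(a & b, 0) */
    else             testn |= 1u << i;  /* cmpeq(a & b, 0)  */
  }
  printf("test=0x%X testn=0x%X\n", test, testn);  /* test=0xC testn=0x3 */
  return 0;
}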
static __inline__ __mmask64 __DEFAULT_FN_ATTRS

@@ -130,13 +130,14 @@ _mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmb_epi64 (__mmask8 __A)
{
return (__m512i) __builtin_ia32_broadcastmb512 (__A);
return (__m512i) _mm512_set1_epi64((long long) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmw_epi32 (__mmask16 __A)
{
return (__m512i) __builtin_ia32_broadcastmw512 (__A);
return (__m512i) _mm512_set1_epi32((int) __A);
}
#undef __DEFAULT_FN_ATTRS
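broadcastmb/broadcastmw zero-extend the mask register and splat it into every 64- or 32-bit lane, which is why a plain set1 is a drop-in replacement for the removed builtins. A scalar sketch of the 64-bit case:

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint8_t mask = 0xA5;
  uint64_t lanes[8];
  for (int i = 0; i < 8; i++)
    lanes[i] = (uint64_t)mask;  /* what _mm512_set1_epi64 does per lane */
  assert(lanes[0] == 0xA5 && lanes[7] == 0xA5);
  return 0;
}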

@@ -8787,7 +8787,7 @@ _mm512_kortestz (__mmask16 __A, __mmask16 __B)
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
{
return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
return (__mmask16) (( __A & 0xFF) | ( __B << 8));
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS

@@ -0,0 +1,391 @@
/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VBMI2INTRIN_H
#define __AVX512VBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_hi(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_qi(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
{
__builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
{
__builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_hi(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_qi(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
(__v32hi) _mm512_setzero_hi(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
(__v64qi) _mm512_setzero_qi(),
__U);
}
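For orientation, compress gathers the mask-selected lanes contiguously at the low end of the destination and expand is its inverse, scattering contiguous source values back into the selected lanes. A four-lane scalar model of the maskz compress form (illustration only):

#include <stdio.h>

int main(void) {
  int src[4] = {10, 20, 30, 40};
  unsigned mask = 0x5;        /* lanes 0 and 2 selected */
  int dst[4] = {0, 0, 0, 0};  /* maskz: unselected output lanes stay 0 */
  int j = 0;
  for (int i = 0; i < 4; i++)
    if (mask & (1u << i))
      dst[j++] = src[i];      /* compress */
  printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  /* 10 30 0 0 */
  return 0;
}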
#define _mm512_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \
(__v8di)(B), \
(int)(I), \
(__v8di)(S), \
(__mmask8)(U)); })
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
_mm512_mask_shldi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shldi_epi64(A, B, I) \
_mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
#define _mm512_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \
(__v16si)(B), \
(int)(I), \
(__v16si)(S), \
(__mmask16)(U)); })
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
_mm512_mask_shldi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shldi_epi32(A, B, I) \
_mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
#define _mm512_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \
(__v32hi)(B), \
(int)(I), \
(__v32hi)(S), \
(__mmask32)(U)); })
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
_mm512_mask_shldi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shldi_epi16(A, B, I) \
_mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \
(__v8di)(B), \
(int)(I), \
(__v8di)(S), \
(__mmask8)(U)); })
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
_mm512_mask_shrdi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shrdi_epi64(A, B, I) \
_mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \
(__v16si)(B), \
(int)(I), \
(__v16si)(S), \
(__mmask16)(U)); })
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
_mm512_mask_shrdi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shrdi_epi32(A, B, I) \
_mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \
(__v32hi)(B), \
(int)(I), \
(__v32hi)(S), \
(__mmask32)(U)); })
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
_mm512_mask_shrdi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shrdi_epi16(A, B, I) \
_mm512_mask_shrdi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
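The shldi/shrdi macros above are funnel shifts: per lane, the two operands are conceptually concatenated, shifted by the immediate, and one half is kept; for the 16-bit left form the result is the high half of (A:B) << I. A one-lane scalar check, as a sketch of that reading of the semantics:

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint16_t a = 0x1234, b = 0xABCD;
  int i = 4;
  uint32_t concat = ((uint32_t)a << 16) | b;       /* A:B */
  uint16_t shld = (uint16_t)(concat >> (16 - i));  /* high half of (A:B) << I */
  assert(shld == 0x234A);
  return 0;
}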
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvq512_maskz ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi64(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvd512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvw512_maskz ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi16(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
(__mmask32) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvq512_maskz ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi64(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvd512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvw512_maskz ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi16(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
(__mmask32) -1);
}
#undef __DEFAULT_FN_ATTRS
#endif

@@ -0,0 +1,157 @@
/*===------------- avx512vlbitalgintrin.h - BITALG intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLBITALGINTRIN_H
#define __AVX512VLBITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg")))
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_popcnt_epi16(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
(__v16hi) _mm256_popcnt_epi16(__B),
(__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
__U,
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_popcnt_epi16(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
(__v8hi) _mm128_popcnt_epi16(__B),
(__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
{
return _mm128_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
__U,
__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_popcnt_epi8(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
(__v32qi) _mm256_popcnt_epi8(__B),
(__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
__U,
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_popcnt_epi8(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
(__v16qi) _mm128_popcnt_epi8(__B),
(__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
{
return _mm128_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
__U,
__B);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm256_mask_bitshuffle_epi32_mask(__mmask32 __U, __m256i __A, __m256i __B)
{
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
(__v32qi) __B,
__U);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm256_bitshuffle_epi32_mask(__m256i __A, __m256i __B)
{
return _mm256_mask_bitshuffle_epi32_mask((__mmask32) -1,
__A,
__B);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm128_mask_bitshuffle_epi16_mask(__mmask16 __U, __m128i __A, __m128i __B)
{
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
(__v16qi) __B,
__U);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm128_bitshuffle_epi16_mask(__m128i __A, __m128i __B)
{
return _mm128_mask_bitshuffle_epi16_mask((__mmask16) -1,
__A,
__B);
}
#undef __DEFAULT_FN_ATTRS
#endif

File diff suppressed because it is too large

@@ -33,26 +33,26 @@
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastmb_epi64 (__mmask8 __A)
{
return (__m128i) __builtin_ia32_broadcastmb128 (__A);
return (__m128i) _mm_set1_epi64x((long long) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastmb_epi64 (__mmask8 __A)
{
return (__m256i) __builtin_ia32_broadcastmb256 (__A);
return (__m256i) _mm256_set1_epi64x((long long)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastmw_epi32 (__mmask16 __A)
{
return (__m128i) __builtin_ia32_broadcastmw128 (__A);
return (__m128i) _mm_set1_epi32((int)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastmw_epi32 (__mmask16 __A)
{
return (__m256i) __builtin_ia32_broadcastmw256 (__A);
return (__m256i) _mm256_set1_epi32((int)__A);
}

File diff suppressed because it is too large

@@ -0,0 +1,748 @@
/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLVBMI2INTRIN_H
#define __AVX512VLVBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2")))
static __inline __m128i __DEFAULT_FN_ATTRS
_mm128_setzero_hi(void) {
return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 };
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_compress_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) _mm128_setzero_hi(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_compress_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) _mm128_setzero_hi(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm128_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
{
__builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm128_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
{
__builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expand_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) _mm128_setzero_hi(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expand_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) _mm128_setzero_hi(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) _mm128_setzero_hi(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) _mm128_setzero_hi(),
__U);
}
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setzero_hi(void) {
return (__m256i)(__v16hi){ 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 };
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_hi(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_hi(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
{
__builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
{
__builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_hi(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_hi(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
(__v16hi) _mm256_setzero_hi(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
(__v32qi) _mm256_setzero_hi(),
__U);
}
#define _mm256_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \
(__v4di)(B), \
(int)(I), \
(__v4di)(S), \
(__mmask8)(U)); })
#define _mm256_maskz_shldi_epi64(U, A, B, I) \
_mm256_mask_shldi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shldi_epi64(A, B, I) \
_mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \
(__v2di)(B), \
(int)(I), \
(__v2di)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shldi_epi64(U, A, B, I) \
_mm128_mask_shldi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shldi_epi64(A, B, I) \
_mm128_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \
(__v8si)(B), \
(int)(I), \
(__v8si)(S), \
(__mmask8)(U)); })
#define _mm256_maskz_shldi_epi32(U, A, B, I) \
_mm256_mask_shldi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shldi_epi32(A, B, I) \
_mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \
(__v4si)(B), \
(int)(I), \
(__v4si)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shldi_epi32(U, A, B, I) \
_mm128_mask_shldi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shldi_epi32(A, B, I) \
_mm128_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \
(__v16hi)(B), \
(int)(I), \
(__v16hi)(S), \
(__mmask16)(U)); })
#define _mm256_maskz_shldi_epi16(U, A, B, I) \
_mm256_mask_shldi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shldi_epi16(A, B, I) \
_mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \
(__v8hi)(B), \
(int)(I), \
(__v8hi)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shldi_epi16(U, A, B, I) \
_mm128_mask_shldi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shldi_epi16(A, B, I) \
_mm128_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \
(__v4di)(B), \
(int)(I), \
(__v4di)(S), \
(__mmask8)(U)); })
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
_mm256_mask_shrdi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shrdi_epi64(A, B, I) \
_mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \
(__v2di)(B), \
(int)(I), \
(__v2di)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shrdi_epi64(U, A, B, I) \
_mm128_mask_shrdi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shrdi_epi64(A, B, I) \
_mm128_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \
(__v8si)(B), \
(int)(I), \
(__v8si)(S), \
(__mmask8)(U)); })
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
_mm256_mask_shrdi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shrdi_epi32(A, B, I) \
_mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \
(__v4si)(B), \
(int)(I), \
(__v4si)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shrdi_epi32(U, A, B, I) \
_mm128_mask_shrdi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shrdi_epi32(A, B, I) \
_mm128_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \
(__v16hi)(B), \
(int)(I), \
(__v16hi)(S), \
(__mmask16)(U)); })
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
_mm256_mask_shrdi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shrdi_epi16(A, B, I) \
_mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \
(__v8hi)(B), \
(int)(I), \
(__v8hi)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shrdi_epi16(U, A, B, I) \
_mm128_mask_shrdi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shrdi_epi16(A, B, I) \
_mm128_mask_shrdi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
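These macros wrap the VBMI2 funnel shifts: per lane, the element from A is concatenated above the element from B, the pair is shifted left by I, and the high half is kept (shrdi mirrors this with a right shift keeping the low half). A scalar sketch of one 64-bit shldi lane; the helper name is ours:

#include <stdint.h>

/* Scalar sketch of one lane of _mm256_shldi_epi64: concatenate
 * a (high) : b (low) into a 128-bit value, shift left by i, keep the
 * high 64 bits. */
static uint64_t shldi64_lane(uint64_t a, uint64_t b, unsigned i) {
    i &= 63;    /* the shift count is taken modulo the lane width */
    return i ? (a << i) | (b >> (64 - i)) : a;
}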
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
(__mmask16) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
(__mmask16) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
(__mmask8) -1);
}
#undef __DEFAULT_FN_ATTRS
#endif
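For readers new to VBMI2: compress packs the lanes selected by the mask toward the low end, and the maskz form zero-fills the rest. A hedged usage sketch against this header as shipped (the _mm128_ prefix is this snapshot's spelling; main() and the expected output are ours):

#include <immintrin.h>
#include <stdio.h>

/* Sketch (compile with -mavx512vl -mavx512vbmi2): keep the word lanes
 * whose mask bit is 1 and pack them toward the low end; remaining
 * lanes are zeroed by the maskz form. */
int main(void) {
    __m128i src = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
    /* mask 0x55 keeps lanes 0, 2, 4, 6 */
    __m128i packed = _mm128_maskz_compress_epi16(0x55, src);
    short out[8];
    _mm_storeu_si128((__m128i *)out, packed);
    for (int i = 0; i < 8; i++)
        printf("%d ", out[i]);    /* 10 12 14 16 0 0 0 0 */
    printf("\n");
    return 0;
}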


@ -0,0 +1,254 @@
/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLVNNIINTRIN_H
#define __AVX512VLVNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni")))
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusd256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusds256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssd256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssds256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusd128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusds128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssd128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssds128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
#undef __DEFAULT_FN_ATTRS
#endif
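A hedged usage sketch of the 128-bit VNNI form; the _mm128_ spelling is what this header defines, and main() with its expected value is ours:

#include <immintrin.h>
#include <stdio.h>

/* Sketch (compile with -mavx512vl -mavx512vnni): accumulate u8 x s8
 * dot products into epi32 lanes. */
int main(void) {
    __m128i acc = _mm_setzero_si128();
    __m128i a = _mm_set1_epi8(2);            /* unsigned operand */
    __m128i b = _mm_set1_epi8(3);            /* signed operand */
    acc = _mm128_dpbusd_epi32(acc, a, b);    /* each lane: 4 * (2*3) = 24 */
    printf("%d\n", _mm_cvtsi128_si32(acc));  /* prints 24 */
    return 0;
}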


@ -0,0 +1,146 @@
/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VNNIINTRIN_H
#define __AVX512VNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusd512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusds512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssd512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssds512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
#undef __DEFAULT_FN_ATTRS
#endif
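All three widths share the same per-lane arithmetic: each 32-bit accumulator adds four products of an unsigned byte from A and a signed byte from B (the s-suffixed forms additionally saturate the final add). A scalar reference for one vpdpbusd lane; the helper name is ours:

#include <stdint.h>

/* Scalar sketch of one epi32 lane of vpdpbusd: a holds 4 unsigned
 * bytes, b holds 4 signed bytes; products are widened, then summed
 * into the accumulator. */
static int32_t dpbusd_lane(int32_t s, uint32_t a, uint32_t b) {
    for (int i = 0; i < 4; i++) {
        uint8_t ua = (uint8_t)(a >> (8 * i));
        int8_t  sb = (int8_t)(b >> (8 * i));
        s += (int32_t)ua * (int32_t)sb;
    }
    return s;
}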


@ -0,0 +1,99 @@
/*===---- avx512vpopcntdqvlintrin.h - AVX512VPOPCNTDQ intrinsics ---------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VPOPCNTDQVLINTRIN_H
#define __AVX512VPOPCNTDQVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl")))
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi64(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_selectq_128(
(__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi32(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_selectd_128(
(__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi64(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
return (__m256i)__builtin_ia32_selectq_256(
(__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi32(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
return (__m256i)__builtin_ia32_selectd_256(
(__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
}
#undef __DEFAULT_FN_ATTRS
#endif
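A hedged usage sketch of the masked VL popcount; main(), the mask choice, and the expected output are ours:

#include <immintrin.h>
#include <stdio.h>

/* Sketch (compile with -mavx512vpopcntdq -mavx512vl): per-lane bit
 * count, with the masked form merging from __W where the mask bit
 * is 0. */
int main(void) {
    __m256i v = _mm256_set1_epi64x(0xFFULL);    /* 8 bits set per lane */
    __m256i ones = _mm256_set1_epi64x(-1LL);
    /* count lanes 0 and 2 only; lanes 1 and 3 keep the value from ones */
    __m256i c = _mm256_mask_popcnt_epi64(ones, 0x5, v);
    long long out[4];
    _mm256_storeu_si256((__m256i *)out, c);
    printf("%lld %lld %lld %lld\n", out[0], out[1], out[2], out[3]); /* 8 -1 8 -1 */
    return 0;
}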

c_headers/cetintrin.h

@ -0,0 +1,93 @@
/*===---- cetintrin.h - CET intrinsic ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __CETINTRIN_H
#define __CETINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("shstk")))
static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
__builtin_ia32_incsspd(__a);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
__builtin_ia32_incsspq(__a);
}
#endif /* __x86_64__ */
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
return __builtin_ia32_rdsspd(__a);
}
#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
return __builtin_ia32_rdsspq(__a);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
__builtin_ia32_saveprevssp();
}
static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
__builtin_ia32_rstorssp(__p);
}
static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
__builtin_ia32_wrssd(__a, __p);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
__builtin_ia32_wrssq(__a, __p);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
__builtin_ia32_wrussd(__a, __p);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
__builtin_ia32_wrussq(__a, __p);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() {
__builtin_ia32_setssbsy();
}
static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
__builtin_ia32_clrssbsy(__p);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __CETINTRIN_H */
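These shadow-stack intrinsics only do real work on CET-enabled hardware and kernels; a cautious sketch (the NOP fallback note reflects the ISA's hint-space encoding, and main() is ours):

#include <immintrin.h>
#include <stdio.h>

/* Sketch only (compile with -mshstk): read the current shadow-stack
 * pointer. Where CET shadow stacks are not enabled, rdssp executes as
 * a NOP and leaves its input unchanged, so a zero seed staying zero is
 * the common "not enabled" outcome. */
int main(void) {
#ifdef __x86_64__
    unsigned long long ssp = _rdsspq(0);
    printf("shadow stack pointer: %#llx\n", ssp);
#endif
    return 0;
}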


@ -173,16 +173,24 @@
#define bit_AVX512VL 0x80000000
/* Features in %ecx for leaf 7 sub-leaf 0 */
#define bit_PREFTCHWT1 0x00000001
#define bit_AVX512VBMI 0x00000002
#define bit_PKU 0x00000004
#define bit_OSPKE 0x00000010
#define bit_PREFTCHWT1 0x00000001
#define bit_AVX512VBMI 0x00000002
#define bit_PKU 0x00000004
#define bit_OSPKE 0x00000010
#define bit_AVX512VBMI2 0x00000040
#define bit_SHSTK 0x00000080
#define bit_GFNI 0x00000100
#define bit_VAES 0x00000200
#define bit_VPCLMULQDQ 0x00000400
#define bit_AVX512VNNI 0x00000800
#define bit_AVX512BITALG 0x00001000
#define bit_AVX512VPOPCNTDQ 0x00004000
#define bit_RDPID 0x00400000
#define bit_RDPID 0x00400000
/* Features in %edx for leaf 7 sub-leaf 0 */
#define bit_AVX5124VNNIW 0x00000004
#define bit_AVX5124FMAPS 0x00000008
#define bit_IBT 0x00100000
/* Features in %eax for leaf 13 sub-leaf 1 */
#define bit_XSAVEOPT 0x00000001
@ -192,6 +200,7 @@
/* Features in %ecx for leaf 0x80000001 */
#define bit_LAHF_LM 0x00000001
#define bit_ABM 0x00000020
#define bit_LZCNT bit_ABM /* for gcc compat */
#define bit_SSE4a 0x00000040
#define bit_PRFCHW 0x00000100
#define bit_XOP 0x00000800
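A hedged sketch of testing the new leaf-7 %ecx bits; it assumes the __get_cpuid_count helper from this same cpuid.h, and main() is ours:

#include <cpuid.h>
#include <stdio.h>

/* Sketch: query leaf 7, sub-leaf 0, then test the newly added
 * feature bits in %ecx. */
int main(void) {
    unsigned eax, ebx, ecx, edx;
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        printf("VAES:        %s\n", (ecx & bit_VAES) ? "yes" : "no");
        printf("VPCLMULQDQ:  %s\n", (ecx & bit_VPCLMULQDQ) ? "yes" : "no");
        printf("AVX512VNNI:  %s\n", (ecx & bit_AVX512VNNI) ? "yes" : "no");
    }
    return 0;
}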


@ -80,7 +80,7 @@ min(const __T &__a, const __T &__b, __Cmp __cmp) {
template <class __T>
inline __device__ const __T &
min(const __T &__a, const __T &__b) {
return __a < __b ? __b : __a;
return __a < __b ? __a : __b;
}
#ifdef _LIBCPP_END_NAMESPACE_STD
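The one-character fix above flips a ternary that made the CUDA wrapper's min() return the larger operand; the shape of the bug in plain C:

#include <stdio.h>

/* Plain-C sketch of the bug fixed above: the old ternary picked the
 * wrong arm, so "min" returned the larger value. */
static int min_old(int a, int b) { return a < b ? b : a; }  /* buggy */
static int min_new(int a, int b) { return a < b ? a : b; }  /* fixed */

int main(void) {
    printf("old min(1, 2) = %d\n", min_old(1, 2));  /* prints 2 */
    printf("new min(1, 2) = %d\n", min_new(1, 2));  /* prints 1 */
    return 0;
}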


@ -217,8 +217,8 @@ _mm_div_pd(__m128d __a, __m128d __b)
/// \brief Calculates the square root of the lower double-precision value of
/// the second operand and returns it in the lower 64 bits of the result.
/// The upper 64 bits of the result are copied from the upper double-
/// precision value of the first operand.
/// The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// \headerfile <x86intrin.h>
///
@ -260,8 +260,8 @@ _mm_sqrt_pd(__m128d __a)
/// \brief Compares lower 64-bit double-precision values of both operands, and
/// returns the lesser of the pair of values in the lower 64-bits of the
/// result. The upper 64 bits of the result are copied from the upper double-
/// precision value of the first operand.
/// result. The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// \headerfile <x86intrin.h>
///
@ -304,8 +304,8 @@ _mm_min_pd(__m128d __a, __m128d __b)
/// \brief Compares lower 64-bit double-precision values of both operands, and
/// returns the greater of the pair of values in the lower 64-bits of the
/// result. The upper 64 bits of the result are copied from the upper double-
/// precision value of the first operand.
/// result. The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// \headerfile <x86intrin.h>
///
@ -983,8 +983,10 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b)
}
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality. The
/// comparison yields 0 for false, 1 for true.
/// the two 128-bit floating-point vectors of [2 x double] for equality.
///
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -996,7 +998,8 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a, __m128d __b)
{
@ -1008,7 +1011,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b)
/// the value in the first parameter is less than the corresponding value in
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1020,7 +1024,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a, __m128d __b)
{
@ -1032,7 +1037,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b)
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1044,7 +1050,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a, __m128d __b)
{
@ -1056,7 +1063,8 @@ _mm_comile_sd(__m128d __a, __m128d __b)
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1068,7 +1076,8 @@ _mm_comile_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a, __m128d __b)
{
@ -1080,7 +1089,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b)
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1092,7 +1102,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a, __m128d __b)
{
@ -1104,7 +1115,8 @@ _mm_comige_sd(__m128d __a, __m128d __b)
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 1 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1116,7 +1128,8 @@ _mm_comige_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a, __m128d __b)
{
@ -1127,7 +1140,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b)
/// the two 128-bit floating-point vectors of [2 x double] for equality. The
/// comparison yields 0 for false, 1 for true.
///
/// If either of the two lower double-precision values is NaN, 1 is returned.
/// If either of the two lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1140,7 +1153,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b)
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 1 is returned.
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a, __m128d __b)
{
@ -1153,7 +1166,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b)
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 1 is returned.
/// double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1166,7 +1179,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b)
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 1 is returned.
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_sd(__m128d __a, __m128d __b)
{
@ -1179,7 +1192,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b)
/// corresponding value in the second parameter.
///
/// The comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 1 is returned.
/// double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1192,7 +1205,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b)
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 1 is returned.
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_sd(__m128d __a, __m128d __b)
{
@ -1257,7 +1270,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b)
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 0 is returned.
/// double-precision values is NaN, 1 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1270,7 +1283,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b)
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison result. If either of the two
/// lower double-precision values is NaN, 0 is returned.
/// lower double-precision values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_sd(__m128d __a, __m128d __b)
{
@ -1935,14 +1948,15 @@ _mm_store_pd(double *__dp, __m128d __a)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c>VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
/// This intrinsic corresponds to the
/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
///
/// \param __dp
/// A pointer to a memory location that can store two double-precision
/// values.
/// \param __a
/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
/// of the values in \a dp.
/// of the values in \a __dp.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_pd(double *__dp, __m128d __a)
{
@ -1950,18 +1964,20 @@ _mm_store1_pd(double *__dp, __m128d __a)
_mm_store_pd(__dp, __a);
}
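A sketch of the duplicating-store semantics that the corrected comments below describe; main() and the aligned buffer are ours:

#include <emmintrin.h>
#include <stdio.h>

/* Sketch: _mm_store1_pd (and the _mm_store_pd1 alias documented below)
 * writes the LOW double of the vector into BOTH 16-byte-aligned slots. */
int main(void) {
    __m128d v = _mm_set_pd(2.0, 1.0);    /* high = 2.0, low = 1.0 */
    double out[2] __attribute__((aligned(16)));
    _mm_store1_pd(out, v);
    printf("%f %f\n", out[0], out[1]);   /* 1.000000 1.000000 */
    return 0;
}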
/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory
/// location.
/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
/// the upper and lower 64 bits of a memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
/// This intrinsic corresponds to the
/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
///
/// \param __dp
/// A pointer to a 128-bit memory location. The address of the memory
/// location has to be 16-byte aligned.
/// A pointer to a memory location that can store two double-precision
/// values.
/// \param __a
/// A 128-bit vector of [2 x double] containing the values to be stored.
/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
/// of the values in \a __dp.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd1(double *__dp, __m128d __a)
{
@ -3846,8 +3862,7 @@ _mm_set1_epi8(char __b)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
/// instruction.
/// This intrinsic does not correspond to a specific instruction.
///
/// \param __q0
/// A 64-bit integral value used to initialize the lower 64 bits of the
@ -4018,7 +4033,7 @@ _mm_storeu_si128(__m128i *__p, __m128i __b)
/// specified unaligned memory location. When a mask bit is 1, the
/// corresponding byte is written, otherwise it is not written.
///
/// To minimize caching, the date is flagged as non-temporal (unlikely to be
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon). Exception and trap behavior for elements not selected
/// for storage to memory are implementation dependent.
///
@ -4532,8 +4547,8 @@ _mm_unpackhi_epi32(__m128i __a, __m128i __b)
return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
}
/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
/// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of
/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
///
/// \headerfile <x86intrin.h>
///
@ -4665,7 +4680,7 @@ _mm_unpacklo_epi64(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
///
/// \param __a
/// A 128-bit integer vector operand. The lower 64 bits are moved to the
@ -4682,7 +4697,7 @@ _mm_movepi64_pi64(__m128i __a)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction.
/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
///
/// \param __a
/// A 64-bit value.
@ -4712,8 +4727,8 @@ _mm_move_epi64(__m128i __a)
return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
}
/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of
/// [2 x double] and interleaves them into a 128-bit vector of [2 x
/// double].
///
/// \headerfile <x86intrin.h>
@ -4733,7 +4748,7 @@ _mm_unpackhi_pd(__m128d __a, __m128d __b)
return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
}
/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors
/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
/// double].
///
@ -4792,9 +4807,9 @@ _mm_movemask_pd(__m128d __a)
/// A 128-bit vector of [2 x double].
/// \param i
/// An 8-bit immediate value. The least significant two bits specify which
/// elements to copy from a and b: \n
/// Bit[0] = 0: lower element of a copied to lower element of result. \n
/// Bit[0] = 1: upper element of a copied to lower element of result. \n
/// elements to copy from \a a and \a b: \n
/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
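The immediate bits described above belong to _mm_shuffle_pd; a sketch with i = 1 (main() and the expected output are ours):

#include <emmintrin.h>
#include <stdio.h>

/* Sketch: i = 1 means Bit[0]=1 (upper element of a -> low result slot)
 * and Bit[1]=0 (lower element of b -> high result slot). */
int main(void) {
    __m128d a = _mm_set_pd(11.0, 10.0);    /* a = [10.0, 11.0] */
    __m128d b = _mm_set_pd(21.0, 20.0);    /* b = [20.0, 21.0] */
    __m128d r = _mm_shuffle_pd(a, b, 1);
    double out[2];
    _mm_storeu_pd(out, r);
    printf("%f %f\n", out[0], out[1]);     /* 11.000000 20.000000 */
    return 0;
}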


@ -60,73 +60,73 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@ -144,13 +144,13 @@ _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@ -168,37 +168,37 @@ _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -216,13 +216,13 @@ _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS
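These FMA4 forms are now lowered through the plain vfmadd builtins with negated operands instead of dedicated vfmsub/vfnmadd/vfnmsub builtins. A minimal sketch (hypothetical test file, not part of the commit; assumes an FMA4-capable CPU and a flag such as -mfma4) checking the _mm_msub_ps identity a*b - c:

/* Hypothetical check that _mm_msub_ps(a, b, c) == a*b - c per lane. */
#include <x86intrin.h>
#include <stdio.h>

int main(void) {
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* lane 0 holds 1.0f */
    __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);
    __m128 c = _mm_set1_ps(1.0f);
    float r[4];
    _mm_storeu_ps(r, _mm_msub_ps(a, b, c));
    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);  /* 4 11 20 31 */
    return 0;
}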

c_headers/fmaintrin.h

@@ -46,85 +46,85 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -142,13 +142,13 @@ _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -166,37 +166,37 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -214,13 +214,13 @@ _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS
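The FMA3 scalar forms now call the new vfmaddss3/vfmaddsd3 builtins, and negated-operand rewrites replace the old vfnm*/vfms* builtins here as well. A hedged sketch (hypothetical test, assumes FMA3 hardware and e.g. -mfma) of the _mm_fnmadd_ss contract: lane 0 receives -(a0*b0) + c0 and the upper lanes pass through from __A:

/* Hypothetical check of scalar FNMADD: lane 0 = -(2*3)+10 = 4,
   lanes 1..3 are copied unchanged from the first operand. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128 a = _mm_set_ps(40.0f, 30.0f, 20.0f, 2.0f);  /* lane 0 holds 2.0f */
    __m128 b = _mm_set1_ps(3.0f);
    __m128 c = _mm_set1_ps(10.0f);
    float r[4];
    _mm_storeu_ps(r, _mm_fnmadd_ss(a, b, c));
    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);  /* 4 20 30 40 */
    return 0;
}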

c_headers/gfniintrin.h (new file)

@@ -0,0 +1,202 @@
/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __GFNIINTRIN_H
#define __GFNIINTRIN_H
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I)); })
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)); })
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I); })
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I)); })
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)); })
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I); })
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I)); })
#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S)); })
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_qi(), \
U, A, B, I); })
#define _mm_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I)); })
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)); })
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I); })
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I)); })
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)); })
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I); })
#define _mm512_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I)); })
#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S)); })
#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_qi(), \
U, A, B, I); })
/* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni")))
/* Default attributes for ZMM forms. */
#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni")))
/* Default attributes for VLX forms. */
#define __DEFAULT_FN_ATTRS_VL __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni")))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
(__v16qi) __B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_selectb_128(__U,
(__v16qi) _mm_gf2p8mul_epi8(__A, __B),
(__v16qi) __S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
__U, __A, __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
(__v32qi) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_selectb_256(__U,
(__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
(__v32qi) __S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
__U, __A, __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
(__v64qi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512(__U,
(__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
(__v64qi) __S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_qi(),
__U, __A, __B);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_F
#undef __DEFAULT_FN_ATTRS_VL
#endif // __GFNIINTRIN_H
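_mm_gf2p8mul_epi8 multiplies corresponding bytes as elements of GF(2^8) reduced by the AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11B). A hedged sketch (hypothetical test, assumes GFNI hardware and e.g. -mgfni):

/* Hypothetical GF(2^8) sanity check: multiplying by 1 is the identity,
   and 0x80 * 0x02 overflows into the reduction, 0x100 ^ 0x11B = 0x1B. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a   = _mm_set1_epi8((char)0x80);
    __m128i one = _mm_set1_epi8(1);
    __m128i two = _mm_set1_epi8(2);
    unsigned char r[16];
    _mm_storeu_si128((__m128i *)r, _mm_gf2p8mul_epi8(a, one));
    printf("0x%02x\n", r[0]);  /* 0x80 */
    _mm_storeu_si128((__m128i *)r, _mm_gf2p8mul_epi8(a, two));
    printf("0x%02x\n", r[0]);  /* 0x1b */
    return 0;
}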

c_headers/immintrin.h

@@ -118,6 +118,10 @@ _mm256_cvtph_ps(__m128i __a)
}
#endif /* __AVX2__ */
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__)
#include <vpclmulqdqintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
#include <bmiintrin.h>
#endif
@@ -146,6 +150,10 @@ _mm256_cvtph_ps(__m128i __a)
#include <avx512bwintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BITALG__)
#include <avx512bitalgintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
#include <avx512cdintrin.h>
#endif
@@ -154,10 +162,29 @@ _mm256_cvtph_ps(__m128i __a)
#include <avx512vpopcntdqintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
#include <avx512vpopcntdqvlintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VNNI__)
#include <avx512vnniintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512VNNI__))
#include <avx512vlvnniintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
#include <avx512dqintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512BITALG__))
#include <avx512vlbitalgintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512BW__))
#include <avx512vlbwintrin.h>
@@ -195,6 +222,15 @@ _mm256_cvtph_ps(__m128i __a)
#include <avx512vbmivlintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI2__)
#include <avx512vbmi2intrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VBMI2__) && defined(__AVX512VL__))
#include <avx512vlvbmi2intrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
#include <avx512pfintrin.h>
#endif
@@ -203,6 +239,14 @@ _mm256_cvtph_ps(__m128i __a)
#include <pkuintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VAES__)
#include <vaesintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__GFNI__)
#include <gfniintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand16_step(unsigned short *__p)
@@ -319,6 +363,10 @@ _writegsbase_u64(unsigned long long __V)
#include <xsavesintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHSTK__)
#include <cetintrin.h>
#endif
/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
* whereas others are also available at all times. */
#include <adxintrin.h>
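Each newly wired sub-header is only pulled in when its feature macro is defined (or under MSVC/modules), so code including the umbrella header typically guards its uses with the same macro. A minimal sketch, assuming a translation unit built with and without -mgfni:

/* Hypothetical usage sketch: <immintrin.h> is always safe to include;
   the GFNI intrinsic is only referenced when __GFNI__ is defined,
   mirroring the gate immintrin.h itself uses. */
#include <immintrin.h>

#if defined(__GFNI__)
__m128i gf_mul_bytes(__m128i a, __m128i b) {
    return _mm_gf2p8mul_epi8(a, b);
}
#endif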

c_headers/opencl-c.h

@@ -15886,6 +15886,313 @@ double __ovld __conv sub_group_scan_inclusive_max(double x);
#endif //cl_khr_subgroups cl_intel_subgroups
#if defined(cl_intel_subgroups)
// Intel-Specific Sub Group Functions
float __ovld __conv intel_sub_group_shuffle( float x, uint c );
float2 __ovld __conv intel_sub_group_shuffle( float2 x, uint c );
float3 __ovld __conv intel_sub_group_shuffle( float3 x, uint c );
float4 __ovld __conv intel_sub_group_shuffle( float4 x, uint c );
float8 __ovld __conv intel_sub_group_shuffle( float8 x, uint c );
float16 __ovld __conv intel_sub_group_shuffle( float16 x, uint c );
int __ovld __conv intel_sub_group_shuffle( int x, uint c );
int2 __ovld __conv intel_sub_group_shuffle( int2 x, uint c );
int3 __ovld __conv intel_sub_group_shuffle( int3 x, uint c );
int4 __ovld __conv intel_sub_group_shuffle( int4 x, uint c );
int8 __ovld __conv intel_sub_group_shuffle( int8 x, uint c );
int16 __ovld __conv intel_sub_group_shuffle( int16 x, uint c );
uint __ovld __conv intel_sub_group_shuffle( uint x, uint c );
uint2 __ovld __conv intel_sub_group_shuffle( uint2 x, uint c );
uint3 __ovld __conv intel_sub_group_shuffle( uint3 x, uint c );
uint4 __ovld __conv intel_sub_group_shuffle( uint4 x, uint c );
uint8 __ovld __conv intel_sub_group_shuffle( uint8 x, uint c );
uint16 __ovld __conv intel_sub_group_shuffle( uint16 x, uint c );
long __ovld __conv intel_sub_group_shuffle( long x, uint c );
ulong __ovld __conv intel_sub_group_shuffle( ulong x, uint c );
float __ovld __conv intel_sub_group_shuffle_down( float cur, float next, uint c );
float2 __ovld __conv intel_sub_group_shuffle_down( float2 cur, float2 next, uint c );
float3 __ovld __conv intel_sub_group_shuffle_down( float3 cur, float3 next, uint c );
float4 __ovld __conv intel_sub_group_shuffle_down( float4 cur, float4 next, uint c );
float8 __ovld __conv intel_sub_group_shuffle_down( float8 cur, float8 next, uint c );
float16 __ovld __conv intel_sub_group_shuffle_down( float16 cur, float16 next, uint c );
int __ovld __conv intel_sub_group_shuffle_down( int cur, int next, uint c );
int2 __ovld __conv intel_sub_group_shuffle_down( int2 cur, int2 next, uint c );
int3 __ovld __conv intel_sub_group_shuffle_down( int3 cur, int3 next, uint c );
int4 __ovld __conv intel_sub_group_shuffle_down( int4 cur, int4 next, uint c );
int8 __ovld __conv intel_sub_group_shuffle_down( int8 cur, int8 next, uint c );
int16 __ovld __conv intel_sub_group_shuffle_down( int16 cur, int16 next, uint c );
uint __ovld __conv intel_sub_group_shuffle_down( uint cur, uint next, uint c );
uint2 __ovld __conv intel_sub_group_shuffle_down( uint2 cur, uint2 next, uint c );
uint3 __ovld __conv intel_sub_group_shuffle_down( uint3 cur, uint3 next, uint c );
uint4 __ovld __conv intel_sub_group_shuffle_down( uint4 cur, uint4 next, uint c );
uint8 __ovld __conv intel_sub_group_shuffle_down( uint8 cur, uint8 next, uint c );
uint16 __ovld __conv intel_sub_group_shuffle_down( uint16 cur, uint16 next, uint c );
long __ovld __conv intel_sub_group_shuffle_down( long prev, long cur, uint c );
ulong __ovld __conv intel_sub_group_shuffle_down( ulong prev, ulong cur, uint c );
float __ovld __conv intel_sub_group_shuffle_up( float prev, float cur, uint c );
float2 __ovld __conv intel_sub_group_shuffle_up( float2 prev, float2 cur, uint c );
float3 __ovld __conv intel_sub_group_shuffle_up( float3 prev, float3 cur, uint c );
float4 __ovld __conv intel_sub_group_shuffle_up( float4 prev, float4 cur, uint c );
float8 __ovld __conv intel_sub_group_shuffle_up( float8 prev, float8 cur, uint c );
float16 __ovld __conv intel_sub_group_shuffle_up( float16 prev, float16 cur, uint c );
int __ovld __conv intel_sub_group_shuffle_up( int prev, int cur, uint c );
int2 __ovld __conv intel_sub_group_shuffle_up( int2 prev, int2 cur, uint c );
int3 __ovld __conv intel_sub_group_shuffle_up( int3 prev, int3 cur, uint c );
int4 __ovld __conv intel_sub_group_shuffle_up( int4 prev, int4 cur, uint c );
int8 __ovld __conv intel_sub_group_shuffle_up( int8 prev, int8 cur, uint c );
int16 __ovld __conv intel_sub_group_shuffle_up( int16 prev, int16 cur, uint c );
uint __ovld __conv intel_sub_group_shuffle_up( uint prev, uint cur, uint c );
uint2 __ovld __conv intel_sub_group_shuffle_up( uint2 prev, uint2 cur, uint c );
uint3 __ovld __conv intel_sub_group_shuffle_up( uint3 prev, uint3 cur, uint c );
uint4 __ovld __conv intel_sub_group_shuffle_up( uint4 prev, uint4 cur, uint c );
uint8 __ovld __conv intel_sub_group_shuffle_up( uint8 prev, uint8 cur, uint c );
uint16 __ovld __conv intel_sub_group_shuffle_up( uint16 prev, uint16 cur, uint c );
long __ovld __conv intel_sub_group_shuffle_up( long prev, long cur, uint c );
ulong __ovld __conv intel_sub_group_shuffle_up( ulong prev, ulong cur, uint c );
float __ovld __conv intel_sub_group_shuffle_xor( float x, uint c );
float2 __ovld __conv intel_sub_group_shuffle_xor( float2 x, uint c );
float3 __ovld __conv intel_sub_group_shuffle_xor( float3 x, uint c );
float4 __ovld __conv intel_sub_group_shuffle_xor( float4 x, uint c );
float8 __ovld __conv intel_sub_group_shuffle_xor( float8 x, uint c );
float16 __ovld __conv intel_sub_group_shuffle_xor( float16 x, uint c );
int __ovld __conv intel_sub_group_shuffle_xor( int x, uint c );
int2 __ovld __conv intel_sub_group_shuffle_xor( int2 x, uint c );
int3 __ovld __conv intel_sub_group_shuffle_xor( int3 x, uint c );
int4 __ovld __conv intel_sub_group_shuffle_xor( int4 x, uint c );
int8 __ovld __conv intel_sub_group_shuffle_xor( int8 x, uint c );
int16 __ovld __conv intel_sub_group_shuffle_xor( int16 x, uint c );
uint __ovld __conv intel_sub_group_shuffle_xor( uint x, uint c );
uint2 __ovld __conv intel_sub_group_shuffle_xor( uint2 x, uint c );
uint3 __ovld __conv intel_sub_group_shuffle_xor( uint3 x, uint c );
uint4 __ovld __conv intel_sub_group_shuffle_xor( uint4 x, uint c );
uint8 __ovld __conv intel_sub_group_shuffle_xor( uint8 x, uint c );
uint16 __ovld __conv intel_sub_group_shuffle_xor( uint16 x, uint c );
long __ovld __conv intel_sub_group_shuffle_xor( long x, uint c );
ulong __ovld __conv intel_sub_group_shuffle_xor( ulong x, uint c );
uint __ovld __conv intel_sub_group_block_read( read_only image2d_t image, int2 coord );
uint2 __ovld __conv intel_sub_group_block_read2( read_only image2d_t image, int2 coord );
uint4 __ovld __conv intel_sub_group_block_read4( read_only image2d_t image, int2 coord );
uint8 __ovld __conv intel_sub_group_block_read8( read_only image2d_t image, int2 coord );
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read(read_write image2d_t image, int2 coord);
uint2 __ovld __conv intel_sub_group_block_read2(read_write image2d_t image, int2 coord);
uint4 __ovld __conv intel_sub_group_block_read4(read_write image2d_t image, int2 coord);
uint8 __ovld __conv intel_sub_group_block_read8(read_write image2d_t image, int2 coord);
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read( const __global uint* p );
uint2 __ovld __conv intel_sub_group_block_read2( const __global uint* p );
uint4 __ovld __conv intel_sub_group_block_read4( const __global uint* p );
uint8 __ovld __conv intel_sub_group_block_read8( const __global uint* p );
void __ovld __conv intel_sub_group_block_write(write_only image2d_t image, int2 coord, uint data);
void __ovld __conv intel_sub_group_block_write2(write_only image2d_t image, int2 coord, uint2 data);
void __ovld __conv intel_sub_group_block_write4(write_only image2d_t image, int2 coord, uint4 data);
void __ovld __conv intel_sub_group_block_write8(write_only image2d_t image, int2 coord, uint8 data);
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write(read_write image2d_t image, int2 coord, uint data);
void __ovld __conv intel_sub_group_block_write2(read_write image2d_t image, int2 coord, uint2 data);
void __ovld __conv intel_sub_group_block_write4(read_write image2d_t image, int2 coord, uint4 data);
void __ovld __conv intel_sub_group_block_write8(read_write image2d_t image, int2 coord, uint8 data);
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write( __global uint* p, uint data );
void __ovld __conv intel_sub_group_block_write2( __global uint* p, uint2 data );
void __ovld __conv intel_sub_group_block_write4( __global uint* p, uint4 data );
void __ovld __conv intel_sub_group_block_write8( __global uint* p, uint8 data );
#ifdef cl_khr_fp16
half __ovld __conv intel_sub_group_shuffle( half x, uint c );
half __ovld __conv intel_sub_group_shuffle_down( half prev, half cur, uint c );
half __ovld __conv intel_sub_group_shuffle_up( half prev, half cur, uint c );
half __ovld __conv intel_sub_group_shuffle_xor( half x, uint c );
#endif
#if defined(cl_khr_fp64)
double __ovld __conv intel_sub_group_shuffle( double x, uint c );
double __ovld __conv intel_sub_group_shuffle_down( double prev, double cur, uint c );
double __ovld __conv intel_sub_group_shuffle_up( double prev, double cur, uint c );
double __ovld __conv intel_sub_group_shuffle_xor( double x, uint c );
#endif
#endif //cl_intel_subgroups
#if defined(cl_intel_subgroups_short)
short __ovld __conv intel_sub_group_broadcast( short x, uint sub_group_local_id );
short2 __ovld __conv intel_sub_group_broadcast( short2 x, uint sub_group_local_id );
short3 __ovld __conv intel_sub_group_broadcast( short3 x, uint sub_group_local_id );
short4 __ovld __conv intel_sub_group_broadcast( short4 x, uint sub_group_local_id );
short8 __ovld __conv intel_sub_group_broadcast( short8 x, uint sub_group_local_id );
ushort __ovld __conv intel_sub_group_broadcast( ushort x, uint sub_group_local_id );
ushort2 __ovld __conv intel_sub_group_broadcast( ushort2 x, uint sub_group_local_id );
ushort3 __ovld __conv intel_sub_group_broadcast( ushort3 x, uint sub_group_local_id );
ushort4 __ovld __conv intel_sub_group_broadcast( ushort4 x, uint sub_group_local_id );
ushort8 __ovld __conv intel_sub_group_broadcast( ushort8 x, uint sub_group_local_id );
short __ovld __conv intel_sub_group_shuffle( short x, uint c );
short2 __ovld __conv intel_sub_group_shuffle( short2 x, uint c );
short3 __ovld __conv intel_sub_group_shuffle( short3 x, uint c );
short4 __ovld __conv intel_sub_group_shuffle( short4 x, uint c );
short8 __ovld __conv intel_sub_group_shuffle( short8 x, uint c );
short16 __ovld __conv intel_sub_group_shuffle( short16 x, uint c);
ushort __ovld __conv intel_sub_group_shuffle( ushort x, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle( ushort2 x, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle( ushort3 x, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle( ushort4 x, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle( ushort8 x, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle( ushort16 x, uint c );
short __ovld __conv intel_sub_group_shuffle_down( short cur, short next, uint c );
short2 __ovld __conv intel_sub_group_shuffle_down( short2 cur, short2 next, uint c );
short3 __ovld __conv intel_sub_group_shuffle_down( short3 cur, short3 next, uint c );
short4 __ovld __conv intel_sub_group_shuffle_down( short4 cur, short4 next, uint c );
short8 __ovld __conv intel_sub_group_shuffle_down( short8 cur, short8 next, uint c );
short16 __ovld __conv intel_sub_group_shuffle_down( short16 cur, short16 next, uint c );
ushort __ovld __conv intel_sub_group_shuffle_down( ushort cur, ushort next, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle_down( ushort2 cur, ushort2 next, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle_down( ushort3 cur, ushort3 next, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle_down( ushort4 cur, ushort4 next, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle_down( ushort8 cur, ushort8 next, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle_down( ushort16 cur, ushort16 next, uint c );
short __ovld __conv intel_sub_group_shuffle_up( short cur, short next, uint c );
short2 __ovld __conv intel_sub_group_shuffle_up( short2 cur, short2 next, uint c );
short3 __ovld __conv intel_sub_group_shuffle_up( short3 cur, short3 next, uint c );
short4 __ovld __conv intel_sub_group_shuffle_up( short4 cur, short4 next, uint c );
short8 __ovld __conv intel_sub_group_shuffle_up( short8 cur, short8 next, uint c );
short16 __ovld __conv intel_sub_group_shuffle_up( short16 cur, short16 next, uint c );
ushort __ovld __conv intel_sub_group_shuffle_up( ushort cur, ushort next, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle_up( ushort2 cur, ushort2 next, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle_up( ushort3 cur, ushort3 next, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle_up( ushort4 cur, ushort4 next, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle_up( ushort8 cur, ushort8 next, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle_up( ushort16 cur, ushort16 next, uint c );
short __ovld __conv intel_sub_group_shuffle_xor( short x, uint c );
short2 __ovld __conv intel_sub_group_shuffle_xor( short2 x, uint c );
short3 __ovld __conv intel_sub_group_shuffle_xor( short3 x, uint c );
short4 __ovld __conv intel_sub_group_shuffle_xor( short4 x, uint c );
short8 __ovld __conv intel_sub_group_shuffle_xor( short8 x, uint c );
short16 __ovld __conv intel_sub_group_shuffle_xor( short16 x, uint c );
ushort __ovld __conv intel_sub_group_shuffle_xor( ushort x, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle_xor( ushort2 x, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle_xor( ushort3 x, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle_xor( ushort4 x, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle_xor( ushort8 x, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle_xor( ushort16 x, uint c );
short __ovld __conv intel_sub_group_reduce_add( short x );
ushort __ovld __conv intel_sub_group_reduce_add( ushort x );
short __ovld __conv intel_sub_group_reduce_min( short x );
ushort __ovld __conv intel_sub_group_reduce_min( ushort x );
short __ovld __conv intel_sub_group_reduce_max( short x );
ushort __ovld __conv intel_sub_group_reduce_max( ushort x );
short __ovld __conv intel_sub_group_scan_exclusive_add( short x );
ushort __ovld __conv intel_sub_group_scan_exclusive_add( ushort x );
short __ovld __conv intel_sub_group_scan_exclusive_min( short x );
ushort __ovld __conv intel_sub_group_scan_exclusive_min( ushort x );
short __ovld __conv intel_sub_group_scan_exclusive_max( short x );
ushort __ovld __conv intel_sub_group_scan_exclusive_max( ushort x );
short __ovld __conv intel_sub_group_scan_inclusive_add( short x );
ushort __ovld __conv intel_sub_group_scan_inclusive_add( ushort x );
short __ovld __conv intel_sub_group_scan_inclusive_min( short x );
ushort __ovld __conv intel_sub_group_scan_inclusive_min( ushort x );
short __ovld __conv intel_sub_group_scan_inclusive_max( short x );
ushort __ovld __conv intel_sub_group_scan_inclusive_max( ushort x );
uint __ovld __conv intel_sub_group_block_read_ui( read_only image2d_t image, int2 byte_coord );
uint2 __ovld __conv intel_sub_group_block_read_ui2( read_only image2d_t image, int2 byte_coord );
uint4 __ovld __conv intel_sub_group_block_read_ui4( read_only image2d_t image, int2 byte_coord );
uint8 __ovld __conv intel_sub_group_block_read_ui8( read_only image2d_t image, int2 byte_coord );
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read_ui( read_write image2d_t image, int2 byte_coord );
uint2 __ovld __conv intel_sub_group_block_read_ui2( read_write image2d_t image, int2 byte_coord );
uint4 __ovld __conv intel_sub_group_block_read_ui4( read_write image2d_t image, int2 byte_coord );
uint8 __ovld __conv intel_sub_group_block_read_ui8( read_write image2d_t image, int2 byte_coord );
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read_ui( const __global uint* p );
uint2 __ovld __conv intel_sub_group_block_read_ui2( const __global uint* p );
uint4 __ovld __conv intel_sub_group_block_read_ui4( const __global uint* p );
uint8 __ovld __conv intel_sub_group_block_read_ui8( const __global uint* p );
void __ovld __conv intel_sub_group_block_write_ui( read_only image2d_t image, int2 byte_coord, uint data );
void __ovld __conv intel_sub_group_block_write_ui2( read_only image2d_t image, int2 byte_coord, uint2 data );
void __ovld __conv intel_sub_group_block_write_ui4( read_only image2d_t image, int2 byte_coord, uint4 data );
void __ovld __conv intel_sub_group_block_write_ui8( read_only image2d_t image, int2 byte_coord, uint8 data );
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_ui( read_write image2d_t image, int2 byte_coord, uint data );
void __ovld __conv intel_sub_group_block_write_ui2( read_write image2d_t image, int2 byte_coord, uint2 data );
void __ovld __conv intel_sub_group_block_write_ui4( read_write image2d_t image, int2 byte_coord, uint4 data );
void __ovld __conv intel_sub_group_block_write_ui8( read_write image2d_t image, int2 byte_coord, uint8 data );
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_ui( __global uint* p, uint data );
void __ovld __conv intel_sub_group_block_write_ui2( __global uint* p, uint2 data );
void __ovld __conv intel_sub_group_block_write_ui4( __global uint* p, uint4 data );
void __ovld __conv intel_sub_group_block_write_ui8( __global uint* p, uint8 data );
ushort __ovld __conv intel_sub_group_block_read_us( read_only image2d_t image, int2 coord );
ushort2 __ovld __conv intel_sub_group_block_read_us2( read_only image2d_t image, int2 coord );
ushort4 __ovld __conv intel_sub_group_block_read_us4( read_only image2d_t image, int2 coord );
ushort8 __ovld __conv intel_sub_group_block_read_us8( read_only image2d_t image, int2 coord );
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
ushort __ovld __conv intel_sub_group_block_read_us(read_write image2d_t image, int2 coord);
ushort2 __ovld __conv intel_sub_group_block_read_us2(read_write image2d_t image, int2 coord);
ushort4 __ovld __conv intel_sub_group_block_read_us4(read_write image2d_t image, int2 coord);
ushort8 __ovld __conv intel_sub_group_block_read_us8(read_write image2d_t image, int2 coord);
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
ushort __ovld __conv intel_sub_group_block_read_us( const __global ushort* p );
ushort2 __ovld __conv intel_sub_group_block_read_us2( const __global ushort* p );
ushort4 __ovld __conv intel_sub_group_block_read_us4( const __global ushort* p );
ushort8 __ovld __conv intel_sub_group_block_read_us8( const __global ushort* p );
void __ovld __conv intel_sub_group_block_write_us(write_only image2d_t image, int2 coord, ushort data);
void __ovld __conv intel_sub_group_block_write_us2(write_only image2d_t image, int2 coord, ushort2 data);
void __ovld __conv intel_sub_group_block_write_us4(write_only image2d_t image, int2 coord, ushort4 data);
void __ovld __conv intel_sub_group_block_write_us8(write_only image2d_t image, int2 coord, ushort8 data);
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_us(read_write image2d_t image, int2 coord, ushort data);
void __ovld __conv intel_sub_group_block_write_us2(read_write image2d_t image, int2 coord, ushort2 data);
void __ovld __conv intel_sub_group_block_write_us4(read_write image2d_t image, int2 coord, ushort4 data);
void __ovld __conv intel_sub_group_block_write_us8(read_write image2d_t image, int2 coord, ushort8 data);
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_us( __global ushort* p, ushort data );
void __ovld __conv intel_sub_group_block_write_us2( __global ushort* p, ushort2 data );
void __ovld __conv intel_sub_group_block_write_us4( __global ushort* p, ushort4 data );
void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, ushort8 data );
#endif // cl_intel_subgroups_short
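These shuffle overloads exchange register values directly between sub-group lanes, with no local-memory staging. A hedged OpenCL C sketch in this header's own dialect (kernel name and data layout are illustrative; assumes a device exposing cl_intel_subgroups), reversing the values held by each sub-group:

#pragma OPENCL EXTENSION cl_intel_subgroups : enable

/* Each work-item reads its own element, then fetches the value held by
   the mirror lane of its sub-group via intel_sub_group_shuffle. */
__kernel void subgroup_reverse(__global float *data)
{
    float x   = data[get_global_id(0)];
    uint lane = get_sub_group_local_id();
    uint n    = get_sub_group_size();
    data[get_global_id(0)] = intel_sub_group_shuffle(x, n - 1u - lane);
}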
#ifdef cl_amd_media_ops
uint __ovld amd_bitalign(uint a, uint b, uint c);
uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c);

c_headers/pmmintrin.h

@@ -115,8 +115,8 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
}
/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
/// vector of [4 x float] to float values stored in a 128-bit vector of
/// \brief Moves and duplicates odd-indexed values from a 128-bit vector
/// of [4 x float] to float values stored in a 128-bit vector of
/// [4 x float].
///
/// \headerfile <x86intrin.h>
@@ -137,7 +137,7 @@ _mm_movehdup_ps(__m128 __a)
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}
/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
/// \brief Duplicates even-indexed values from a 128-bit vector of
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>

c_headers/smmintrin.h

@@ -648,7 +648,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// input vectors are used as an input for dot product; otherwise that input
/// is treated as zero. Bits [1:0] determine which elements of the result
/// will receive a copy of the final dot product, with bit [0] corresponding
/// to the lowest element and bit [3] corresponding to the highest element of
/// to the lowest element and bit [1] corresponding to the highest element of
/// each [2 x double] vector. If a bit is set, the dot product is returned in
/// the corresponding element; otherwise that element is set to zero.
#define _mm_dp_pd(X, Y, M) __extension__ ({\
@@ -866,8 +866,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
/// Bits[3:0]: If any of these bits are set, the corresponding result
/// element is cleared.
/// \returns A 128-bit vector of [4 x float] containing the copied single-
/// precision floating point elements from the operands.
/// \returns A 128-bit vector of [4 x float] containing the copied
/// single-precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and

c_headers/stdbool.h

@@ -32,12 +32,15 @@
#define true 1
#define false 0
#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
/* Define _Bool, bool, false, true as a GNU extension. */
/* Define _Bool as a GNU extension. */
#define _Bool bool
#if __cplusplus < 201103L
/* For C++98, define bool, false, true as a GNU extension. */
#define bool bool
#define false false
#define true true
#endif
#endif
#define __bool_true_false_are_defined 1

c_headers/vaesintrin.h (new file)

@@ -0,0 +1,98 @@
/*===------------------ vaesintrin.h - VAES intrinsics ---------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <vaesintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VAESINTRIN_H
#define __VAESINTRIN_H
/* Default attributes for YMM forms. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes")))
/* Default attributes for ZMM forms. */
#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes")))
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesenc_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesenc256((__v4di) __A,
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenc_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesdec_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesdec256((__v4di) __A,
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdec_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesenclast_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A,
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A,
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
(__v8di) __B);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_F
#endif
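VAES widens one AES round to every 128-bit lane of a YMM/ZMM register. A hedged sketch (hypothetical test, assumes VAES hardware and e.g. -mvaes -mavx2): both lanes hold the same state and round key, so both lanes of the result must match:

/* Hypothetical check: _mm256_aesenc_epi128 applies one AES encryption
   round independently per 128-bit lane, so identical lanes stay identical. */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    __m256i state = _mm256_set1_epi64x(0x0123456789abcdefLL);
    __m256i rkey  = _mm256_set1_epi64x(0x0f0e0d0c0b0a0908LL);
    unsigned char out[32];
    _mm256_storeu_si256((__m256i *)out, _mm256_aesenc_epi128(state, rkey));
    puts(memcmp(out, out + 16, 16) == 0 ? "lanes match" : "lanes differ");
    return 0;
}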

c_headers/vpclmulqdqintrin.h (new file)

@@ -0,0 +1,42 @@
/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VPCLMULQDQINTRIN_H
#define __VPCLMULQDQINTRIN_H
#define _mm256_clmulepi64_epi128(A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), \
(char)(I)); })
#define _mm512_clmulepi64_epi128(A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
(char)(I)); })
#endif // __VPCLMULQDQINTRIN_H
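As with PCLMULQDQ, bits 0 and 4 of the immediate select the low or high quadword of each operand, but the carry-less multiply now runs per 128-bit lane. A hedged sketch (hypothetical test, assumes VPCLMULQDQ hardware and e.g. -mvpclmulqdq -mavx2):

/* Hypothetical carry-less multiply check: in GF(2)[x],
   0b11 * 0b101 = 0b1111 (XOR of shifted partial products, no carries). */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_set1_epi64x(0x3);  /* low qword of each lane */
    __m256i b = _mm256_set1_epi64x(0x5);
    long long r[4];
    _mm256_storeu_si256((__m256i *)r,
                        _mm256_clmulepi64_epi128(a, b, 0x00));
    printf("0x%llx\n", (unsigned long long)r[0]);  /* 0xf */
    return 0;
}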

c_headers/xmmintrin.h

@@ -2035,9 +2035,11 @@ _mm_storer_ps(float *__p, __m128 __a)
_mm_store_ps(__p, __a);
}
#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0
#ifndef _MSC_VER
@@ -2068,7 +2070,8 @@ _mm_storer_ps(float *__p, __m128 __a)
/// be generated. \n
/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
/// be generated.
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), \
((sel) >> 2) & 1, (sel) & 0x3))
#endif
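The rewritten macro decodes both fields of the hint constant: bit 2 selects a write (ET) prefetch and bits [1:0] give the temporal locality, so _MM_HINT_ET0 (7) expands to __builtin_prefetch(p, 1, 3) where the old macro always passed 0 for the read/write argument. A hedged usage sketch:

/* Hypothetical usage: the same macro now emits read prefetches for the
   T hints and write prefetches for the new ET hints. */
#include <xmmintrin.h>

void warm_cache(const void *p) {
    _mm_prefetch((const char *)p, _MM_HINT_T0);   /* read,  locality 3 */
    _mm_prefetch((const char *)p, _MM_HINT_ET0);  /* write, locality 3 */
}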
/// \brief Stores a 64-bit integer in the specified aligned memory location. To