update C headers to clang 5.0.0
parent
ba3d21ca67
commit
cd58b40011
|
@ -410,10 +410,12 @@ install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlbwintrin.h" DESTINATION "${
|
|||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlcdintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vldqintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vpopcntdqintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avxintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/bmi2intrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/bmiintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/clflushoptintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/clzerointrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cpuid.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/algorithm" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/complex" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
|
||||
|
@ -432,6 +434,7 @@ install(FILES "${CMAKE_SOURCE_DIR}/c_headers/intrin.h" DESTINATION "${C_HEADERS_
|
|||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/inttypes.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/iso646.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/limits.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/lwpintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/lzcntintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mm3dnow.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mm_malloc.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
|
|
|
@ -2887,87 +2887,79 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
|
|||
|
||||
/* vec_ctf */
|
||||
|
||||
static __inline__ vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
|
||||
return __builtin_altivec_vcfsx(__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a,
|
||||
int __b) {
|
||||
return __builtin_altivec_vcfux((vector int)__a, __b);
|
||||
}
|
||||
|
||||
#ifdef __VSX__
|
||||
static __inline__ vector double __ATTRS_o_ai
|
||||
vec_ctf(vector unsigned long long __a, int __b) {
|
||||
vector double __ret = __builtin_convertvector(__a, vector double);
|
||||
__ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
|
||||
return __ret;
|
||||
}
|
||||
|
||||
static __inline__ vector double __ATTRS_o_ai
|
||||
vec_ctf(vector signed long long __a, int __b) {
|
||||
vector double __ret = __builtin_convertvector(__a, vector double);
|
||||
__ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
|
||||
return __ret;
|
||||
}
|
||||
#define vec_ctf(__a, __b) \
|
||||
_Generic((__a), vector int \
|
||||
: (vector float)__builtin_altivec_vcfsx((__a), (__b)), \
|
||||
vector unsigned int \
|
||||
: (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)), \
|
||||
vector unsigned long long \
|
||||
: (__builtin_convertvector((vector unsigned long long)(__a), \
|
||||
vector double) * \
|
||||
(vector double)(vector unsigned long long)((0x3ffULL - (__b)) \
|
||||
<< 52)), \
|
||||
vector signed long long \
|
||||
: (__builtin_convertvector((vector signed long long)(__a), \
|
||||
vector double) * \
|
||||
(vector double)(vector unsigned long long)((0x3ffULL - (__b)) \
|
||||
<< 52)))
|
||||
#else
|
||||
#define vec_ctf(__a, __b) \
|
||||
_Generic((__a), vector int \
|
||||
: (vector float)__builtin_altivec_vcfsx((__a), (__b)), \
|
||||
vector unsigned int \
|
||||
: (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)))
|
||||
#endif
|
||||
|
||||
/* vec_vcfsx */
|
||||
|
||||
static __inline__ vector float __attribute__((__always_inline__))
|
||||
vec_vcfsx(vector int __a, int __b) {
|
||||
return __builtin_altivec_vcfsx(__a, __b);
|
||||
}
|
||||
#define vec_vcfux __builtin_altivec_vcfux
|
||||
|
||||
/* vec_vcfux */
|
||||
|
||||
static __inline__ vector float __attribute__((__always_inline__))
|
||||
vec_vcfux(vector unsigned int __a, int __b) {
|
||||
return __builtin_altivec_vcfux((vector int)__a, __b);
|
||||
}
|
||||
#define vec_vcfsx(__a, __b) __builtin_altivec_vcfsx((vector int)(__a), (__b))
|
||||
|
||||
/* vec_cts */
|
||||
|
||||
static __inline__ vector int __ATTRS_o_ai vec_cts(vector float __a, int __b) {
|
||||
return __builtin_altivec_vctsxs(__a, __b);
|
||||
}
|
||||
|
||||
#ifdef __VSX__
|
||||
static __inline__ vector signed long long __ATTRS_o_ai
|
||||
vec_cts(vector double __a, int __b) {
|
||||
__a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
|
||||
return __builtin_convertvector(__a, vector signed long long);
|
||||
}
|
||||
#define vec_cts(__a, __b) \
|
||||
_Generic((__a), vector float \
|
||||
: __builtin_altivec_vctsxs((__a), (__b)), vector double \
|
||||
: __extension__({ \
|
||||
vector double __ret = \
|
||||
(__a) * \
|
||||
(vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
|
||||
<< 52); \
|
||||
__builtin_convertvector(__ret, vector signed long long); \
|
||||
}))
|
||||
#else
|
||||
#define vec_cts __builtin_altivec_vctsxs
|
||||
#endif
|
||||
|
||||
/* vec_vctsxs */
|
||||
|
||||
static __inline__ vector int __attribute__((__always_inline__))
|
||||
vec_vctsxs(vector float __a, int __b) {
|
||||
return __builtin_altivec_vctsxs(__a, __b);
|
||||
}
|
||||
#define vec_vctsxs __builtin_altivec_vctsxs
|
||||
|
||||
/* vec_ctu */
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai vec_ctu(vector float __a,
|
||||
int __b) {
|
||||
return __builtin_altivec_vctuxs(__a, __b);
|
||||
}
|
||||
|
||||
#ifdef __VSX__
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_ctu(vector double __a, int __b) {
|
||||
__a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
|
||||
return __builtin_convertvector(__a, vector unsigned long long);
|
||||
}
|
||||
#define vec_ctu(__a, __b) \
|
||||
_Generic((__a), vector float \
|
||||
: __builtin_altivec_vctuxs((__a), (__b)), vector double \
|
||||
: __extension__({ \
|
||||
vector double __ret = \
|
||||
(__a) * \
|
||||
(vector double)(vector unsigned long long)((0x3ffULL + __b) \
|
||||
<< 52); \
|
||||
__builtin_convertvector(__ret, vector unsigned long long); \
|
||||
}))
|
||||
#else
|
||||
#define vec_ctu __builtin_altivec_vctuxs
|
||||
#endif
|
||||
|
||||
/* vec_vctuxs */
|
||||
|
||||
static __inline__ vector unsigned int __attribute__((__always_inline__))
|
||||
vec_vctuxs(vector float __a, int __b) {
|
||||
return __builtin_altivec_vctuxs(__a, __b);
|
||||
}
|
||||
#define vec_vctuxs __builtin_altivec_vctuxs
|
||||
|
||||
/* vec_signed */
|
||||
|
||||
|
@ -8045,45 +8037,51 @@ static __inline__ vector float __ATTRS_o_ai vec_vsel(vector float __a,
|
|||
|
||||
/* vec_sl */
|
||||
|
||||
static __inline__ vector signed char __ATTRS_o_ai
|
||||
vec_sl(vector signed char __a, vector unsigned char __b) {
|
||||
return __a << (vector signed char)__b;
|
||||
}
|
||||
|
||||
// vec_sl does modulo arithmetic on __b first, so __b is allowed to be more
|
||||
// than the length of __a.
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_sl(vector unsigned char __a, vector unsigned char __b) {
|
||||
return __a << __b;
|
||||
return __a << (__b %
|
||||
(vector unsigned char)(sizeof(unsigned char) * __CHAR_BIT__));
|
||||
}
|
||||
|
||||
static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
|
||||
vector unsigned short __b) {
|
||||
return __a << (vector short)__b;
|
||||
static __inline__ vector signed char __ATTRS_o_ai
|
||||
vec_sl(vector signed char __a, vector unsigned char __b) {
|
||||
return (vector signed char)vec_sl((vector unsigned char)__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_sl(vector unsigned short __a, vector unsigned short __b) {
|
||||
return __a << __b;
|
||||
return __a << (__b % (vector unsigned short)(sizeof(unsigned short) *
|
||||
__CHAR_BIT__));
|
||||
}
|
||||
|
||||
static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
|
||||
vector unsigned int __b) {
|
||||
return __a << (vector int)__b;
|
||||
static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
|
||||
vector unsigned short __b) {
|
||||
return (vector short)vec_sl((vector unsigned short)__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
vec_sl(vector unsigned int __a, vector unsigned int __b) {
|
||||
return __a << __b;
|
||||
return __a << (__b %
|
||||
(vector unsigned int)(sizeof(unsigned int) * __CHAR_BIT__));
|
||||
}
|
||||
|
||||
static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
|
||||
vector unsigned int __b) {
|
||||
return (vector int)vec_sl((vector unsigned int)__a, __b);
|
||||
}
|
||||
|
||||
#ifdef __POWER8_VECTOR__
|
||||
static __inline__ vector signed long long __ATTRS_o_ai
|
||||
vec_sl(vector signed long long __a, vector unsigned long long __b) {
|
||||
return __a << (vector long long)__b;
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_sl(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
return __a << __b;
|
||||
return __a << (__b % (vector unsigned long long)(sizeof(unsigned long long) *
|
||||
__CHAR_BIT__));
|
||||
}
|
||||
|
||||
static __inline__ vector long long __ATTRS_o_ai
|
||||
vec_sl(vector long long __a, vector unsigned long long __b) {
|
||||
return (vector long long)vec_sl((vector unsigned long long)__a, __b);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -12150,6 +12148,11 @@ static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned char __a,
|
|||
|
||||
#endif
|
||||
|
||||
#ifdef __VSX__
|
||||
#define vec_xxpermdi __builtin_vsx_xxpermdi
|
||||
#define vec_xxsldwi __builtin_vsx_xxsldwi
|
||||
#endif
|
||||
|
||||
/* vec_xor */
|
||||
|
||||
#define __builtin_altivec_vxor vec_xor
|
||||
|
|
|
@ -224,6 +224,36 @@ __rbitl(unsigned long __t) {
|
|||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* 9.3 16-bit multiplications
|
||||
*/
|
||||
#if __ARM_FEATURE_DSP
|
||||
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
|
||||
__smulbb(int32_t __a, int32_t __b) {
|
||||
return __builtin_arm_smulbb(__a, __b);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
|
||||
__smulbt(int32_t __a, int32_t __b) {
|
||||
return __builtin_arm_smulbt(__a, __b);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
|
||||
__smultb(int32_t __a, int32_t __b) {
|
||||
return __builtin_arm_smultb(__a, __b);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
|
||||
__smultt(int32_t __a, int32_t __b) {
|
||||
return __builtin_arm_smultt(__a, __b);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
|
||||
__smulwb(int32_t __a, int32_t __b) {
|
||||
return __builtin_arm_smulwb(__a, __b);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
|
||||
__smulwt(int32_t __a, int32_t __b) {
|
||||
return __builtin_arm_smulwt(__a, __b);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* 9.4 Saturating intrinsics
|
||||
*
|
||||
|
@ -231,13 +261,13 @@ __rbitl(unsigned long __t) {
|
|||
* intrinsics are implemented and the flag is enabled.
|
||||
*/
|
||||
/* 9.4.1 Width-specified saturation intrinsics */
|
||||
#if __ARM_32BIT_STATE
|
||||
#if __ARM_FEATURE_SAT
|
||||
#define __ssat(x, y) __builtin_arm_ssat(x, y)
|
||||
#define __usat(x, y) __builtin_arm_usat(x, y)
|
||||
#endif
|
||||
|
||||
/* 9.4.2 Saturating addition and subtraction intrinsics */
|
||||
#if __ARM_32BIT_STATE
|
||||
#if __ARM_FEATURE_DSP
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__qadd(int32_t __t, int32_t __v) {
|
||||
return __builtin_arm_qadd(__t, __v);
|
||||
|
@ -254,6 +284,290 @@ __qdbl(int32_t __t) {
|
|||
}
|
||||
#endif
|
||||
|
||||
/* 9.4.3 Accumultating multiplications */
|
||||
#if __ARM_FEATURE_DSP
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
|
||||
return __builtin_arm_smlabb(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
|
||||
return __builtin_arm_smlabt(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
|
||||
return __builtin_arm_smlatb(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
|
||||
return __builtin_arm_smlatt(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
|
||||
return __builtin_arm_smlawb(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
|
||||
return __builtin_arm_smlawt(__a, __b, __c);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* 9.5.4 Parallel 16-bit saturation */
|
||||
#if __ARM_FEATURE_SIMD32
|
||||
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
|
||||
#define __usat16(x, y) __builtin_arm_usat16(x, y)
|
||||
#endif
|
||||
|
||||
/* 9.5.5 Packing and unpacking */
|
||||
#if __ARM_FEATURE_SIMD32
|
||||
typedef int32_t int8x4_t;
|
||||
typedef int32_t int16x2_t;
|
||||
typedef uint32_t uint8x4_t;
|
||||
typedef uint32_t uint16x2_t;
|
||||
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__sxtab16(int16x2_t __a, int8x4_t __b) {
|
||||
return __builtin_arm_sxtab16(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__sxtb16(int8x4_t __a) {
|
||||
return __builtin_arm_sxtb16(__a);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uxtab16(int16x2_t __a, int8x4_t __b) {
|
||||
return __builtin_arm_uxtab16(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uxtb16(int8x4_t __a) {
|
||||
return __builtin_arm_uxtb16(__a);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* 9.5.6 Parallel selection */
|
||||
#if __ARM_FEATURE_SIMD32
|
||||
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__sel(uint8x4_t __a, uint8x4_t __b) {
|
||||
return __builtin_arm_sel(__a, __b);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* 9.5.7 Parallel 8-bit addition and subtraction */
|
||||
#if __ARM_FEATURE_SIMD32
|
||||
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__qadd8(int8x4_t __a, int8x4_t __b) {
|
||||
return __builtin_arm_qadd8(__a, __b);
|
||||
}
|
||||
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__qsub8(int8x4_t __a, int8x4_t __b) {
|
||||
return __builtin_arm_qsub8(__a, __b);
|
||||
}
|
||||
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__sadd8(int8x4_t __a, int8x4_t __b) {
|
||||
return __builtin_arm_sadd8(__a, __b);
|
||||
}
|
||||
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__shadd8(int8x4_t __a, int8x4_t __b) {
|
||||
return __builtin_arm_shadd8(__a, __b);
|
||||
}
|
||||
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__shsub8(int8x4_t __a, int8x4_t __b) {
|
||||
return __builtin_arm_shsub8(__a, __b);
|
||||
}
|
||||
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__ssub8(int8x4_t __a, int8x4_t __b) {
|
||||
return __builtin_arm_ssub8(__a, __b);
|
||||
}
|
||||
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uadd8(uint8x4_t __a, uint8x4_t __b) {
|
||||
return __builtin_arm_uadd8(__a, __b);
|
||||
}
|
||||
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
|
||||
return __builtin_arm_uhadd8(__a, __b);
|
||||
}
|
||||
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
|
||||
return __builtin_arm_uhsub8(__a, __b);
|
||||
}
|
||||
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
|
||||
return __builtin_arm_uqadd8(__a, __b);
|
||||
}
|
||||
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
|
||||
return __builtin_arm_uqsub8(__a, __b);
|
||||
}
|
||||
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
|
||||
__usub8(uint8x4_t __a, uint8x4_t __b) {
|
||||
return __builtin_arm_usub8(__a, __b);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* 9.5.8 Sum of 8-bit absolute differences */
|
||||
#if __ARM_FEATURE_SIMD32
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__usad8(uint8x4_t __a, uint8x4_t __b) {
|
||||
return __builtin_arm_usad8(__a, __b);
|
||||
}
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
|
||||
return __builtin_arm_usada8(__a, __b, __c);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* 9.5.9 Parallel 16-bit addition and subtraction */
|
||||
#if __ARM_FEATURE_SIMD32
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__qadd16(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_qadd16(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__qasx(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_qasx(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__qsax(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_qsax(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__qsub16(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_qsub16(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__sadd16(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_sadd16(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__sasx(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_sasx(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__shadd16(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_shadd16(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__shasx(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_shasx(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__shsax(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_shsax(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__shsub16(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_shsub16(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__ssax(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_ssax(__a, __b);
|
||||
}
|
||||
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__ssub16(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_ssub16(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uadd16(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_uadd16(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uasx(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_uasx(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_uhadd16(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uhasx(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_uhasx(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uhsax(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_uhsax(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_uhsub16(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_uqadd16(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uqasx(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_uqasx(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uqsax(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_uqsax(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_uqsub16(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__usax(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_usax(__a, __b);
|
||||
}
|
||||
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
|
||||
__usub16(uint16x2_t __a, uint16x2_t __b) {
|
||||
return __builtin_arm_usub16(__a, __b);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* 9.5.10 Parallel 16-bit multiplications */
|
||||
#if __ARM_FEATURE_SIMD32
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
|
||||
return __builtin_arm_smlad(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
|
||||
return __builtin_arm_smladx(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
|
||||
return __builtin_arm_smlald(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
|
||||
return __builtin_arm_smlaldx(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
|
||||
return __builtin_arm_smlsd(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
|
||||
return __builtin_arm_smlsdx(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
|
||||
return __builtin_arm_smlsld(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
|
||||
return __builtin_arm_smlsldx(__a, __b, __c);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smuad(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_smuad(__a, __b);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smuadx(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_smuadx(__a, __b);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smusd(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_smusd(__a, __b);
|
||||
}
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__smusdx(int16x2_t __a, int16x2_t __b) {
|
||||
return __builtin_arm_smusdx(__a, __b);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* 9.7 CRC32 intrinsics */
|
||||
#if __ARM_FEATURE_CRC32
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
|
|
|
@ -832,7 +832,8 @@ _mm256_xor_si256(__m256i __a, __m256i __b)
|
|||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_stream_load_si256(__m256i const *__V)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
|
||||
typedef __v4di __v4di_aligned __attribute__((aligned(32)));
|
||||
return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
|
|
|
@ -504,115 +504,91 @@ _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_packs_epi32 (__m512i __A, __m512i __B)
|
||||
_mm512_packs_epi32(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__v32hi) _mm512_setzero_hi(),
|
||||
(__mmask32) -1);
|
||||
return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
|
||||
_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__v32hi) _mm512_setzero_hi(),
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
|
||||
(__v32hi)_mm512_packs_epi32(__A, __B),
|
||||
(__v32hi)_mm512_setzero_hi());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
|
||||
__m512i __B)
|
||||
_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__v32hi) __W,
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
|
||||
(__v32hi)_mm512_packs_epi32(__A, __B),
|
||||
(__v32hi)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_packs_epi16 (__m512i __A, __m512i __B)
|
||||
_mm512_packs_epi16(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
(__v64qi) _mm512_setzero_qi(),
|
||||
(__mmask64) -1);
|
||||
return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
|
||||
__m512i __B)
|
||||
_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
(__v64qi) __W,
|
||||
(__mmask64) __M);
|
||||
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||
(__v64qi)_mm512_packs_epi16(__A, __B),
|
||||
(__v64qi)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
|
||||
_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
(__v64qi) _mm512_setzero_qi(),
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||
(__v64qi)_mm512_packs_epi16(__A, __B),
|
||||
(__v64qi)_mm512_setzero_qi());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_packus_epi32 (__m512i __A, __m512i __B)
|
||||
_mm512_packus_epi32(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__v32hi) _mm512_setzero_hi(),
|
||||
(__mmask32) -1);
|
||||
return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
|
||||
_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__v32hi) _mm512_setzero_hi(),
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
|
||||
(__v32hi)_mm512_packus_epi32(__A, __B),
|
||||
(__v32hi)_mm512_setzero_hi());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
|
||||
__m512i __B)
|
||||
_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
|
||||
(__v16si) __B,
|
||||
(__v32hi) __W,
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
|
||||
(__v32hi)_mm512_packus_epi32(__A, __B),
|
||||
(__v32hi)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_packus_epi16 (__m512i __A, __m512i __B)
|
||||
_mm512_packus_epi16(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
(__v64qi) _mm512_setzero_qi(),
|
||||
(__mmask64) -1);
|
||||
return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
|
||||
__m512i __B)
|
||||
_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
(__v64qi) __W,
|
||||
(__mmask64) __M);
|
||||
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||
(__v64qi)_mm512_packus_epi16(__A, __B),
|
||||
(__v64qi)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
|
||||
_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
|
||||
(__v32hi) __B,
|
||||
(__v64qi) _mm512_setzero_qi(),
|
||||
(__mmask64) __M);
|
||||
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||
(__v64qi)_mm512_packus_epi16(__A, __B),
|
||||
(__v64qi)_mm512_setzero_qi());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
|
|
|
@ -995,51 +995,50 @@ _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
|
|||
}
|
||||
|
||||
static __inline__ __m512 __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcast_f32x8 (__m256 __A)
|
||||
_mm512_broadcast_f32x8(__m256 __A)
|
||||
{
|
||||
return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
|
||||
_mm512_undefined_ps(),
|
||||
(__mmask16) -1);
|
||||
return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A,
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
0, 1, 2, 3, 4, 5, 6, 7);
|
||||
}
|
||||
|
||||
static __inline__ __m512 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
|
||||
_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A)
|
||||
{
|
||||
return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
|
||||
(__v16sf)__O,
|
||||
__M);
|
||||
return (__m512)__builtin_ia32_selectps_512((__mmask8)__M,
|
||||
(__v16sf)_mm512_broadcast_f32x8(__A),
|
||||
(__v16sf)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m512 __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
|
||||
_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
|
||||
{
|
||||
return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
|
||||
(__v16sf)_mm512_setzero_ps (),
|
||||
__M);
|
||||
return (__m512)__builtin_ia32_selectps_512((__mmask8)__M,
|
||||
(__v16sf)_mm512_broadcast_f32x8(__A),
|
||||
(__v16sf)_mm512_setzero_ps());
|
||||
}
|
||||
|
||||
static __inline__ __m512d __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcast_f64x2 (__m128d __A)
|
||||
_mm512_broadcast_f64x2(__m128d __A)
|
||||
{
|
||||
return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
|
||||
(__v8df)_mm512_undefined_pd(),
|
||||
(__mmask8) -1);
|
||||
return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
|
||||
0, 1, 0, 1, 0, 1, 0, 1);
|
||||
}
|
||||
|
||||
static __inline__ __m512d __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
|
||||
_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A)
|
||||
{
|
||||
return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
|
||||
(__v8df)
|
||||
__O, __M);
|
||||
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
|
||||
(__v8df)_mm512_broadcast_f64x2(__A),
|
||||
(__v8df)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m512d __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
|
||||
_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
|
||||
{
|
||||
return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
|
||||
(__v8df)_mm512_setzero_ps (),
|
||||
__M);
|
||||
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
|
||||
(__v8df)_mm512_broadcast_f64x2(__A),
|
||||
(__v8df)_mm512_setzero_pd());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
|
@ -1067,52 +1066,50 @@ _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcast_i32x8 (__m256i __A)
|
||||
_mm512_broadcast_i32x8(__m256i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
|
||||
(__v16si)_mm512_setzero_si512(),
|
||||
(__mmask16) -1);
|
||||
return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A,
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
0, 1, 2, 3, 4, 5, 6, 7);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
|
||||
_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
|
||||
(__v16si)__O,
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M,
|
||||
(__v16si)_mm512_broadcast_i32x8(__A),
|
||||
(__v16si)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
|
||||
_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
|
||||
(__v16si)
|
||||
_mm512_setzero_si512 (),
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M,
|
||||
(__v16si)_mm512_broadcast_i32x8(__A),
|
||||
(__v16si)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcast_i64x2 (__m128i __A)
|
||||
_mm512_broadcast_i64x2(__m128i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
|
||||
(__v8di)_mm512_setzero_si512(),
|
||||
(__mmask8) -1);
|
||||
return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
|
||||
0, 1, 0, 1, 0, 1, 0, 1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
|
||||
_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
|
||||
(__v8di)
|
||||
__O, __M);
|
||||
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
|
||||
(__v8di)_mm512_broadcast_i64x2(__A),
|
||||
(__v8di)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
|
||||
_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
|
||||
(__v8di)_mm512_setzero_si512 (),
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
|
||||
(__v8di)_mm512_broadcast_i64x2(__A),
|
||||
(__v8di)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
#define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \
|
||||
|
|
|
@ -528,6 +528,116 @@ _mm512_mask2int(__mmask16 __a)
|
|||
return (int)__a;
|
||||
}
|
||||
|
||||
/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a
|
||||
/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
|
||||
/// contain the value of the source vector. The upper 384 bits are set
|
||||
/// to zero.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic has no corresponding instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [2 x double].
|
||||
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
|
||||
/// contain the value of the parameter. The upper 384 bits are set to zero.
|
||||
static __inline __m512d __DEFAULT_FN_ATTRS
|
||||
_mm512_zextpd128_pd512(__m128d __a)
|
||||
{
|
||||
return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
|
||||
}
|
||||
|
||||
/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a
|
||||
/// 256-bit floating-point vector of [4 x double]. The lower 256 bits
|
||||
/// contain the value of the source vector. The upper 256 bits are set
|
||||
/// to zero.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic has no corresponding instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 256-bit vector of [4 x double].
|
||||
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
|
||||
/// contain the value of the parameter. The upper 256 bits are set to zero.
|
||||
static __inline __m512d __DEFAULT_FN_ATTRS
|
||||
_mm512_zextpd256_pd512(__m256d __a)
|
||||
{
|
||||
return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
|
||||
}
|
||||
|
||||
/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a
|
||||
/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
|
||||
/// the value of the source vector. The upper 384 bits are set to zero.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic has no corresponding instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [4 x float].
|
||||
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
|
||||
/// contain the value of the parameter. The upper 384 bits are set to zero.
|
||||
static __inline __m512 __DEFAULT_FN_ATTRS
|
||||
_mm512_zextps128_ps512(__m128 __a)
|
||||
{
|
||||
return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
|
||||
}
|
||||
|
||||
/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a
|
||||
/// 256-bit floating-point vector of [8 x float]. The lower 256 bits contain
|
||||
/// the value of the source vector. The upper 256 bits are set to zero.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic has no corresponding instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 256-bit vector of [8 x float].
|
||||
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
|
||||
/// contain the value of the parameter. The upper 256 bits are set to zero.
|
||||
static __inline __m512 __DEFAULT_FN_ATTRS
|
||||
_mm512_zextps256_ps512(__m256 __a)
|
||||
{
|
||||
return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
||||
}
|
||||
|
||||
/// \brief Constructs a 512-bit integer vector from a 128-bit integer vector.
|
||||
/// The lower 128 bits contain the value of the source vector. The upper
|
||||
/// 384 bits are set to zero.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic has no corresponding instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit integer vector.
|
||||
/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
|
||||
/// the parameter. The upper 384 bits are set to zero.
|
||||
static __inline __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_zextsi128_si512(__m128i __a)
|
||||
{
|
||||
return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
|
||||
}
|
||||
|
||||
/// \brief Constructs a 512-bit integer vector from a 256-bit integer vector.
|
||||
/// The lower 256 bits contain the value of the source vector. The upper
|
||||
/// 256 bits are set to zero.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic has no corresponding instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 256-bit integer vector.
|
||||
/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
|
||||
/// the parameter. The upper 256 bits are set to zero.
|
||||
static __inline __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_zextsi256_si512(__m256i __a)
|
||||
{
|
||||
return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
|
||||
}
|
||||
|
||||
/* Bitwise operators */
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_and_epi32(__m512i __a, __m512i __b)
|
||||
|
@ -4179,7 +4289,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
|
|||
_mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
|
||||
(__v16si)
|
||||
(__v16si)
|
||||
_mm512_setzero_si512 (),
|
||||
(__mmask16) __U ,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
|
@ -4229,6 +4339,18 @@ _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
|
|||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS
|
||||
_mm512_cvtsd_f64(__m512d __a)
|
||||
{
|
||||
return __a[0];
|
||||
}
|
||||
|
||||
static __inline__ float __DEFAULT_FN_ATTRS
|
||||
_mm512_cvtss_f32(__m512 __a)
|
||||
{
|
||||
return __a[0];
|
||||
}
|
||||
|
||||
/* Unpack and Interleave */
|
||||
|
||||
static __inline __m512d __DEFAULT_FN_ATTRS
|
||||
|
@ -4540,7 +4662,7 @@ _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
|
|||
}
|
||||
|
||||
static __inline __m512d __DEFAULT_FN_ATTRS
|
||||
_mm512_loadu_pd(double const *__p)
|
||||
_mm512_loadu_pd(void const *__p)
|
||||
{
|
||||
struct __loadu_pd {
|
||||
__m512d __v;
|
||||
|
@ -4549,7 +4671,7 @@ _mm512_loadu_pd(double const *__p)
|
|||
}
|
||||
|
||||
static __inline __m512 __DEFAULT_FN_ATTRS
|
||||
_mm512_loadu_ps(float const *__p)
|
||||
_mm512_loadu_ps(void const *__p)
|
||||
{
|
||||
struct __loadu_ps {
|
||||
__m512 __v;
|
||||
|
@ -4558,7 +4680,7 @@ _mm512_loadu_ps(float const *__p)
|
|||
}
|
||||
|
||||
static __inline __m512 __DEFAULT_FN_ATTRS
|
||||
_mm512_load_ps(float const *__p)
|
||||
_mm512_load_ps(void const *__p)
|
||||
{
|
||||
return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
|
||||
(__v16sf)
|
||||
|
@ -4584,7 +4706,7 @@ _mm512_maskz_load_ps(__mmask16 __U, void const *__P)
|
|||
}
|
||||
|
||||
static __inline __m512d __DEFAULT_FN_ATTRS
|
||||
_mm512_load_pd(double const *__p)
|
||||
_mm512_load_pd(void const *__p)
|
||||
{
|
||||
return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
|
||||
(__v8df)
|
||||
|
@ -7278,107 +7400,97 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
|
|||
(__mmask8)(U), (int)(R)); })
|
||||
|
||||
static __inline__ __m512 __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcast_f32x4 (__m128 __A)
|
||||
_mm512_broadcast_f32x4(__m128 __A)
|
||||
{
|
||||
return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
|
||||
(__v16sf)
|
||||
_mm512_undefined_ps (),
|
||||
(__mmask16) -1);
|
||||
return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
|
||||
0, 1, 2, 3, 0, 1, 2, 3,
|
||||
0, 1, 2, 3, 0, 1, 2, 3);
|
||||
}
|
||||
|
||||
static __inline__ __m512 __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
|
||||
_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
|
||||
{
|
||||
return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
|
||||
(__v16sf) __O,
|
||||
__M);
|
||||
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
|
||||
(__v16sf)_mm512_broadcast_f32x4(__A),
|
||||
(__v16sf)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m512 __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
|
||||
_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
|
||||
{
|
||||
return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
|
||||
(__v16sf)
|
||||
_mm512_setzero_ps (),
|
||||
__M);
|
||||
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
|
||||
(__v16sf)_mm512_broadcast_f32x4(__A),
|
||||
(__v16sf)_mm512_setzero_ps());
|
||||
}
|
||||
|
||||
static __inline__ __m512d __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcast_f64x4 (__m256d __A)
|
||||
_mm512_broadcast_f64x4(__m256d __A)
|
||||
{
|
||||
return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
|
||||
(__v8df)
|
||||
_mm512_undefined_pd (),
|
||||
(__mmask8) -1);
|
||||
return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
|
||||
0, 1, 2, 3, 0, 1, 2, 3);
|
||||
}
|
||||
|
||||
static __inline__ __m512d __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
|
||||
_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
|
||||
{
|
||||
return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
|
||||
(__v8df) __O,
|
||||
__M);
|
||||
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
|
||||
(__v8df)_mm512_broadcast_f64x4(__A),
|
||||
(__v8df)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m512d __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
|
||||
_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
|
||||
{
|
||||
return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
|
||||
(__v8df)
|
||||
_mm512_setzero_pd (),
|
||||
__M);
|
||||
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
|
||||
(__v8df)_mm512_broadcast_f64x4(__A),
|
||||
(__v8df)_mm512_setzero_pd());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcast_i32x4 (__m128i __A)
|
||||
_mm512_broadcast_i32x4(__m128i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
|
||||
(__v16si)
|
||||
_mm512_undefined_epi32 (),
|
||||
(__mmask16) -1);
|
||||
return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
|
||||
0, 1, 2, 3, 0, 1, 2, 3,
|
||||
0, 1, 2, 3, 0, 1, 2, 3);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
|
||||
_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
|
||||
(__v16si) __O,
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
|
||||
(__v16si)_mm512_broadcast_i32x4(__A),
|
||||
(__v16si)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
|
||||
_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
|
||||
(__v16si)
|
||||
_mm512_setzero_si512 (),
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
|
||||
(__v16si)_mm512_broadcast_i32x4(__A),
|
||||
(__v16si)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcast_i64x4 (__m256i __A)
|
||||
_mm512_broadcast_i64x4(__m256i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
|
||||
(__v8di)
|
||||
_mm512_undefined_epi32 (),
|
||||
(__mmask8) -1);
|
||||
return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
|
||||
0, 1, 2, 3, 0, 1, 2, 3);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
|
||||
_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
|
||||
(__v8di) __O,
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
|
||||
(__v8di)_mm512_broadcast_i64x4(__A),
|
||||
(__v8di)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
|
||||
_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
|
||||
(__v8di)
|
||||
_mm512_setzero_si512 (),
|
||||
__M);
|
||||
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
|
||||
(__v8di)_mm512_broadcast_i64x4(__A),
|
||||
(__v8di)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512d __DEFAULT_FN_ATTRS
|
||||
|
@ -7860,12 +7972,12 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
|
|||
3 + ((imm) & 0x3) * 4); })
|
||||
|
||||
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
|
||||
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
|
||||
(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
|
||||
(__v4si)__W); })
|
||||
(__v4si)(W)); })
|
||||
|
||||
#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
|
||||
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
|
||||
(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
|
||||
(__v4si)_mm_setzero_si128()); })
|
||||
|
||||
|
@ -7878,12 +7990,12 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
|
|||
((imm) & 1) ? 7 : 3); })
|
||||
|
||||
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
|
||||
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
|
||||
(__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
|
||||
(__v4di)__W); })
|
||||
(__v4di)(W)); })
|
||||
|
||||
#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
|
||||
(__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
|
||||
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
|
||||
(__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
|
||||
(__v4di)_mm256_setzero_si256()); })
|
||||
|
||||
|
@ -8159,11 +8271,11 @@ _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
|
|||
(__v8di)(__m512i)(index), (__mmask8)-1, \
|
||||
(int)(scale)); })
|
||||
|
||||
#define _mm512_mask_i64gather_ps( __v1_old, __mask, __index,\
|
||||
__addr, __scale) __extension__({\
|
||||
__builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\
|
||||
__addr,(__v8di) __index, __mask, __scale);\
|
||||
})
|
||||
#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__({\
|
||||
(__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
|
||||
(float const *)(addr), \
|
||||
(__v8di)(__m512i)(index), \
|
||||
(__mmask8)(mask), (int)(scale)); })
|
||||
|
||||
#define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\
|
||||
(__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_ps(), \
|
||||
|
@ -8858,6 +8970,8 @@ _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
|
|||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
|
||||
__m512i __Y)
|
||||
|
@ -8868,6 +8982,8 @@ _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
|
|||
__M);
|
||||
}
|
||||
|
||||
#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32
|
||||
|
||||
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
|
||||
_mm512_kand (__mmask16 __A, __mmask16 __B)
|
||||
{
|
||||
|
@ -8919,25 +9035,29 @@ _mm512_kxor (__mmask16 __A, __mmask16 __B)
|
|||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm512_stream_si512 (__m512i * __P, __m512i __A)
|
||||
{
|
||||
__builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
|
||||
typedef __v8di __v8di_aligned __attribute__((aligned(64)));
|
||||
__builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_stream_load_si512 (void *__P)
|
||||
{
|
||||
return __builtin_ia32_movntdqa512 ((__v8di *)__P);
|
||||
typedef __v8di __v8di_aligned __attribute__((aligned(64)));
|
||||
return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm512_stream_pd (double *__P, __m512d __A)
|
||||
{
|
||||
__builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
|
||||
typedef __v8df __v8df_aligned __attribute__((aligned(64)));
|
||||
__builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm512_stream_ps (float *__P, __m512 __A)
|
||||
{
|
||||
__builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
|
||||
typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
|
||||
__builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
|
||||
}
|
||||
|
||||
static __inline__ __m512d __DEFAULT_FN_ATTRS
|
||||
|
@ -9101,39 +9221,39 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
|
|||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
|
||||
{
|
||||
__m128 res = __A;
|
||||
__m128 res = __A;
|
||||
res[0] = (__U & 1) ? __B[0] : __W[0];
|
||||
return res;
|
||||
return res;
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
|
||||
{
|
||||
__m128 res = __A;
|
||||
res[0] = (__U & 1) ? __B[0] : 0;
|
||||
return res;
|
||||
__m128 res = __A;
|
||||
res[0] = (__U & 1) ? __B[0] : 0;
|
||||
return res;
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
|
||||
{
|
||||
__m128d res = __A;
|
||||
__m128d res = __A;
|
||||
res[0] = (__U & 1) ? __B[0] : __W[0];
|
||||
return res;
|
||||
return res;
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
|
||||
{
|
||||
__m128d res = __A;
|
||||
res[0] = (__U & 1) ? __B[0] : 0;
|
||||
return res;
|
||||
__m128d res = __A;
|
||||
res[0] = (__U & 1) ? __B[0] : 0;
|
||||
return res;
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
|
||||
{
|
||||
__builtin_ia32_storess128_mask ((__v16sf *)__W,
|
||||
__builtin_ia32_storess128_mask ((__v16sf *)__W,
|
||||
(__v16sf) _mm512_castps128_ps512(__A),
|
||||
(__mmask16) __U & (__mmask16)1);
|
||||
}
|
||||
|
@ -9141,7 +9261,7 @@ _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
|
|||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
|
||||
{
|
||||
__builtin_ia32_storesd128_mask ((__v8df *)__W,
|
||||
__builtin_ia32_storesd128_mask ((__v8df *)__W,
|
||||
(__v8df) _mm512_castpd128_pd512(__A),
|
||||
(__mmask8) __U & 1);
|
||||
}
|
||||
|
@ -9490,7 +9610,7 @@ _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
|
|||
{
|
||||
return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
|
||||
(__v2df)(__B),
|
||||
(__v4sf)(__W),
|
||||
(__v4sf)(__W),
|
||||
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
|
||||
|
@ -9499,7 +9619,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
|
|||
{
|
||||
return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
|
||||
(__v2df)(__B),
|
||||
(__v4sf)_mm_setzero_ps(),
|
||||
(__v4sf)_mm_setzero_ps(),
|
||||
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
|
||||
|
@ -9564,7 +9684,7 @@ _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
|
|||
return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
|
||||
(__v4sf)(__B),
|
||||
(__v2df)(__W),
|
||||
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
|
||||
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
|
@ -9572,8 +9692,8 @@ _mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
|
|||
{
|
||||
return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
|
||||
(__v4sf)(__B),
|
||||
(__v2df)_mm_setzero_pd(),
|
||||
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
|
||||
(__v2df)_mm_setzero_pd(),
|
||||
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
|
@ -9635,6 +9755,45 @@ _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
|
|||
}
|
||||
#endif
|
||||
|
||||
static __inline __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
|
||||
char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
|
||||
char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
|
||||
char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
|
||||
char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
|
||||
char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
|
||||
char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
|
||||
char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
|
||||
char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
|
||||
char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
|
||||
char __e4, char __e3, char __e2, char __e1, char __e0) {
|
||||
|
||||
return __extension__ (__m512i)(__v64qi)
|
||||
{__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
|
||||
__e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
|
||||
__e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
|
||||
__e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
|
||||
__e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
|
||||
__e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
|
||||
__e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
|
||||
__e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
|
||||
}
|
||||
|
||||
static __inline __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
|
||||
short __e27, short __e26, short __e25, short __e24, short __e23,
|
||||
short __e22, short __e21, short __e20, short __e19, short __e18,
|
||||
short __e17, short __e16, short __e15, short __e14, short __e13,
|
||||
short __e12, short __e11, short __e10, short __e9, short __e8,
|
||||
short __e7, short __e6, short __e5, short __e4, short __e3,
|
||||
short __e2, short __e1, short __e0) {
|
||||
return __extension__ (__m512i)(__v32hi)
|
||||
{__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
|
||||
__e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
|
||||
__e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
|
||||
__e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
|
||||
}
|
||||
|
||||
static __inline __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_set_epi32 (int __A, int __B, int __C, int __D,
|
||||
int __E, int __F, int __G, int __H,
|
||||
|
@ -9780,7 +9939,7 @@ static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
|
|||
}
|
||||
|
||||
// Vec512 - Vector with size 512.
|
||||
// Vec512Neutral - All vector elements set to the identity element.
|
||||
// Vec512Neutral - All vector elements set to the identity element.
|
||||
// Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0}
|
||||
// Operator - Can be one of following: +,*,&,|
|
||||
// Mask - Intrinsic Mask
|
||||
|
@ -9810,19 +9969,19 @@ _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
|
|||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
|
||||
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
|
||||
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
|
||||
&, __M, i, i, q);
|
||||
}
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
|
||||
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M,
|
||||
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M,
|
||||
i, i, q);
|
||||
}
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
|
||||
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
|
||||
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
|
||||
f, d, pd);
|
||||
}
|
||||
|
||||
|
@ -9884,17 +10043,17 @@ _mm512_reduce_add_epi32(__m512i __W) {
|
|||
_mm512_reduce_operator_32bit(__W, +, i, i);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm512_reduce_mul_epi32(__m512i __W) {
|
||||
_mm512_reduce_operator_32bit(__W, *, i, i);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm512_reduce_and_epi32(__m512i __W) {
|
||||
_mm512_reduce_operator_32bit(__W, &, i, i);
|
||||
}
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm512_reduce_or_epi32(__m512i __W) {
|
||||
_mm512_reduce_operator_32bit(__W, |, i, i);
|
||||
}
|
||||
|
@ -9910,7 +10069,7 @@ _mm512_reduce_mul_ps(__m512 __W) {
|
|||
}
|
||||
|
||||
// Vec512 - Vector with size 512.
|
||||
// Vec512Neutral - All vector elements set to the identity element.
|
||||
// Vec512Neutral - All vector elements set to the identity element.
|
||||
// Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0}
|
||||
// Operator - Can be one of following: +,*,&,|
|
||||
// Mask - Intrinsic Mask
|
||||
|
@ -9940,7 +10099,7 @@ _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
|
|||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
|
||||
_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
|
||||
_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
|
||||
i, i, d);
|
||||
}
|
||||
|
||||
|
@ -10003,7 +10162,7 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
|
|||
return Vec512[0]; \
|
||||
})
|
||||
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS
|
||||
_mm512_reduce_max_epi64(__m512i __V) {
|
||||
_mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
|
||||
}
|
||||
|
@ -10013,7 +10172,7 @@ _mm512_reduce_max_epu64(__m512i __V) {
|
|||
_mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
|
||||
}
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS
|
||||
static __inline__ double __DEFAULT_FN_ATTRS
|
||||
_mm512_reduce_max_pd(__m512d __V) {
|
||||
_mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
|
||||
}
|
||||
|
@ -10028,7 +10187,7 @@ _mm512_reduce_min_epu64(__m512i __V) {
|
|||
_mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
|
||||
}
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS
|
||||
static __inline__ double __DEFAULT_FN_ATTRS
|
||||
_mm512_reduce_min_pd(__m512d __V) {
|
||||
_mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
|
||||
}
|
||||
|
|
|
@ -1000,27 +1000,26 @@ _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
|
|||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_broadcast_f64x2 (__m128d __A)
|
||||
_mm256_broadcast_f64x2(__m128d __A)
|
||||
{
|
||||
return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
|
||||
(__v4df)_mm256_undefined_pd(),
|
||||
(__mmask8) -1);
|
||||
return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
|
||||
0, 1, 0, 1);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A)
|
||||
_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
|
||||
{
|
||||
return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
|
||||
(__v4df) __O,
|
||||
__M);
|
||||
return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
|
||||
(__v4df)_mm256_broadcast_f64x2(__A),
|
||||
(__v4df)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
|
||||
{
|
||||
return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
|
||||
(__v4df) _mm256_setzero_ps (),
|
||||
__M);
|
||||
return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
|
||||
(__v4df)_mm256_broadcast_f64x2(__A),
|
||||
(__v4df)_mm256_setzero_pd());
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
|
@ -1072,27 +1071,26 @@ _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_broadcast_i64x2 (__m128i __A)
|
||||
_mm256_broadcast_i64x2(__m128i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
|
||||
(__v4di)_mm256_undefined_si256(),
|
||||
(__mmask8) -1);
|
||||
return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
|
||||
0, 1, 0, 1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A)
|
||||
_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
|
||||
(__v4di) __O,
|
||||
__M);
|
||||
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
|
||||
(__v4di)_mm256_broadcast_i64x2(__A),
|
||||
(__v4di)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
|
||||
(__v4di) _mm256_setzero_si256 (),
|
||||
__M);
|
||||
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
|
||||
(__v4di)_mm256_broadcast_i64x2(__A),
|
||||
(__v4di)_mm256_setzero_si256());
|
||||
}
|
||||
|
||||
#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \
|
||||
|
|
|
@ -7189,52 +7189,49 @@ _mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
|
|||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_broadcast_f32x4 (__m128 __A)
|
||||
_mm256_broadcast_f32x4(__m128 __A)
|
||||
{
|
||||
return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
|
||||
(__v8sf)_mm256_undefined_pd (),
|
||||
(__mmask8) -1);
|
||||
return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
|
||||
0, 1, 2, 3, 0, 1, 2, 3);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A)
|
||||
_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A)
|
||||
{
|
||||
return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
|
||||
(__v8sf) __O,
|
||||
__M);
|
||||
return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
|
||||
(__v8sf)_mm256_broadcast_f32x4(__A),
|
||||
(__v8sf)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
|
||||
{
|
||||
return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
|
||||
(__v8sf) _mm256_setzero_ps (),
|
||||
__M);
|
||||
return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
|
||||
(__v8sf)_mm256_broadcast_f32x4(__A),
|
||||
(__v8sf)_mm256_setzero_ps());
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_broadcast_i32x4 (__m128i __A)
|
||||
_mm256_broadcast_i32x4(__m128i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
|
||||
(__v8si)_mm256_undefined_si256 (),
|
||||
(__mmask8) -1);
|
||||
return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
|
||||
0, 1, 2, 3, 0, 1, 2, 3);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A)
|
||||
_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
|
||||
(__v8si)
|
||||
__O, __M);
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
|
||||
(__v8si)_mm256_broadcast_i32x4(__A),
|
||||
(__v8si)__O);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A)
|
||||
_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
|
||||
__A,
|
||||
(__v8si) _mm256_setzero_si256 (),
|
||||
__M);
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
|
||||
(__v8si)_mm256_broadcast_i32x4(__A),
|
||||
(__v8si)_mm256_setzero_si256());
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics
|
||||
*------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error \
|
||||
"Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512VPOPCNTDQINTRIN_H
|
||||
#define __AVX512VPOPCNTDQINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntd" \
|
||||
"q")))
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
|
||||
return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
|
||||
return (__m512i)__builtin_ia32_selectq_512(
|
||||
(__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
|
||||
return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
|
||||
return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
|
||||
return (__m512i)__builtin_ia32_selectd_512(
|
||||
(__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
|
||||
return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -1458,12 +1458,13 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
/// \brief Computes two dot products in parallel, using the lower and upper
|
||||
/// halves of two [8 x float] vectors as input to the two computations, and
|
||||
/// returning the two dot products in the lower and upper halves of the
|
||||
/// [8 x float] result. The immediate integer operand controls which input
|
||||
/// elements will contribute to the dot product, and where the final results
|
||||
/// are returned. In general, for each dot product, the four corresponding
|
||||
/// elements of the input vectors are multiplied; the first two and second
|
||||
/// two products are summed, then the two sums are added to form the final
|
||||
/// result.
|
||||
/// [8 x float] result.
|
||||
///
|
||||
/// The immediate integer operand controls which input elements will
|
||||
/// contribute to the dot product, and where the final results are returned.
|
||||
/// In general, for each dot product, the four corresponding elements of the
|
||||
/// input vectors are multiplied; the first two and second two products are
|
||||
/// summed, then the two sums are added to form the final result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1497,15 +1498,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
|
||||
/* Vector shuffle */
|
||||
/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
|
||||
/// specified by the immediate value operand. The four selected elements in
|
||||
/// each operand are copied to the destination according to the bits
|
||||
/// specified in the immediate operand. The selected elements from the first
|
||||
/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the
|
||||
/// destination, and the selected elements from the second 256-bit operand
|
||||
/// are copied to bits [127:64] and bits [255:192] of the destination. For
|
||||
/// example, if bits [7:0] of the immediate operand contain a value of 0xFF,
|
||||
/// the 256-bit destination vector would contain the following values: b[7],
|
||||
/// b[7], a[7], a[7], b[3], b[3], a[3], a[3].
|
||||
/// specified by the immediate value operand.
|
||||
///
|
||||
/// The four selected elements in each operand are copied to the destination
|
||||
/// according to the bits specified in the immediate operand. The selected
|
||||
/// elements from the first 256-bit operand are copied to bits [63:0] and
|
||||
/// bits [191:128] of the destination, and the selected elements from the
|
||||
/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
|
||||
/// the destination. For example, if bits [7:0] of the immediate operand
|
||||
/// contain a value of 0xFF, the 256-bit destination vector would contain the
|
||||
/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1557,13 +1559,14 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
12 + (((mask) >> 6) & 0x3)); })
|
||||
|
||||
/// \brief Selects four double-precision values from the 256-bit operands of
|
||||
/// [4 x double], as specified by the immediate value operand. The selected
|
||||
/// elements from the first 256-bit operand are copied to bits [63:0] and
|
||||
/// bits [191:128] in the destination, and the selected elements from the
|
||||
/// second 256-bit operand are copied to bits [127:64] and bits [255:192] in
|
||||
/// the destination. For example, if bits [3:0] of the immediate operand
|
||||
/// contain a value of 0xF, the 256-bit destination vector would contain the
|
||||
/// following values: b[3], a[3], b[1], a[1].
|
||||
/// [4 x double], as specified by the immediate value operand.
|
||||
///
|
||||
/// The selected elements from the first 256-bit operand are copied to bits
|
||||
/// [63:0] and bits [191:128] in the destination, and the selected elements
|
||||
/// from the second 256-bit operand are copied to bits [127:64] and bits
|
||||
/// [255:192] in the destination. For example, if bits [3:0] of the immediate
|
||||
/// operand contain a value of 0xF, the 256-bit destination vector would
|
||||
/// contain the following values: b[3], a[3], b[1], a[1].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1613,9 +1616,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
|
||||
#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
|
||||
#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
|
||||
#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */
|
||||
#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
|
||||
#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
|
||||
#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */
|
||||
#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
|
||||
#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
|
||||
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
|
||||
#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
|
||||
|
@ -1628,10 +1631,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
|
||||
#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
|
||||
#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
|
||||
#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */
|
||||
#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
|
||||
#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
|
||||
#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
|
||||
#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */
|
||||
#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
|
||||
#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
|
||||
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
|
||||
#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
|
||||
|
@ -1641,9 +1644,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
|
||||
/// \brief Compares each of the corresponding double-precision values of two
|
||||
/// 128-bit vectors of [2 x double], using the operation specified by the
|
||||
/// immediate integer operand. Returns a [2 x double] vector consisting of
|
||||
/// two doubles corresponding to the two comparison results: zero if the
|
||||
/// comparison is false, and all 1's if the comparison is true.
|
||||
/// immediate integer operand.
|
||||
///
|
||||
/// Returns a [2 x double] vector consisting of two doubles corresponding to
|
||||
/// the two comparison results: zero if the comparison is false, and all 1's
|
||||
/// if the comparison is true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1660,17 +1665,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
/// \param c
|
||||
/// An immediate integer operand, with bits [4:0] specifying which comparison
|
||||
/// operation to use: \n
|
||||
/// 00h, 08h, 10h, 18h: Equal \n
|
||||
/// 01h, 09h, 11h, 19h: Less than \n
|
||||
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
|
||||
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
|
||||
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
|
||||
/// (swapped operands) \n
|
||||
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 07h, 0Fh, 17h, 1Fh: Ordered
|
||||
/// 0x00 : Equal (ordered, non-signaling)
|
||||
/// 0x01 : Less-than (ordered, signaling)
|
||||
/// 0x02 : Less-than-or-equal (ordered, signaling)
|
||||
/// 0x03 : Unordered (non-signaling)
|
||||
/// 0x04 : Not-equal (unordered, non-signaling)
|
||||
/// 0x05 : Not-less-than (unordered, signaling)
|
||||
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
|
||||
/// 0x07 : Ordered (non-signaling)
|
||||
/// 0x08 : Equal (unordered, non-signaling)
|
||||
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
|
||||
/// 0x0a : Not-greater-than (unordered, signaling)
|
||||
/// 0x0b : False (ordered, non-signaling)
|
||||
/// 0x0c : Not-equal (ordered, non-signaling)
|
||||
/// 0x0d : Greater-than-or-equal (ordered, signaling)
|
||||
/// 0x0e : Greater-than (ordered, signaling)
|
||||
/// 0x0f : True (unordered, non-signaling)
|
||||
/// 0x10 : Equal (ordered, signaling)
|
||||
/// 0x11 : Less-than (ordered, non-signaling)
|
||||
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
|
||||
/// 0x13 : Unordered (signaling)
|
||||
/// 0x14 : Not-equal (unordered, signaling)
|
||||
/// 0x15 : Not-less-than (unordered, non-signaling)
|
||||
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
|
||||
/// 0x17 : Ordered (signaling)
|
||||
/// 0x18 : Equal (unordered, signaling)
|
||||
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
|
||||
/// 0x1a : Not-greater-than (unordered, non-signaling)
|
||||
/// 0x1b : False (ordered, signaling)
|
||||
/// 0x1c : Not-equal (ordered, signaling)
|
||||
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
|
||||
/// 0x1e : Greater-than (ordered, non-signaling)
|
||||
/// 0x1f : True (unordered, signaling)
|
||||
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
|
||||
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
|
||||
(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
|
||||
|
@ -1678,9 +1704,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
|
||||
/// \brief Compares each of the corresponding values of two 128-bit vectors of
|
||||
/// [4 x float], using the operation specified by the immediate integer
|
||||
/// operand. Returns a [4 x float] vector consisting of four floats
|
||||
/// corresponding to the four comparison results: zero if the comparison is
|
||||
/// false, and all 1's if the comparison is true.
|
||||
/// operand.
|
||||
///
|
||||
/// Returns a [4 x float] vector consisting of four floats corresponding to
|
||||
/// the four comparison results: zero if the comparison is false, and all 1's
|
||||
/// if the comparison is true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1697,17 +1725,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
/// \param c
|
||||
/// An immediate integer operand, with bits [4:0] specifying which comparison
|
||||
/// operation to use: \n
|
||||
/// 00h, 08h, 10h, 18h: Equal \n
|
||||
/// 01h, 09h, 11h, 19h: Less than \n
|
||||
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
|
||||
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
|
||||
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
|
||||
/// (swapped operands) \n
|
||||
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 07h, 0Fh, 17h, 1Fh: Ordered
|
||||
/// 0x00 : Equal (ordered, non-signaling)
|
||||
/// 0x01 : Less-than (ordered, signaling)
|
||||
/// 0x02 : Less-than-or-equal (ordered, signaling)
|
||||
/// 0x03 : Unordered (non-signaling)
|
||||
/// 0x04 : Not-equal (unordered, non-signaling)
|
||||
/// 0x05 : Not-less-than (unordered, signaling)
|
||||
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
|
||||
/// 0x07 : Ordered (non-signaling)
|
||||
/// 0x08 : Equal (unordered, non-signaling)
|
||||
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
|
||||
/// 0x0a : Not-greater-than (unordered, signaling)
|
||||
/// 0x0b : False (ordered, non-signaling)
|
||||
/// 0x0c : Not-equal (ordered, non-signaling)
|
||||
/// 0x0d : Greater-than-or-equal (ordered, signaling)
|
||||
/// 0x0e : Greater-than (ordered, signaling)
|
||||
/// 0x0f : True (unordered, non-signaling)
|
||||
/// 0x10 : Equal (ordered, signaling)
|
||||
/// 0x11 : Less-than (ordered, non-signaling)
|
||||
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
|
||||
/// 0x13 : Unordered (signaling)
|
||||
/// 0x14 : Not-equal (unordered, signaling)
|
||||
/// 0x15 : Not-less-than (unordered, non-signaling)
|
||||
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
|
||||
/// 0x17 : Ordered (signaling)
|
||||
/// 0x18 : Equal (unordered, signaling)
|
||||
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
|
||||
/// 0x1a : Not-greater-than (unordered, non-signaling)
|
||||
/// 0x1b : False (ordered, signaling)
|
||||
/// 0x1c : Not-equal (ordered, signaling)
|
||||
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
|
||||
/// 0x1e : Greater-than (ordered, non-signaling)
|
||||
/// 0x1f : True (unordered, signaling)
|
||||
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
|
||||
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
|
||||
(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
|
||||
|
@ -1715,9 +1764,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
|
||||
/// \brief Compares each of the corresponding double-precision values of two
|
||||
/// 256-bit vectors of [4 x double], using the operation specified by the
|
||||
/// immediate integer operand. Returns a [4 x double] vector consisting of
|
||||
/// four doubles corresponding to the four comparison results: zero if the
|
||||
/// comparison is false, and all 1's if the comparison is true.
|
||||
/// immediate integer operand.
|
||||
///
|
||||
/// Returns a [4 x double] vector consisting of four doubles corresponding to
|
||||
/// the four comparison results: zero if the comparison is false, and all 1's
|
||||
/// if the comparison is true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1734,17 +1785,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
/// \param c
|
||||
/// An immediate integer operand, with bits [4:0] specifying which comparison
|
||||
/// operation to use: \n
|
||||
/// 00h, 08h, 10h, 18h: Equal \n
|
||||
/// 01h, 09h, 11h, 19h: Less than \n
|
||||
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
|
||||
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
|
||||
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
|
||||
/// (swapped operands) \n
|
||||
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 07h, 0Fh, 17h, 1Fh: Ordered
|
||||
/// 0x00 : Equal (ordered, non-signaling)
|
||||
/// 0x01 : Less-than (ordered, signaling)
|
||||
/// 0x02 : Less-than-or-equal (ordered, signaling)
|
||||
/// 0x03 : Unordered (non-signaling)
|
||||
/// 0x04 : Not-equal (unordered, non-signaling)
|
||||
/// 0x05 : Not-less-than (unordered, signaling)
|
||||
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
|
||||
/// 0x07 : Ordered (non-signaling)
|
||||
/// 0x08 : Equal (unordered, non-signaling)
|
||||
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
|
||||
/// 0x0a : Not-greater-than (unordered, signaling)
|
||||
/// 0x0b : False (ordered, non-signaling)
|
||||
/// 0x0c : Not-equal (ordered, non-signaling)
|
||||
/// 0x0d : Greater-than-or-equal (ordered, signaling)
|
||||
/// 0x0e : Greater-than (ordered, signaling)
|
||||
/// 0x0f : True (unordered, non-signaling)
|
||||
/// 0x10 : Equal (ordered, signaling)
|
||||
/// 0x11 : Less-than (ordered, non-signaling)
|
||||
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
|
||||
/// 0x13 : Unordered (signaling)
|
||||
/// 0x14 : Not-equal (unordered, signaling)
|
||||
/// 0x15 : Not-less-than (unordered, non-signaling)
|
||||
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
|
||||
/// 0x17 : Ordered (signaling)
|
||||
/// 0x18 : Equal (unordered, signaling)
|
||||
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
|
||||
/// 0x1a : Not-greater-than (unordered, non-signaling)
|
||||
/// 0x1b : False (ordered, signaling)
|
||||
/// 0x1c : Not-equal (ordered, signaling)
|
||||
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
|
||||
/// 0x1e : Greater-than (ordered, non-signaling)
|
||||
/// 0x1f : True (unordered, signaling)
|
||||
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
|
||||
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
|
||||
(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
|
||||
|
@ -1752,9 +1824,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
|
||||
/// \brief Compares each of the corresponding values of two 256-bit vectors of
|
||||
/// [8 x float], using the operation specified by the immediate integer
|
||||
/// operand. Returns a [8 x float] vector consisting of eight floats
|
||||
/// corresponding to the eight comparison results: zero if the comparison is
|
||||
/// false, and all 1's if the comparison is true.
|
||||
/// operand.
|
||||
///
|
||||
/// Returns a [8 x float] vector consisting of eight floats corresponding to
|
||||
/// the eight comparison results: zero if the comparison is false, and all
|
||||
/// 1's if the comparison is true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1771,17 +1845,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
/// \param c
|
||||
/// An immediate integer operand, with bits [4:0] specifying which comparison
|
||||
/// operation to use: \n
|
||||
/// 00h, 08h, 10h, 18h: Equal \n
|
||||
/// 01h, 09h, 11h, 19h: Less than \n
|
||||
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
|
||||
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
|
||||
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
|
||||
/// (swapped operands) \n
|
||||
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 07h, 0Fh, 17h, 1Fh: Ordered
|
||||
/// 0x00 : Equal (ordered, non-signaling)
|
||||
/// 0x01 : Less-than (ordered, signaling)
|
||||
/// 0x02 : Less-than-or-equal (ordered, signaling)
|
||||
/// 0x03 : Unordered (non-signaling)
|
||||
/// 0x04 : Not-equal (unordered, non-signaling)
|
||||
/// 0x05 : Not-less-than (unordered, signaling)
|
||||
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
|
||||
/// 0x07 : Ordered (non-signaling)
|
||||
/// 0x08 : Equal (unordered, non-signaling)
|
||||
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
|
||||
/// 0x0a : Not-greater-than (unordered, signaling)
|
||||
/// 0x0b : False (ordered, non-signaling)
|
||||
/// 0x0c : Not-equal (ordered, non-signaling)
|
||||
/// 0x0d : Greater-than-or-equal (ordered, signaling)
|
||||
/// 0x0e : Greater-than (ordered, signaling)
|
||||
/// 0x0f : True (unordered, non-signaling)
|
||||
/// 0x10 : Equal (ordered, signaling)
|
||||
/// 0x11 : Less-than (ordered, non-signaling)
|
||||
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
|
||||
/// 0x13 : Unordered (signaling)
|
||||
/// 0x14 : Not-equal (unordered, signaling)
|
||||
/// 0x15 : Not-less-than (unordered, non-signaling)
|
||||
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
|
||||
/// 0x17 : Ordered (signaling)
|
||||
/// 0x18 : Equal (unordered, signaling)
|
||||
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
|
||||
/// 0x1a : Not-greater-than (unordered, non-signaling)
|
||||
/// 0x1b : False (ordered, signaling)
|
||||
/// 0x1c : Not-equal (ordered, signaling)
|
||||
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
|
||||
/// 0x1e : Greater-than (ordered, non-signaling)
|
||||
/// 0x1f : True (unordered, signaling)
|
||||
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
|
||||
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
|
||||
(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
|
||||
|
@ -1789,8 +1884,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
|
||||
/// \brief Compares each of the corresponding scalar double-precision values of
|
||||
/// two 128-bit vectors of [2 x double], using the operation specified by the
|
||||
/// immediate integer operand. If the result is true, all 64 bits of the
|
||||
/// destination vector are set; otherwise they are cleared.
|
||||
/// immediate integer operand.
|
||||
///
|
||||
/// If the result is true, all 64 bits of the destination vector are set;
|
||||
/// otherwise they are cleared.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1807,17 +1904,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
/// \param c
|
||||
/// An immediate integer operand, with bits [4:0] specifying which comparison
|
||||
/// operation to use: \n
|
||||
/// 00h, 08h, 10h, 18h: Equal \n
|
||||
/// 01h, 09h, 11h, 19h: Less than \n
|
||||
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
|
||||
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
|
||||
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
|
||||
/// (swapped operands) \n
|
||||
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 07h, 0Fh, 17h, 1Fh: Ordered
|
||||
/// 0x00 : Equal (ordered, non-signaling)
|
||||
/// 0x01 : Less-than (ordered, signaling)
|
||||
/// 0x02 : Less-than-or-equal (ordered, signaling)
|
||||
/// 0x03 : Unordered (non-signaling)
|
||||
/// 0x04 : Not-equal (unordered, non-signaling)
|
||||
/// 0x05 : Not-less-than (unordered, signaling)
|
||||
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
|
||||
/// 0x07 : Ordered (non-signaling)
|
||||
/// 0x08 : Equal (unordered, non-signaling)
|
||||
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
|
||||
/// 0x0a : Not-greater-than (unordered, signaling)
|
||||
/// 0x0b : False (ordered, non-signaling)
|
||||
/// 0x0c : Not-equal (ordered, non-signaling)
|
||||
/// 0x0d : Greater-than-or-equal (ordered, signaling)
|
||||
/// 0x0e : Greater-than (ordered, signaling)
|
||||
/// 0x0f : True (unordered, non-signaling)
|
||||
/// 0x10 : Equal (ordered, signaling)
|
||||
/// 0x11 : Less-than (ordered, non-signaling)
|
||||
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
|
||||
/// 0x13 : Unordered (signaling)
|
||||
/// 0x14 : Not-equal (unordered, signaling)
|
||||
/// 0x15 : Not-less-than (unordered, non-signaling)
|
||||
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
|
||||
/// 0x17 : Ordered (signaling)
|
||||
/// 0x18 : Equal (unordered, signaling)
|
||||
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
|
||||
/// 0x1a : Not-greater-than (unordered, non-signaling)
|
||||
/// 0x1b : False (ordered, signaling)
|
||||
/// 0x1c : Not-equal (ordered, signaling)
|
||||
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
|
||||
/// 0x1e : Greater-than (ordered, non-signaling)
|
||||
/// 0x1f : True (unordered, signaling)
|
||||
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
|
||||
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
|
||||
(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
|
||||
|
@ -1825,8 +1943,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
|
||||
/// \brief Compares each of the corresponding scalar values of two 128-bit
|
||||
/// vectors of [4 x float], using the operation specified by the immediate
|
||||
/// integer operand. If the result is true, all 32 bits of the destination
|
||||
/// vector are set; otherwise they are cleared.
|
||||
/// integer operand.
|
||||
///
|
||||
/// If the result is true, all 32 bits of the destination vector are set;
|
||||
/// otherwise they are cleared.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1843,17 +1963,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
|
|||
/// \param c
|
||||
/// An immediate integer operand, with bits [4:0] specifying which comparison
|
||||
/// operation to use: \n
|
||||
/// 00h, 08h, 10h, 18h: Equal \n
|
||||
/// 01h, 09h, 11h, 19h: Less than \n
|
||||
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
|
||||
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
|
||||
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
|
||||
/// (swapped operands) \n
|
||||
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
|
||||
/// (swapped operands) \n
|
||||
/// 07h, 0Fh, 17h, 1Fh: Ordered
|
||||
/// 0x00 : Equal (ordered, non-signaling)
|
||||
/// 0x01 : Less-than (ordered, signaling)
|
||||
/// 0x02 : Less-than-or-equal (ordered, signaling)
|
||||
/// 0x03 : Unordered (non-signaling)
|
||||
/// 0x04 : Not-equal (unordered, non-signaling)
|
||||
/// 0x05 : Not-less-than (unordered, signaling)
|
||||
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
|
||||
/// 0x07 : Ordered (non-signaling)
|
||||
/// 0x08 : Equal (unordered, non-signaling)
|
||||
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
|
||||
/// 0x0a : Not-greater-than (unordered, signaling)
|
||||
/// 0x0b : False (ordered, non-signaling)
|
||||
/// 0x0c : Not-equal (ordered, non-signaling)
|
||||
/// 0x0d : Greater-than-or-equal (ordered, signaling)
|
||||
/// 0x0e : Greater-than (ordered, signaling)
|
||||
/// 0x0f : True (unordered, non-signaling)
|
||||
/// 0x10 : Equal (ordered, signaling)
|
||||
/// 0x11 : Less-than (ordered, non-signaling)
|
||||
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
|
||||
/// 0x13 : Unordered (signaling)
|
||||
/// 0x14 : Not-equal (unordered, signaling)
|
||||
/// 0x15 : Not-less-than (unordered, non-signaling)
|
||||
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
|
||||
/// 0x17 : Ordered (signaling)
|
||||
/// 0x18 : Equal (unordered, signaling)
|
||||
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
|
||||
/// 0x1a : Not-greater-than (unordered, non-signaling)
|
||||
/// 0x1b : False (ordered, signaling)
|
||||
/// 0x1c : Not-equal (ordered, signaling)
|
||||
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
|
||||
/// 0x1e : Greater-than (ordered, non-signaling)
|
||||
/// 0x1f : True (unordered, signaling)
|
||||
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
|
||||
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
|
||||
(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
|
||||
|
@ -2184,12 +2325,32 @@ _mm256_cvttps_epi32(__m256 __a)
|
|||
return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
|
||||
}
|
||||
|
||||
/// \brief Returns the first element of the input vector of [4 x double].
|
||||
///
|
||||
/// \headerfile <avxintrin.h>
|
||||
///
|
||||
/// This intrinsic is a utility function and does not correspond to a specific
|
||||
/// instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 256-bit vector of [4 x double].
|
||||
/// \returns A 64 bit double containing the first element of the input vector.
|
||||
static __inline double __DEFAULT_FN_ATTRS
|
||||
_mm256_cvtsd_f64(__m256d __a)
|
||||
{
|
||||
return __a[0];
|
||||
}
|
||||
|
||||
/// \brief Returns the first element of the input vector of [8 x i32].
|
||||
///
|
||||
/// \headerfile <avxintrin.h>
|
||||
///
|
||||
/// This intrinsic is a utility function and does not correspond to a specific
|
||||
/// instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 256-bit vector of [8 x i32].
|
||||
/// \returns A 32-bit integer containing the first element of the input vector.
|
||||
static __inline int __DEFAULT_FN_ATTRS
|
||||
_mm256_cvtsi256_si32(__m256i __a)
|
||||
{
|
||||
|
@ -2197,6 +2358,16 @@ _mm256_cvtsi256_si32(__m256i __a)
|
|||
return __b[0];
|
||||
}
|
||||
|
||||
/// \brief Returns the first element of the input vector of [8 x float].
|
||||
///
|
||||
/// \headerfile <avxintrin.h>
|
||||
///
|
||||
/// This intrinsic is a utility function and does not correspond to a specific
|
||||
/// instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 256-bit vector of [8 x float].
|
||||
/// \returns A 32-bit float containing the first element of the input vector.
|
||||
static __inline float __DEFAULT_FN_ATTRS
|
||||
_mm256_cvtss_f32(__m256 __a)
|
||||
{
|
||||
|
@ -2380,7 +2551,9 @@ _mm256_unpacklo_ps(__m256 __a, __m256 __b)
|
|||
/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
|
||||
/// element-by-element comparison of the double-precision element in the
|
||||
/// first source vector and the corresponding element in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of double-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2407,7 +2580,9 @@ _mm_testz_pd(__m128d __a, __m128d __b)
|
|||
/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
|
||||
/// element-by-element comparison of the double-precision element in the
|
||||
/// first source vector and the corresponding element in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of double-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2434,7 +2609,9 @@ _mm_testc_pd(__m128d __a, __m128d __b)
|
|||
/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
|
||||
/// element-by-element comparison of the double-precision element in the
|
||||
/// first source vector and the corresponding element in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of double-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2462,7 +2639,9 @@ _mm_testnzc_pd(__m128d __a, __m128d __b)
|
|||
/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
|
||||
/// element-by-element comparison of the single-precision element in the
|
||||
/// first source vector and the corresponding element in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of single-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2489,7 +2668,9 @@ _mm_testz_ps(__m128 __a, __m128 __b)
|
|||
/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
|
||||
/// element-by-element comparison of the single-precision element in the
|
||||
/// first source vector and the corresponding element in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of single-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2516,7 +2697,9 @@ _mm_testc_ps(__m128 __a, __m128 __b)
|
|||
/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
|
||||
/// element-by-element comparison of the single-precision element in the
|
||||
/// first source vector and the corresponding element in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of single-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2544,7 +2727,9 @@ _mm_testnzc_ps(__m128 __a, __m128 __b)
|
|||
/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
|
||||
/// element-by-element comparison of the double-precision elements in the
|
||||
/// first source vector and the corresponding elements in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of double-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2571,7 +2756,9 @@ _mm256_testz_pd(__m256d __a, __m256d __b)
|
|||
/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
|
||||
/// element-by-element comparison of the double-precision elements in the
|
||||
/// first source vector and the corresponding elements in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of double-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2598,7 +2785,9 @@ _mm256_testc_pd(__m256d __a, __m256d __b)
|
|||
/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
|
||||
/// element-by-element comparison of the double-precision elements in the
|
||||
/// first source vector and the corresponding elements in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of double-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2626,7 +2815,9 @@ _mm256_testnzc_pd(__m256d __a, __m256d __b)
|
|||
/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
|
||||
/// element-by-element comparison of the single-precision element in the
|
||||
/// first source vector and the corresponding element in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of single-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2653,7 +2844,9 @@ _mm256_testz_ps(__m256 __a, __m256 __b)
|
|||
/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
|
||||
/// element-by-element comparison of the single-precision element in the
|
||||
/// first source vector and the corresponding element in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of single-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2680,7 +2873,9 @@ _mm256_testc_ps(__m256 __a, __m256 __b)
|
|||
/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
|
||||
/// element-by-element comparison of the single-precision elements in the
|
||||
/// first source vector and the corresponding elements in the second source
|
||||
/// vector. The EFLAGS register is updated as follows: \n
|
||||
/// vector.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of single-precision elements where the
|
||||
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
|
||||
/// ZF flag is set to 1. \n
|
||||
|
@ -2706,7 +2901,9 @@ _mm256_testnzc_ps(__m256 __a, __m256 __b)
|
|||
}
|
||||
|
||||
/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
|
||||
/// of the two source vectors and update the EFLAGS register as follows: \n
|
||||
/// of the two source vectors.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of bits where both bits are 1, the ZF flag
|
||||
/// is set to 0. Otherwise the ZF flag is set to 1. \n
|
||||
/// If there is at least one pair of bits where the bit from the first source
|
||||
|
@ -2730,7 +2927,9 @@ _mm256_testz_si256(__m256i __a, __m256i __b)
|
|||
}
|
||||
|
||||
/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
|
||||
/// of the two source vectors and update the EFLAGS register as follows: \n
|
||||
/// of the two source vectors.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of bits where both bits are 1, the ZF flag
|
||||
/// is set to 0. Otherwise the ZF flag is set to 1. \n
|
||||
/// If there is at least one pair of bits where the bit from the first source
|
||||
|
@ -2754,7 +2953,9 @@ _mm256_testc_si256(__m256i __a, __m256i __b)
|
|||
}
|
||||
|
||||
/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
|
||||
/// of the two source vectors and update the EFLAGS register as follows: \n
|
||||
/// of the two source vectors.
|
||||
///
|
||||
/// The EFLAGS register is updated as follows: \n
|
||||
/// If there is at least one pair of bits where both bits are 1, the ZF flag
|
||||
/// is set to 0. Otherwise the ZF flag is set to 1. \n
|
||||
/// If there is at least one pair of bits where the bit from the first source
|
||||
|
@ -3389,7 +3590,8 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
|
|||
static __inline void __DEFAULT_FN_ATTRS
|
||||
_mm256_stream_si256(__m256i *__a, __m256i __b)
|
||||
{
|
||||
__builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
|
||||
typedef __v4di __v4di_aligned __attribute__((aligned(32)));
|
||||
__builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
|
||||
}
|
||||
|
||||
/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
|
||||
|
@ -3402,13 +3604,14 @@ _mm256_stream_si256(__m256i *__a, __m256i __b)
|
|||
///
|
||||
/// \param __a
|
||||
/// A pointer to a 32-byte aligned memory location that will receive the
|
||||
/// integer values.
|
||||
/// double-precision floating-point values.
|
||||
/// \param __b
|
||||
/// A 256-bit vector of [4 x double] containing the values to be moved.
|
||||
static __inline void __DEFAULT_FN_ATTRS
|
||||
_mm256_stream_pd(double *__a, __m256d __b)
|
||||
{
|
||||
__builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
|
||||
typedef __v4df __v4df_aligned __attribute__((aligned(32)));
|
||||
__builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
|
||||
}
|
||||
|
||||
/// \brief Moves single-precision floating point values from a 256-bit vector
|
||||
|
@ -3428,7 +3631,8 @@ _mm256_stream_pd(double *__a, __m256d __b)
|
|||
static __inline void __DEFAULT_FN_ATTRS
|
||||
_mm256_stream_ps(float *__p, __m256 __a)
|
||||
{
|
||||
__builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
|
||||
typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
|
||||
__builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
|
||||
}
|
||||
|
||||
/* Create vectors */
|
||||
|
@ -4310,9 +4514,10 @@ _mm256_castsi256_si128(__m256i __a)
|
|||
}
|
||||
|
||||
/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
|
||||
/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
|
||||
/// contain the value of the source vector. The contents of the upper 128
|
||||
/// bits are undefined.
|
||||
/// 128-bit floating-point vector of [2 x double].
|
||||
///
|
||||
/// The lower 128 bits contain the value of the source vector. The contents
|
||||
/// of the upper 128 bits are undefined.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -4330,9 +4535,10 @@ _mm256_castpd128_pd256(__m128d __a)
|
|||
}
|
||||
|
||||
/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
|
||||
/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
|
||||
/// the value of the source vector. The contents of the upper 128 bits are
|
||||
/// undefined.
|
||||
/// 128-bit floating-point vector of [4 x float].
|
||||
///
|
||||
/// The lower 128 bits contain the value of the source vector. The contents
|
||||
/// of the upper 128 bits are undefined.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -4350,6 +4556,7 @@ _mm256_castps128_ps256(__m128 __a)
|
|||
}
|
||||
|
||||
/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
|
||||
///
|
||||
/// The lower 128 bits contain the value of the source vector. The contents
|
||||
/// of the upper 128 bits are undefined.
|
||||
///
|
||||
|
@ -4367,6 +4574,61 @@ _mm256_castsi128_si256(__m128i __a)
|
|||
return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
|
||||
}
|
||||
|
||||
/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
|
||||
/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
|
||||
/// contain the value of the source vector. The upper 128 bits are set
|
||||
/// to zero.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic has no corresponding instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [2 x double].
|
||||
/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
|
||||
/// contain the value of the parameter. The upper 128 bits are set to zero.
|
||||
static __inline __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_zextpd128_pd256(__m128d __a)
|
||||
{
|
||||
return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
|
||||
}
|
||||
|
||||
/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
|
||||
/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
|
||||
/// the value of the source vector. The upper 128 bits are set to zero.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic has no corresponding instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [4 x float].
|
||||
/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
|
||||
/// contain the value of the parameter. The upper 128 bits are set to zero.
|
||||
static __inline __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_zextps128_ps256(__m128 __a)
|
||||
{
|
||||
return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
|
||||
}
|
||||
|
||||
/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
|
||||
/// The lower 128 bits contain the value of the source vector. The upper
|
||||
/// 128 bits are set to zero.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic has no corresponding instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit integer vector.
|
||||
/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
|
||||
/// the parameter. The upper 128 bits are set to zero.
|
||||
static __inline __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_zextsi128_si256(__m128i __a)
|
||||
{
|
||||
return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
|
||||
}
|
||||
|
||||
/*
|
||||
Vector insert.
|
||||
We use macros rather than inlines because we only want to accept
|
||||
|
@ -4375,8 +4637,10 @@ _mm256_castsi128_si256(__m128i __a)
|
|||
/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
|
||||
/// a 256-bit vector of [8 x float] given in the first parameter, and then
|
||||
/// replacing either the upper or the lower 128 bits with the contents of a
|
||||
/// 128-bit vector of [4 x float] in the second parameter. The immediate
|
||||
/// integer parameter determines between the upper or the lower 128 bits.
|
||||
/// 128-bit vector of [4 x float] in the second parameter.
|
||||
///
|
||||
/// The immediate integer parameter selects whether the upper or the lower
|
||||
/// 128 bits are replaced.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -4420,8 +4684,10 @@ _mm256_castsi128_si256(__m128i __a)
|
|||
/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
|
||||
/// a 256-bit vector of [4 x double] given in the first parameter, and then
|
||||
/// replacing either the upper or the lower 128 bits with the contents of a
|
||||
/// 128-bit vector of [2 x double] in the second parameter. The immediate
|
||||
/// integer parameter determines between the upper or the lower 128 bits.
|
||||
/// 128-bit vector of [2 x double] in the second parameter.
|
||||
///
|
||||
/// The immediate integer parameter selects whether the upper or the lower
|
||||
/// 128 bits are replaced.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -4461,8 +4727,10 @@ _mm256_castsi128_si256(__m128i __a)
|
|||
/// \brief Constructs a new 256-bit integer vector by first duplicating a
|
||||
/// 256-bit integer vector given in the first parameter, and then replacing
|
||||
/// either the upper or the lower 128 bits with the contents of a 128-bit
|
||||
/// integer vector in the second parameter. The immediate integer parameter
|
||||
/// determines between the upper or the lower 128 bits.
|
||||
/// integer vector in the second parameter.
|
||||
///
|
||||
/// The immediate integer parameter selects whether the upper or the lower
|
||||
/// 128 bits are replaced.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
|
|
@ -28,107 +28,17 @@
|
|||
#ifndef __BMIINTRIN_H
|
||||
#define __BMIINTRIN_H
|
||||
|
||||
/// \brief Counts the number of trailing zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned short _tzcnt_u16(unsigned short a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
|
||||
/// \returns An unsigned 16-bit integer containing the number of trailing zero
|
||||
/// bits in the operand.
|
||||
#define _tzcnt_u16(a) (__tzcnt_u16((a)))
|
||||
|
||||
/// \brief Performs a bitwise AND of the second operand with the one's
|
||||
/// complement of the first operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned int _andn_u32(unsigned int a, unsigned int b);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned integer containing one of the operands.
|
||||
/// \param b
|
||||
/// An unsigned integer containing one of the operands.
|
||||
/// \returns An unsigned integer containing the bitwise AND of the second
|
||||
/// operand with the one's complement of the first operand.
|
||||
#define _andn_u32(a, b) (__andn_u32((a), (b)))
|
||||
|
||||
/* _bextr_u32 != __bextr_u32 */
|
||||
/// \brief Clears all bits in the source except for the least significant bit
|
||||
/// containing a value of 1 and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned int _blsi_u32(unsigned int a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned integer whose bits are to be cleared.
|
||||
/// \returns An unsigned integer containing the result of clearing the bits from
|
||||
/// the source operand.
|
||||
#define _blsi_u32(a) (__blsi_u32((a)))
|
||||
|
||||
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
|
||||
/// including the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned int _blsmsk_u32(unsigned int a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned integer used to create the mask.
|
||||
/// \returns An unsigned integer containing the newly created mask.
|
||||
#define _blsmsk_u32(a) (__blsmsk_u32((a)))
|
||||
|
||||
/// \brief Clears the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned int _blsr_u32(unsigned int a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned integer containing the operand to be cleared.
|
||||
/// \returns An unsigned integer containing the result of clearing the source
|
||||
/// operand.
|
||||
#define _blsr_u32(a) (__blsr_u32((a)))
|
||||
|
||||
/// \brief Counts the number of trailing zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned int _tzcnt_u32(unsigned int a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
|
||||
/// \returns An unsigned 32-bit integer containing the number of trailing zero
|
||||
/// bits in the operand.
|
||||
#define _tzcnt_u32(a) (__tzcnt_u32((a)))
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
|
@ -238,7 +148,7 @@ __blsi_u32(unsigned int __X)
|
|||
}
|
||||
|
||||
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
|
||||
/// including the least siginificant bit that is set to 1 in the source
|
||||
/// including the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
@ -254,7 +164,7 @@ __blsmsk_u32(unsigned int __X)
|
|||
return __X ^ (__X - 1);
|
||||
}
|
||||
|
||||
/// \brief Clears the least siginificant bit that is set to 1 in the source
|
||||
/// \brief Clears the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
@ -305,91 +215,15 @@ _mm_tzcnt_32(unsigned int __X)
|
|||
|
||||
#ifdef __x86_64__
|
||||
|
||||
/// \brief Performs a bitwise AND of the second operand with the one's
|
||||
/// complement of the first operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 64-bit integer containing one of the operands.
|
||||
/// \param b
|
||||
/// An unsigned 64-bit integer containing one of the operands.
|
||||
/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
|
||||
/// operand with the one's complement of the first operand.
|
||||
#define _andn_u64(a, b) (__andn_u64((a), (b)))
|
||||
|
||||
/* _bextr_u64 != __bextr_u64 */
|
||||
/// \brief Clears all bits in the source except for the least significant bit
|
||||
/// containing a value of 1 and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _blsi_u64(unsigned long long a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 64-bit integer whose bits are to be cleared.
|
||||
/// \returns An unsigned 64-bit integer containing the result of clearing the
|
||||
/// bits from the source operand.
|
||||
#define _blsi_u64(a) (__blsi_u64((a)))
|
||||
|
||||
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
|
||||
/// including the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _blsmsk_u64(unsigned long long a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 64-bit integer used to create the mask.
|
||||
/// \returns An unsigned 64-bit integer containing the newly created mask.
|
||||
#define _blsmsk_u64(a) (__blsmsk_u64((a)))
|
||||
|
||||
/// \brief Clears the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _blsr_u64(unsigned long long a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 64-bit integer containing the operand to be cleared.
|
||||
/// \returns An unsigned 64-bit integer containing the result of clearing the
|
||||
/// source operand.
|
||||
#define _blsr_u64(a) (__blsr_u64((a)))
|
||||
|
||||
/// \brief Counts the number of trailing zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _tzcnt_u64(unsigned long long a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
|
||||
/// \returns An unsigned 64-bit integer containing the number of trailing zero
|
||||
/// bits in the operand.
|
||||
#define _tzcnt_u64(a) (__tzcnt_u64((a)))
|
||||
|
||||
/// \brief Performs a bitwise AND of the second operand with the one's
|
||||
|
@ -475,7 +309,7 @@ __blsi_u64(unsigned long long __X)
|
|||
}
|
||||
|
||||
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
|
||||
/// including the least siginificant bit that is set to 1 in the source
|
||||
/// including the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
@ -484,14 +318,14 @@ __blsi_u64(unsigned long long __X)
|
|||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer used to create the mask.
|
||||
/// \returns A unsigned 64-bit integer containing the newly created mask.
|
||||
/// \returns An unsigned 64-bit integer containing the newly created mask.
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blsmsk_u64(unsigned long long __X)
|
||||
{
|
||||
return __X ^ (__X - 1);
|
||||
}
|
||||
|
||||
/// \brief Clears the least siginificant bit that is set to 1 in the source
|
||||
/// \brief Clears the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
/*===----------------------- clzerointrin.h - CLZERO ----------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __X86INTRIN_H
|
||||
#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef _CLZEROINTRIN_H
|
||||
#define _CLZEROINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("clzero")))
|
||||
|
||||
/// \brief Loads the cache line address and zero's out the cacheline
|
||||
///
|
||||
/// \headerfile <clzerointrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> CLZERO </c> instruction.
|
||||
///
|
||||
/// \param __line
|
||||
/// A pointer to a cacheline which needs to be zeroed out.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_clzero (void * __line)
|
||||
{
|
||||
__builtin_ia32_clzero ((void *)__line);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* _CLZEROINTRIN_H */
|
|
@ -79,7 +79,7 @@
|
|||
#define signature_VORTEX_edx 0x36387865
|
||||
#define signature_VORTEX_ecx 0x436f5320
|
||||
|
||||
/* Features in %ecx for level 1 */
|
||||
/* Features in %ecx for leaf 1 */
|
||||
#define bit_SSE3 0x00000001
|
||||
#define bit_PCLMULQDQ 0x00000002
|
||||
#define bit_PCLMUL bit_PCLMULQDQ /* for gcc compat */
|
||||
|
@ -114,7 +114,7 @@
|
|||
#define bit_F16C 0x20000000
|
||||
#define bit_RDRND 0x40000000
|
||||
|
||||
/* Features in %edx for level 1 */
|
||||
/* Features in %edx for leaf 1 */
|
||||
#define bit_FPU 0x00000001
|
||||
#define bit_VME 0x00000002
|
||||
#define bit_DE 0x00000004
|
||||
|
@ -147,44 +147,95 @@
|
|||
#define bit_TM 0x20000000
|
||||
#define bit_PBE 0x80000000
|
||||
|
||||
/* Features in %ebx for level 7 sub-leaf 0 */
|
||||
/* Features in %ebx for leaf 7 sub-leaf 0 */
|
||||
#define bit_FSGSBASE 0x00000001
|
||||
#define bit_SGX 0x00000004
|
||||
#define bit_BMI 0x00000008
|
||||
#define bit_HLE 0x00000010
|
||||
#define bit_AVX2 0x00000020
|
||||
#define bit_SMEP 0x00000080
|
||||
#define bit_BMI2 0x00000100
|
||||
#define bit_ENH_MOVSB 0x00000200
|
||||
#define bit_RTM 0x00000800
|
||||
#define bit_MPX 0x00004000
|
||||
#define bit_AVX512F 0x00010000
|
||||
#define bit_AVX512DQ 0x00020000
|
||||
#define bit_RDSEED 0x00040000
|
||||
#define bit_ADX 0x00080000
|
||||
#define bit_AVX512IFMA 0x00200000
|
||||
#define bit_CLFLUSHOPT 0x00800000
|
||||
#define bit_CLWB 0x01000000
|
||||
#define bit_AVX512PF 0x04000000
|
||||
#define bit_AVX512ER 0x08000000
#define bit_AVX51SER bit_AVX512ER /* misspelled original name, kept for source compatibility */
|
||||
#define bit_AVX512CD 0x10000000
|
||||
#define bit_SHA 0x20000000
|
||||
#define bit_AVX512BW 0x40000000
|
||||
#define bit_AVX512VL 0x80000000
|
||||
|
||||
/* Features in %ecx for leaf 7 sub-leaf 0 */
|
||||
#define bit_PREFTCHWT1 0x00000001
|
||||
#define bit_AVX512VBMI 0x00000002
|
||||
#define bit_PKU 0x00000004
|
||||
#define bit_OSPKE 0x00000010
|
||||
#define bit_AVX512VPOPCNTDQ 0x00004000
|
||||
#define bit_RDPID 0x00400000
|
||||
|
||||
/* Features in %edx for leaf 7 sub-leaf 0 */
|
||||
#define bit_AVX5124VNNIW 0x00000004
|
||||
#define bit_AVX5124FMAPS 0x00000008
|
||||
|
||||
/* Features in %eax for leaf 13 sub-leaf 1 */
|
||||
#define bit_XSAVEOPT 0x00000001
|
||||
#define bit_XSAVEC 0x00000002
|
||||
#define bit_XSAVES 0x00000008
|
||||
|
||||
/* Features in %ecx for leaf 0x80000001 */
|
||||
#define bit_LAHF_LM 0x00000001
|
||||
#define bit_ABM 0x00000020
|
||||
#define bit_SSE4a 0x00000040
|
||||
#define bit_PRFCHW 0x00000100
|
||||
#define bit_XOP 0x00000800
|
||||
#define bit_LWP 0x00008000
|
||||
#define bit_FMA4 0x00010000
|
||||
#define bit_TBM 0x00200000
|
||||
#define bit_MWAITX 0x20000000
|
||||
|
||||
/* Features in %edx for leaf 0x80000001 */
|
||||
#define bit_MMXEXT 0x00400000
|
||||
#define bit_LM 0x20000000
|
||||
#define bit_3DNOWP 0x40000000
|
||||
#define bit_3DNOW 0x80000000
|
||||
|
||||
/* Features in %ebx for leaf 0x80000001 */
|
||||
#define bit_CLZERO 0x00000001
|
||||
|
||||
|
||||
#if __i386__
|
||||
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
|
||||
#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
|
||||
__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
|
||||
: "0"(__level))
|
||||
: "0"(__leaf))
|
||||
|
||||
#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
|
||||
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
|
||||
__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
|
||||
: "0"(__level), "2"(__count))
|
||||
: "0"(__leaf), "2"(__count))
|
||||
#else
|
||||
/* x86-64 uses %rbx as the base register, so preserve it. */
|
||||
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
|
||||
#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
|
||||
__asm(" xchgq %%rbx,%q1\n" \
|
||||
" cpuid\n" \
|
||||
" xchgq %%rbx,%q1" \
|
||||
: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
|
||||
: "0"(__level))
|
||||
: "0"(__leaf))
|
||||
|
||||
#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
|
||||
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
|
||||
__asm(" xchgq %%rbx,%q1\n" \
|
||||
" cpuid\n" \
|
||||
" xchgq %%rbx,%q1" \
|
||||
: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
|
||||
: "0"(__level), "2"(__count))
|
||||
: "0"(__leaf), "2"(__count))
|
||||
#endif
|
||||
|
||||
static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
|
||||
unsigned int *__ebx, unsigned int *__ecx,
|
||||
unsigned int *__edx) {
|
||||
__cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
|
||||
static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig)
|
||||
{
|
||||
unsigned int __eax, __ebx, __ecx, __edx;
|
||||
#if __i386__
|
||||
|
@ -208,8 +259,35 @@ static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
|
|||
return 0;
|
||||
#endif
|
||||
|
||||
__cpuid(__level, __eax, __ebx, __ecx, __edx);
|
||||
__cpuid(__leaf, __eax, __ebx, __ecx, __edx);
|
||||
if (__sig)
|
||||
*__sig = __ebx;
|
||||
return __eax;
|
||||
}
|
||||
|
||||
/* Runs CPUID for __leaf and stores the four result registers through __eax,
 * __ebx, __ecx and __edx.
 *
 * Returns 1 when the query was executed. Returns 0, leaving the outputs
 * untouched, when __get_cpuid_max reports 0 or a highest leaf below __leaf.
 */
static __inline int __get_cpuid (unsigned int __leaf, unsigned int *__eax,
                                 unsigned int *__ebx, unsigned int *__ecx,
                                 unsigned int *__edx)
{
    /* Only the top bit of __leaf is kept, so the limit is looked up with
       either 0 or 0x80000000 depending on which range __leaf falls in. */
    unsigned int __limit = __get_cpuid_max(__leaf & 0x80000000, 0);

    if (__limit != 0 && __leaf <= __limit) {
        __cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx);
        return 1;
    }

    return 0;
}
|
||||
|
||||
/* Runs CPUID for __leaf with sub-leaf __subleaf and stores the four result
 * registers through __eax, __ebx, __ecx and __edx.
 *
 * Returns 1 when the query was executed. Returns 0, leaving the outputs
 * untouched, when __get_cpuid_max reports 0 or a highest leaf below __leaf.
 */
static __inline int __get_cpuid_count (unsigned int __leaf,
                                       unsigned int __subleaf,
                                       unsigned int *__eax, unsigned int *__ebx,
                                       unsigned int *__ecx, unsigned int *__edx)
{
    /* Only the top bit of __leaf is kept, so the limit is looked up with
       either 0 or 0x80000000 depending on which range __leaf falls in. */
    unsigned int __limit = __get_cpuid_max(__leaf & 0x80000000, 0);

    if (__limit != 0 && __leaf <= __limit) {
        __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
        return 1;
    }

    return 0;
}
|
||||
|
|
|
@ -302,7 +302,7 @@ _mm_min_pd(__m128d __a, __m128d __b)
|
|||
return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
|
||||
}
|
||||
|
||||
/// \brief Compares lower 64-bits double-precision values of both operands, and
|
||||
/// \brief Compares lower 64-bit double-precision values of both operands, and
|
||||
/// returns the greater of the pair of values in the lower 64 bits of the
|
||||
/// result. The upper 64 bits of the result are copied from the upper double-
|
||||
/// precision value of the first operand.
|
||||
|
@ -462,8 +462,9 @@ _mm_cmplt_pd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Compares each of the corresponding double-precision values of the
|
||||
/// 128-bit vectors of [2 x double] to determine if the values in the first
|
||||
/// operand are less than or equal to those in the second operand. Each
|
||||
/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// operand are less than or equal to those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -482,8 +483,9 @@ _mm_cmple_pd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Compares each of the corresponding double-precision values of the
|
||||
/// 128-bit vectors of [2 x double] to determine if the values in the first
|
||||
/// operand are greater than those in the second operand. Each comparison
|
||||
/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// operand are greater than those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -502,8 +504,9 @@ _mm_cmpgt_pd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Compares each of the corresponding double-precision values of the
|
||||
/// 128-bit vectors of [2 x double] to determine if the values in the first
|
||||
/// operand are greater than or equal to those in the second operand. Each
|
||||
/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// operand are greater than or equal to those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -522,9 +525,10 @@ _mm_cmpge_pd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Compares each of the corresponding double-precision values of the
|
||||
/// 128-bit vectors of [2 x double] to determine if the values in the first
|
||||
/// operand are ordered with respect to those in the second operand. A pair
|
||||
/// of double-precision values are "ordered" with respect to each other if
|
||||
/// neither value is a NaN. Each comparison yields 0h for false,
|
||||
/// operand are ordered with respect to those in the second operand.
|
||||
///
|
||||
/// A pair of double-precision values are "ordered" with respect to each
|
||||
/// other if neither value is a NaN. Each comparison yields 0h for false,
|
||||
/// FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
@ -544,9 +548,10 @@ _mm_cmpord_pd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Compares each of the corresponding double-precision values of the
|
||||
/// 128-bit vectors of [2 x double] to determine if the values in the first
|
||||
/// operand are unordered with respect to those in the second operand. A pair
|
||||
/// of double-precision values are "unordered" with respect to each other if
|
||||
/// one or both values are NaN. Each comparison yields 0h for false,
|
||||
/// operand are unordered with respect to those in the second operand.
|
||||
///
|
||||
/// A pair of double-precision values are "unordered" with respect to each
|
||||
/// other if one or both values are NaN. Each comparison yields 0h for false,
|
||||
/// FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
@ -567,8 +572,9 @@ _mm_cmpunord_pd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Compares each of the corresponding double-precision values of the
|
||||
/// 128-bit vectors of [2 x double] to determine if the values in the first
|
||||
/// operand are unequal to those in the second operand. Each comparison
|
||||
/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// operand are unequal to those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -587,8 +593,9 @@ _mm_cmpneq_pd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Compares each of the corresponding double-precision values of the
|
||||
/// 128-bit vectors of [2 x double] to determine if the values in the first
|
||||
/// operand are not less than those in the second operand. Each comparison
|
||||
/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// operand are not less than those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -607,8 +614,9 @@ _mm_cmpnlt_pd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Compares each of the corresponding double-precision values of the
|
||||
/// 128-bit vectors of [2 x double] to determine if the values in the first
|
||||
/// operand are not less than or equal to those in the second operand. Each
|
||||
/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// operand are not less than or equal to those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -627,8 +635,9 @@ _mm_cmpnle_pd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Compares each of the corresponding double-precision values of the
|
||||
/// 128-bit vectors of [2 x double] to determine if the values in the first
|
||||
/// operand are not greater than those in the second operand. Each
|
||||
/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// operand are not greater than those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -648,6 +657,7 @@ _mm_cmpngt_pd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares each of the corresponding double-precision values of the
|
||||
/// 128-bit vectors of [2 x double] to determine if the values in the first
|
||||
/// operand are not greater than or equal to those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
@ -666,8 +676,9 @@ _mm_cmpnge_pd(__m128d __a, __m128d __b)
|
|||
}
|
||||
|
||||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] for equality. The
|
||||
/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// the two 128-bit floating-point vectors of [2 x double] for equality.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -690,8 +701,9 @@ _mm_cmpeq_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is less than the corresponding value in
|
||||
/// the second parameter. The comparison yields 0h for false,
|
||||
/// FFFFFFFFFFFFFFFFh for true.
|
||||
/// the second parameter.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -714,8 +726,9 @@ _mm_cmplt_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is less than or equal to the
|
||||
/// corresponding value in the second parameter. The comparison yields 0h for
|
||||
/// false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -738,8 +751,9 @@ _mm_cmple_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is greater than the corresponding value
|
||||
/// in the second parameter. The comparison yields 0h for false,
|
||||
/// FFFFFFFFFFFFFFFFh for true.
|
||||
/// in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -763,8 +777,9 @@ _mm_cmpgt_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is greater than or equal to the
|
||||
/// corresponding value in the second parameter. The comparison yields 0h for
|
||||
/// false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -788,9 +803,11 @@ _mm_cmpge_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is "ordered" with respect to the
|
||||
/// corresponding value in the second parameter. The comparison yields 0h for
|
||||
/// false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are
|
||||
/// "ordered" with respect to each other if neither value is a NaN.
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of
|
||||
/// double-precision values are "ordered" with respect to each other if
|
||||
/// neither value is a NaN.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -813,9 +830,11 @@ _mm_cmpord_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is "unordered" with respect to the
|
||||
/// corresponding value in the second parameter. The comparison yields 0h
|
||||
/// for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values
|
||||
/// are "unordered" with respect to each other if one or both values are NaN.
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of
|
||||
/// double-precision values are "unordered" with respect to each other if one
|
||||
/// or both values are NaN.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -839,8 +858,9 @@ _mm_cmpunord_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is unequal to the corresponding value in
|
||||
/// the second parameter. The comparison yields 0h for false,
|
||||
/// FFFFFFFFFFFFFFFFh for true.
|
||||
/// the second parameter.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -863,8 +883,9 @@ _mm_cmpneq_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is not less than the corresponding
|
||||
/// value in the second parameter. The comparison yields 0h for false,
|
||||
/// FFFFFFFFFFFFFFFFh for true.
|
||||
/// value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -887,8 +908,9 @@ _mm_cmpnlt_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is not less than or equal to the
|
||||
/// corresponding value in the second parameter. The comparison yields 0h
|
||||
/// for false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -911,8 +933,9 @@ _mm_cmpnle_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is not greater than the corresponding
|
||||
/// value in the second parameter. The comparison yields 0h for false,
|
||||
/// FFFFFFFFFFFFFFFFh for true.
|
||||
/// value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -936,8 +959,9 @@ _mm_cmpngt_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is not greater than or equal to the
|
||||
/// corresponding value in the second parameter. The comparison yields 0h
|
||||
/// for false, FFFFFFFFFFFFFFFFh for true.
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -982,7 +1006,9 @@ _mm_comieq_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is less than the corresponding value in
|
||||
/// the second parameter. The comparison yields 0 for false, 1 for true.
|
||||
/// the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1004,8 +1030,9 @@ _mm_comilt_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is less than or equal to the
|
||||
/// corresponding value in the second parameter. The comparison yields 0 for
|
||||
/// false, 1 for true.
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1027,7 +1054,9 @@ _mm_comile_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is greater than the corresponding value
|
||||
/// in the second parameter. The comparison yields 0 for false, 1 for true.
|
||||
/// in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1049,8 +1078,9 @@ _mm_comigt_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is greater than or equal to the
|
||||
/// corresponding value in the second parameter. The comparison yields 0 for
|
||||
/// false, 1 for true.
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1072,7 +1102,9 @@ _mm_comige_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is unequal to the corresponding value in
|
||||
/// the second parameter. The comparison yields 0 for false, 1 for true.
|
||||
/// the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1093,8 +1125,9 @@ _mm_comineq_sd(__m128d __a, __m128d __b)
|
|||
|
||||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] for equality. The
|
||||
/// comparison yields 0 for false, 1 for true. If either of the two lower
|
||||
/// double-precision values is NaN, 1 is returned.
|
||||
/// comparison yields 0 for false, 1 for true.
|
||||
///
|
||||
/// If either of the two lower double-precision values is NaN, 1 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1117,8 +1150,10 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is less than the corresponding value in
|
||||
/// the second parameter. The comparison yields 0 for false, 1 for true. If
|
||||
/// either of the two lower double-precision values is NaN, 1 is returned.
|
||||
/// the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two lower
|
||||
/// double-precision values is NaN, 1 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1141,9 +1176,10 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is less than or equal to the
|
||||
/// corresponding value in the second parameter. The comparison yields 0 for
|
||||
/// false, 1 for true. If either of the two lower double-precision values is
|
||||
/// NaN, 1 is returned.
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two lower
|
||||
/// double-precision values is NaN, 1 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1166,8 +1202,10 @@ _mm_ucomile_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is greater than the corresponding value
|
||||
/// in the second parameter. The comparison yields 0 for false, 1 for true.
|
||||
/// If either of the two lower double-precision values is NaN, 0 is returned.
|
||||
/// in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two lower
|
||||
/// double-precision values is NaN, 0 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1190,9 +1228,10 @@ _mm_ucomigt_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is greater than or equal to the
|
||||
/// corresponding value in the second parameter. The comparison yields 0 for
|
||||
/// false, 1 for true. If either of the two lower double-precision values
|
||||
/// is NaN, 0 is returned.
|
||||
/// corresponding value in the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two
|
||||
/// lower double-precision values is NaN, 0 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1215,8 +1254,10 @@ _mm_ucomige_sd(__m128d __a, __m128d __b)
|
|||
/// \brief Compares the lower double-precision floating-point values in each of
|
||||
/// the two 128-bit floating-point vectors of [2 x double] to determine if
|
||||
/// the value in the first parameter is unequal to the corresponding value in
|
||||
/// the second parameter. The comparison yields 0 for false, 1 for true. If
|
||||
/// either of the two lower double-precision values is NaN, 0 is returned.
|
||||
/// the second parameter.
|
||||
///
|
||||
/// The comparison yields 0 for false, 1 for true. If either of the two lower
|
||||
/// double-precision values is NaN, 0 is returned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1278,8 +1319,9 @@ _mm_cvtps_pd(__m128 __a)
|
|||
|
||||
/// \brief Converts the lower two integer elements of a 128-bit vector of
|
||||
/// [4 x i32] into two double-precision floating-point values, returned in a
|
||||
/// 128-bit vector of [2 x double]. The upper two elements of the input
|
||||
/// vector are unused.
|
||||
/// 128-bit vector of [2 x double].
|
||||
///
|
||||
/// The upper two elements of the input vector are unused.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1287,7 +1329,9 @@ _mm_cvtps_pd(__m128 __a)
|
|||
///
|
||||
/// \param __a
|
||||
/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
|
||||
/// converted to double-precision values. The upper two elements are unused.
|
||||
/// converted to double-precision values.
|
||||
///
|
||||
/// The upper two elements are unused.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the converted values.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepi32_pd(__m128i __a)
|
||||
|
@ -1409,10 +1453,11 @@ _mm_cvtss_sd(__m128d __a, __m128 __b)
|
|||
|
||||
/// \brief Converts the two double-precision floating-point elements of a
|
||||
/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
|
||||
/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the
|
||||
/// result of either conversion is inexact, the result is truncated (rounded
|
||||
/// towards zero) regardless of the current MXCSR setting. The upper 64 bits
|
||||
/// of the result vector are set to zero.
|
||||
/// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
|
||||
///
|
||||
/// If the result of either conversion is inexact, the result is truncated
|
||||
/// (rounded towards zero) regardless of the current MXCSR setting. The upper
|
||||
/// 64 bits of the result vector are set to zero.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1466,9 +1511,10 @@ _mm_cvtpd_pi32(__m128d __a)
|
|||
|
||||
/// \brief Converts the two double-precision floating-point elements of a
|
||||
/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
|
||||
/// returned in a 64-bit vector of [2 x i32]. If the result of either
|
||||
/// conversion is inexact, the result is truncated (rounded towards zero)
|
||||
/// regardless of the current MXCSR setting.
|
||||
/// returned in a 64-bit vector of [2 x i32].
|
||||
///
|
||||
/// If the result of either conversion is inexact, the result is truncated
|
||||
/// (rounded towards zero) regardless of the current MXCSR setting.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1599,6 +1645,17 @@ _mm_loadu_pd(double const *__dp)
|
|||
return ((struct __loadu_pd*)__dp)->__v;
|
||||
}
|
||||
|
||||
/// \brief Loads a 64-bit integer value to the low element of a 128-bit integer
|
||||
/// vector and clears the upper element.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A pointer to a 64-bit memory location. The address of the memory
|
||||
/// location does not have to be aligned.
|
||||
/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_loadu_si64(void const *__a)
|
||||
{
|
||||
|
@ -1609,6 +1666,17 @@ _mm_loadu_si64(void const *__a)
|
|||
return (__m128i){__u, 0L};
|
||||
}
|
||||
|
||||
/// \brief Loads a 64-bit double-precision value to the low element of a
|
||||
/// 128-bit vector of [2 x double] and clears the upper element.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
|
||||
///
|
||||
/// \param __dp
|
||||
/// A pointer to a memory location containing a double-precision value.
|
||||
/// The address of the memory location does not have to be aligned.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the loaded value.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_load_sd(double const *__dp)
|
||||
{
|
||||
|
@ -1728,6 +1796,24 @@ _mm_set1_pd(double __w)
|
|||
return (__m128d){ __w, __w };
|
||||
}
|
||||
|
||||
/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
|
||||
/// of the two double-precision floating-point vector elements set to the
|
||||
/// specified double-precision floating-point value.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
|
||||
///
|
||||
/// \param __w
|
||||
/// A double-precision floating-point value used to initialize each vector
|
||||
/// element of the result.
|
||||
/// \returns An initialized 128-bit floating-point vector of [2 x double].
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_set_pd1(double __w)
|
||||
{
|
||||
return _mm_set1_pd(__w);
|
||||
}
|
||||
|
||||
/// \brief Constructs a 128-bit floating-point vector of [2 x double]
|
||||
/// initialized with the specified double-precision floating-point values.
|
||||
///
|
||||
|
@ -1787,7 +1873,7 @@ _mm_setzero_pd(void)
|
|||
/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
|
||||
/// 64 bits are set to the lower 64 bits of the second parameter. The upper
|
||||
/// 64 bits are set to the upper 64 bits of the first parameter.
|
||||
//
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
|
||||
|
@ -1825,12 +1911,38 @@ _mm_store_sd(double *__dp, __m128d __a)
|
|||
((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
|
||||
}
|
||||
|
||||
/// \brief Moves packed double-precision values from a 128-bit vector of
|
||||
/// [2 x double] to a memory location.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
|
||||
///
|
||||
/// \param __dp
|
||||
/// A pointer to an aligned memory location that can store two
|
||||
/// double-precision values.
|
||||
/// \param __a
|
||||
/// A packed 128-bit vector of [2 x double] containing the values to be
|
||||
/// moved.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_store_pd(double *__dp, __m128d __a)
|
||||
{
|
||||
*(__m128d*)__dp = __a;
|
||||
}
|
||||
|
||||
/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
|
||||
/// the upper and lower 64 bits of a memory location.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c>VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
|
||||
///
|
||||
/// \param __dp
|
||||
/// A pointer to a memory location that can store two double-precision
|
||||
/// values.
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
|
||||
/// of the values in \a __dp.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_store1_pd(double *__dp, __m128d __a)
|
||||
{
|
||||
|
@ -1940,8 +2052,9 @@ _mm_storel_pd(double *__dp, __m128d __a)
|
|||
|
||||
/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
|
||||
/// saving the lower 8 bits of each sum in the corresponding element of a
|
||||
/// 128-bit result vector of [16 x i8]. The integer elements of both
|
||||
/// parameters can be either signed or unsigned.
|
||||
/// 128-bit result vector of [16 x i8].
|
||||
///
|
||||
/// The integer elements of both parameters can be either signed or unsigned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1961,8 +2074,9 @@ _mm_add_epi8(__m128i __a, __m128i __b)
|
|||
|
||||
/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
|
||||
/// saving the lower 16 bits of each sum in the corresponding element of a
|
||||
/// 128-bit result vector of [8 x i16]. The integer elements of both
|
||||
/// parameters can be either signed or unsigned.
|
||||
/// 128-bit result vector of [8 x i16].
|
||||
///
|
||||
/// The integer elements of both parameters can be either signed or unsigned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1982,8 +2096,9 @@ _mm_add_epi16(__m128i __a, __m128i __b)
|
|||
|
||||
/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
|
||||
/// saving the lower 32 bits of each sum in the corresponding element of a
|
||||
/// 128-bit result vector of [4 x i32]. The integer elements of both
|
||||
/// parameters can be either signed or unsigned.
|
||||
/// 128-bit result vector of [4 x i32].
|
||||
///
|
||||
/// The integer elements of both parameters can be either signed or unsigned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -2021,8 +2136,9 @@ _mm_add_si64(__m64 __a, __m64 __b)
|
|||
|
||||
/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
|
||||
/// saving the lower 64 bits of each sum in the corresponding element of a
|
||||
/// 128-bit result vector of [2 x i64]. The integer elements of both
|
||||
/// parameters can be either signed or unsigned.
|
||||
/// 128-bit result vector of [2 x i64].
|
||||
///
|
||||
/// The integer elements of both parameters can be either signed or unsigned.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -2168,10 +2284,12 @@ _mm_avg_epu16(__m128i __a, __m128i __b)
|
|||
/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
|
||||
/// vectors, producing eight intermediate 32-bit signed integer products, and
|
||||
/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
|
||||
/// [4 x i32] vector. For example, bits [15:0] of both parameters are
|
||||
/// multiplied producing a 32-bit product, bits [31:16] of both parameters
|
||||
/// are multiplied producing a 32-bit product, and the sum of those two
|
||||
/// products becomes bits [31:0] of the result.
|
||||
/// [4 x i32] vector.
|
||||
///
|
||||
/// For example, bits [15:0] of both parameters are multiplied producing a
|
||||
/// 32-bit product, bits [31:16] of both parameters are multiplied producing
|
||||
/// a 32-bit product, and the sum of those two products becomes bits [31:0]
|
||||
/// of the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -2369,7 +2487,7 @@ _mm_mul_epu32(__m128i __a, __m128i __b)
|
|||
|
||||
/// \brief Computes the absolute differences of corresponding 8-bit integer
|
||||
/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
|
||||
/// separately sums the second 8 absolute differences. Packss these two
|
||||
/// separately sums the second 8 absolute differences. Packs these two
|
||||
/// unsigned 16-bit integer sums into the upper and lower elements of a
|
||||
/// [2 x i64] vector.
|
||||
///
|
||||
|
@ -3106,8 +3224,9 @@ _mm_cmpgt_epi8(__m128i __a, __m128i __b)
|
|||
|
||||
/// \brief Compares each of the corresponding signed 16-bit values of the
|
||||
/// 128-bit integer vectors to determine if the values in the first operand
|
||||
/// are greater than those in the second operand. Each comparison yields 0h
|
||||
/// for false, FFFFh for true.
|
||||
/// are greater than those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -3126,8 +3245,9 @@ _mm_cmpgt_epi16(__m128i __a, __m128i __b)
|
|||
|
||||
/// \brief Compares each of the corresponding signed 32-bit values of the
|
||||
/// 128-bit integer vectors to determine if the values in the first operand
|
||||
/// are greater than those in the second operand. Each comparison yields 0h
|
||||
/// for false, FFFFFFFFh for true.
|
||||
/// are greater than those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -3146,8 +3266,9 @@ _mm_cmpgt_epi32(__m128i __a, __m128i __b)
|
|||
|
||||
/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
|
||||
/// integer vectors to determine if the values in the first operand are less
|
||||
/// than those in the second operand. Each comparison yields 0h for false,
|
||||
/// FFh for true.
|
||||
/// than those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -3166,8 +3287,9 @@ _mm_cmplt_epi8(__m128i __a, __m128i __b)
|
|||
|
||||
/// \brief Compares each of the corresponding signed 16-bit values of the
|
||||
/// 128-bit integer vectors to determine if the values in the first operand
|
||||
/// are less than those in the second operand. Each comparison yields 0h for
|
||||
/// false, FFFFh for true.
|
||||
/// are less than those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -3186,8 +3308,9 @@ _mm_cmplt_epi16(__m128i __a, __m128i __b)
|
|||
|
||||
/// \brief Compares each of the corresponding signed 32-bit values of the
|
||||
/// 128-bit integer vectors to determine if the values in the first operand
|
||||
/// are less than those in the second operand. Each comparison yields 0h for
|
||||
/// false, FFFFFFFFh for true.
|
||||
/// are less than those in the second operand.
|
||||
///
|
||||
/// Each comparison yields 0h for false, FFFFFFFFh for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -3885,10 +4008,11 @@ _mm_storeu_si128(__m128i *__p, __m128i __b)
|
|||
|
||||
/// \brief Moves bytes selected by the mask from the first operand to the
|
||||
/// specified unaligned memory location. When a mask bit is 1, the
|
||||
/// corresponding byte is written, otherwise it is not written. To minimize
|
||||
/// caching, the date is flagged as non-temporal (unlikely to be used again
|
||||
/// soon). Exception and trap behavior for elements not selected for storage
|
||||
/// to memory are implementation dependent.
|
||||
/// corresponding byte is written, otherwise it is not written.
|
||||
///
|
||||
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
|
||||
/// used again soon). Exception and trap behavior for elements not selected
|
||||
/// for storage to memory are implementation dependent.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -3932,8 +4056,10 @@ _mm_storel_epi64(__m128i *__p, __m128i __a)
|
|||
}
|
||||
|
||||
/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
|
||||
/// aligned memory location. To minimize caching, the data is flagged as
|
||||
/// non-temporal (unlikely to be used again soon).
|
||||
/// aligned memory location.
|
||||
///
|
||||
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
|
||||
/// used again soon).
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -3950,6 +4076,7 @@ _mm_stream_pd(double *__p, __m128d __a)
|
|||
}
|
||||
|
||||
/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
|
||||
///
|
||||
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
|
||||
/// used again soon).
|
||||
///
|
||||
|
@ -3967,8 +4094,9 @@ _mm_stream_si128(__m128i *__p, __m128i __a)
|
|||
__builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
|
||||
}
|
||||
|
||||
/// \brief Stores a 32-bit integer value in the specified memory location. To
|
||||
/// minimize caching, the data is flagged as non-temporal (unlikely to be
|
||||
/// \brief Stores a 32-bit integer value in the specified memory location.
|
||||
///
|
||||
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
|
||||
/// used again soon).
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
@ -3986,8 +4114,9 @@ _mm_stream_si32(int *__p, int __a)
|
|||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
/// \brief Stores a 64-bit integer value in the specified memory location. To
|
||||
/// minimize caching, the data is flagged as non-temporal (unlikely to be
|
||||
/// \brief Stores a 64-bit integer value in the specified memory location.
|
||||
///
|
||||
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
|
||||
/// used again soon).
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
|
@ -4019,7 +4148,7 @@ extern "C" {
|
|||
/// \param __p
|
||||
/// A pointer to the memory location used to identify the cache line to be
|
||||
/// flushed.
|
||||
void _mm_clflush(void const *);
|
||||
void _mm_clflush(void const * __p);
|
||||
|
||||
/// \brief Forces strong memory ordering (serialization) between load
|
||||
/// instructions preceding this instruction and load instructions following
|
||||
|
@ -4141,7 +4270,7 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
|
|||
/// \param __a
|
||||
/// A 128-bit integer vector.
|
||||
/// \param __imm
|
||||
/// An immediate value. Bits [3:0] selects values from \a __a to be assigned
|
||||
/// An immediate value. Bits [2:0] select values from \a __a to be assigned
|
||||
/// to bits [15:0] of the result. \n
|
||||
/// 000: assign values from bits [15:0] of \a __a. \n
|
||||
/// 001: assign values from bits [31:16] of \a __a. \n
|
||||
|
@ -4788,4 +4917,12 @@ void _mm_pause(void);
|
|||
|
||||
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
|
||||
|
||||
#define _MM_DENORMALS_ZERO_ON (0x0040)
|
||||
#define _MM_DENORMALS_ZERO_OFF (0x0000)
|
||||
|
||||
#define _MM_DENORMALS_ZERO_MASK (0x0040)
|
||||
|
||||
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
|
||||
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
|
||||
|
||||
#endif /* __EMMINTRIN_H */
|
||||
|
|
|
@ -72,9 +72,9 @@ _cvtsh_ss(unsigned short __a)
|
|||
/// 011: Truncate \n
|
||||
/// 1XX: Use MXCSR.RC for rounding
|
||||
/// \returns The converted 16-bit half-precision float value.
|
||||
#define _cvtss_sh(a, imm) \
|
||||
((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
|
||||
(imm)))[0]))
|
||||
#define _cvtss_sh(a, imm) __extension__ ({ \
|
||||
(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
|
||||
(imm)))[0]); })
|
||||
|
||||
/// \brief Converts a 128-bit vector containing 32-bit float values into a
|
||||
/// 128-bit vector containing 16-bit half-precision float values.
|
||||
|
@ -99,8 +99,8 @@ _cvtsh_ss(unsigned short __a)
|
|||
/// \returns A 128-bit vector containing converted 16-bit half-precision float
|
||||
/// values. The lower 64 bits are used to store the converted 16-bit
|
||||
/// half-precision floating-point values.
|
||||
#define _mm_cvtps_ph(a, imm) \
|
||||
((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
|
||||
#define _mm_cvtps_ph(a, imm) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
|
||||
|
||||
/// \brief Converts a 128-bit vector containing 16-bit half-precision float
|
||||
/// values into a 128-bit vector containing 32-bit float values.
|
||||
|
|
|
@ -33,6 +33,15 @@
|
|||
*/
|
||||
#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
|
||||
__STDC_HOSTED__ && __has_include_next(<float.h>)
|
||||
|
||||
/* Prior to Apple's 10.7 SDK, float.h SDK header used to apply an extra level
|
||||
* of #include_next<float.h> to keep Metrowerks compilers happy. Avoid this
|
||||
* extra indirection.
|
||||
*/
|
||||
#ifdef __APPLE__
|
||||
#define _FLOAT_H_
|
||||
#endif
|
||||
|
||||
# include_next <float.h>
|
||||
|
||||
/* Undefine anything that we'll be redefining below. */
|
||||
|
|
|
@ -35,14 +35,10 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define _TEXASR_PTR(TM_BUF) \
|
||||
((texasr_t *)((TM_BUF)+0))
|
||||
#define _TEXASRU_PTR(TM_BUF) \
|
||||
((texasru_t *)((TM_BUF)+0))
|
||||
#define _TEXASRL_PTR(TM_BUF) \
|
||||
((texasrl_t *)((TM_BUF)+4))
|
||||
#define _TFIAR_PTR(TM_BUF) \
|
||||
((tfiar_t *)((TM_BUF)+8))
|
||||
#define _TEXASR_PTR(TM_BUF) ((texasr_t *)((char *)(TM_BUF) + 0))
|
||||
#define _TEXASRU_PTR(TM_BUF) ((texasru_t *)((char *)(TM_BUF) + 0))
|
||||
#define _TEXASRL_PTR(TM_BUF) ((texasrl_t *)((char *)(TM_BUF) + 4))
|
||||
#define _TFIAR_PTR(TM_BUF) ((tfiar_t *)((char *)(TM_BUF) + 8))
|
||||
|
||||
typedef char TM_buff_type[16];
|
||||
|
||||
|
@ -178,7 +174,7 @@ extern __inline long
|
|||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_is_conflict(void* const __TM_buff)
|
||||
{
|
||||
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
|
||||
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
|
||||
/* Return TEXASR bits 11 (Self-Induced Conflict) through
|
||||
14 (Translation Invalidation Conflict). */
|
||||
return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;
|
||||
|
|
|
@ -146,6 +146,10 @@ _mm256_cvtph_ps(__m128i __a)
|
|||
#include <avx512cdintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
|
||||
#include <avx512vpopcntdqintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
|
||||
#include <avx512dqintrin.h>
|
||||
#endif
|
||||
|
@ -208,6 +212,15 @@ _rdrand32_step(unsigned int *__p)
|
|||
return __builtin_ia32_rdrand32_step(__p);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
|
||||
_rdrand64_step(unsigned long long *__p)
|
||||
{
|
||||
return __builtin_ia32_rdrand64_step(__p);
|
||||
}
|
||||
#endif
|
||||
#endif /* __RDRND__ */
|
||||
|
||||
/* _bit_scan_forward */
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
_bit_scan_forward(int __A) {
|
||||
|
@ -220,15 +233,6 @@ _bit_scan_reverse(int __A) {
|
|||
return 31 - __builtin_clz(__A);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
|
||||
_rdrand64_step(unsigned long long *__p)
|
||||
{
|
||||
return __builtin_ia32_rdrand64_step(__p);
|
||||
}
|
||||
#endif
|
||||
#endif /* __RDRND__ */
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
|
||||
#ifdef __x86_64__
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
|
||||
|
|
|
@ -69,7 +69,6 @@ static __inline__
|
|||
__int64 __emul(int, int);
|
||||
static __inline__
|
||||
unsigned __int64 __emulu(unsigned int, unsigned int);
|
||||
void __cdecl __fastfail(unsigned int);
|
||||
unsigned int __getcallerseflags(void);
|
||||
static __inline__
|
||||
void __halt(void);
|
||||
|
@ -80,16 +79,12 @@ void __incfsdword(unsigned long);
|
|||
void __incfsword(unsigned long);
|
||||
unsigned long __indword(unsigned short);
|
||||
void __indwordstring(unsigned short, unsigned long *, unsigned long);
|
||||
void __int2c(void);
|
||||
void __invlpg(void *);
|
||||
unsigned short __inword(unsigned short);
|
||||
void __inwordstring(unsigned short, unsigned short *, unsigned long);
|
||||
void __lidt(void *);
|
||||
unsigned __int64 __ll_lshift(unsigned __int64, int);
|
||||
__int64 __ll_rshift(__int64, int);
|
||||
void __llwpcb(void *);
|
||||
unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
|
||||
void __lwpval32(unsigned int, unsigned int, unsigned int);
|
||||
unsigned int __lzcnt(unsigned int);
|
||||
unsigned short __lzcnt16(unsigned short);
|
||||
static __inline__
|
||||
|
@ -128,7 +123,6 @@ unsigned __int64 __readmsr(unsigned long);
|
|||
unsigned __int64 __readpmc(unsigned long);
|
||||
unsigned long __segmentlimit(unsigned long);
|
||||
void __sidt(void *);
|
||||
void *__slwpcb(void);
|
||||
static __inline__
|
||||
void __stosb(unsigned char *, unsigned char, size_t);
|
||||
static __inline__
|
||||
|
@ -142,7 +136,6 @@ void __svm_stgi(void);
|
|||
void __svm_vmload(size_t);
|
||||
void __svm_vmrun(size_t);
|
||||
void __svm_vmsave(size_t);
|
||||
void __ud2(void);
|
||||
unsigned __int64 __ull_rshift(unsigned __int64, int);
|
||||
void __vmx_off(void);
|
||||
void __vmx_vmptrst(unsigned __int64 *);
|
||||
|
@ -176,7 +169,6 @@ void __cdecl _disable(void);
|
|||
void __cdecl _enable(void);
|
||||
long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
|
||||
unsigned char _interlockedbittestandreset(long volatile *, long);
|
||||
static __inline__
|
||||
unsigned char _interlockedbittestandset(long volatile *, long);
|
||||
long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
|
||||
long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
|
||||
|
@ -231,8 +223,6 @@ void __incgsbyte(unsigned long);
|
|||
void __incgsdword(unsigned long);
|
||||
void __incgsqword(unsigned long);
|
||||
void __incgsword(unsigned long);
|
||||
unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
|
||||
void __lwpval64(unsigned __int64, unsigned int, unsigned int);
|
||||
unsigned __int64 __lzcnt64(unsigned __int64);
|
||||
static __inline__
|
||||
void __movsq(unsigned long long *, unsigned long long const *, size_t);
|
||||
|
@ -372,11 +362,6 @@ _bittestandset(long *_BitBase, long _BitPos) {
|
|||
*_BitBase = *_BitBase | (1 << _BitPos);
|
||||
return _Res;
|
||||
}
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
|
||||
long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
|
||||
return (_PrevVal >> _BitPos) & 1;
|
||||
}
|
||||
#if defined(__arm__) || defined(__aarch64__)
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
|
||||
|
@ -872,48 +857,7 @@ _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
|
|||
return _Comparand;
|
||||
}
|
||||
#endif
|
||||
/*----------------------------------------------------------------------------*\
|
||||
|* readfs, readgs
|
||||
|* (Pointers in address space #256 and #257 are relative to the GS and FS
|
||||
|* segment registers, respectively.)
|
||||
\*----------------------------------------------------------------------------*/
|
||||
#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset) \
|
||||
((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
|
||||
(__offset))
|
||||
|
||||
#ifdef __i386__
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
__readfsbyte(unsigned long __offset) {
|
||||
return *__ptr_to_addr_space(257, unsigned char, __offset);
|
||||
}
|
||||
static __inline__ unsigned short __DEFAULT_FN_ATTRS
|
||||
__readfsword(unsigned long __offset) {
|
||||
return *__ptr_to_addr_space(257, unsigned short, __offset);
|
||||
}
|
||||
static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
|
||||
__readfsqword(unsigned long __offset) {
|
||||
return *__ptr_to_addr_space(257, unsigned __int64, __offset);
|
||||
}
|
||||
#endif
|
||||
#ifdef __x86_64__
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
__readgsbyte(unsigned long __offset) {
|
||||
return *__ptr_to_addr_space(256, unsigned char, __offset);
|
||||
}
|
||||
static __inline__ unsigned short __DEFAULT_FN_ATTRS
|
||||
__readgsword(unsigned long __offset) {
|
||||
return *__ptr_to_addr_space(256, unsigned short, __offset);
|
||||
}
|
||||
static __inline__ unsigned long __DEFAULT_FN_ATTRS
|
||||
__readgsdword(unsigned long __offset) {
|
||||
return *__ptr_to_addr_space(256, unsigned long, __offset);
|
||||
}
|
||||
static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
|
||||
__readgsqword(unsigned long __offset) {
|
||||
return *__ptr_to_addr_space(256, unsigned __int64, __offset);
|
||||
}
|
||||
#endif
|
||||
#undef __ptr_to_addr_space
|
||||
/*----------------------------------------------------------------------------*\
|
||||
|* movs, stos
|
||||
\*----------------------------------------------------------------------------*/
|
||||
|
|
|
@ -0,0 +1,150 @@
|
|||
/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __X86INTRIN_H
|
||||
#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __LWPINTRIN_H
|
||||
#define __LWPINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))
|
||||
|
||||
/// \brief Parses the LWPCB at the specified address and enables
|
||||
/// profiling if valid.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> LLWPCB </c> instruction.
|
||||
///
|
||||
/// \param __addr
|
||||
/// Address of the new Lightweight Profiling Control Block (LWPCB). If the
|
||||
/// LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables
|
||||
/// Lightweight Profiling.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
__llwpcb (void *__addr)
|
||||
{
|
||||
__builtin_ia32_llwpcb(__addr);
|
||||
}
|
||||
|
||||
/// \brief Flushes the LWP state to memory and returns the address of the LWPCB.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> SLWPCB </c> instruction.
|
||||
///
|
||||
/// \return
|
||||
/// Address of the current Lightweight Profiling Control Block (LWPCB).
|
||||
/// If LWP is not currently enabled, returns NULL.
|
||||
static __inline__ void* __DEFAULT_FN_ATTRS
|
||||
__slwpcb (void)
|
||||
{
|
||||
return __builtin_ia32_slwpcb();
|
||||
}
|
||||
|
||||
/// \brief Inserts a programmed event record into the LWP event ring buffer
|
||||
/// and advances the ring buffer pointer.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
|
||||
///
|
||||
/// \param DATA2
|
||||
/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
|
||||
/// \param DATA1
|
||||
/// A 32-bit value is inserted into the 32-bit Data1 field.
|
||||
/// \param FLAGS
|
||||
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
|
||||
/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
|
||||
/// the event record overwrites the last record in the buffer, the MissedEvents
|
||||
/// counter in the LWPCB is incremented, the head pointer is not advanced, and
|
||||
/// 1 is returned. Otherwise 0 is returned.
|
||||
#define __lwpins32(DATA2, DATA1, FLAGS) \
|
||||
(__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
|
||||
(unsigned int) (FLAGS)))
|
||||
|
||||
/// \brief Decrements the LWP programmed value sample event counter. If the result is
|
||||
/// negative, inserts an event record into the LWP event ring buffer in memory
|
||||
/// and advances the ring buffer pointer.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
|
||||
///
|
||||
/// \param DATA2
|
||||
/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
|
||||
/// \param DATA1
|
||||
/// A 32-bit value is inserted into the 32-bit Data1 field.
|
||||
/// \param FLAGS
|
||||
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
|
||||
#define __lwpval32(DATA2, DATA1, FLAGS) \
|
||||
(__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \
|
||||
(unsigned int) (FLAGS)))
|
||||
|
||||
#ifdef __x86_64__
|
||||
|
||||
/// \brief Inserts programmed event record into the LWP event ring buffer
|
||||
/// and advances the ring buffer pointer.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
|
||||
///
|
||||
/// \param DATA2
|
||||
/// A 64-bit value is inserted into the 64-bit Data2 field.
|
||||
/// \param DATA1
|
||||
/// A 32-bit value is inserted into the 32-bit Data1 field.
|
||||
/// \param FLAGS
|
||||
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
|
||||
/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
|
||||
/// the event record overwrites the last record in the buffer, the MissedEvents
|
||||
/// counter in the LWPCB is incremented, the head pointer is not advanced, and
|
||||
/// 1 is returned. Otherwise 0 is returned.
|
||||
#define __lwpins64(DATA2, DATA1, FLAGS) \
|
||||
(__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
|
||||
(unsigned int) (FLAGS)))
|
||||
|
||||
/// \brief Decrements the LWP programmed value sample event counter. If the result is
|
||||
/// negative, inserts an event record into the LWP event ring buffer in memory
|
||||
/// and advances the ring buffer pointer.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
|
||||
///
|
||||
/// \param DATA2
|
||||
/// A 64-bit value is inserted into the 64-bit Data2 field.
|
||||
/// \param DATA1
|
||||
/// A 32-bit value is inserted into the 32-bit Data1 field.
|
||||
/// \param FLAGS
|
||||
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
|
||||
#define __lwpval64(DATA2, DATA1, FLAGS) \
|
||||
(__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
|
||||
(unsigned int) (FLAGS)))
|
||||
|
||||
#endif
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __LWPINTRIN_H */
|
|
@ -211,7 +211,7 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
|
|||
/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
|
||||
///
|
||||
/// \param __m1
|
||||
/// A 64-bit integer vector of [8 x i8]. \n
|
||||
/// A 64-bit integer vector of [8 x i8]. \n
|
||||
/// Bits [39:32] are written to bits [7:0] of the result. \n
|
||||
/// Bits [47:40] are written to bits [23:16] of the result. \n
|
||||
/// Bits [55:48] are written to bits [39:32] of the result. \n
|
||||
|
@ -608,10 +608,11 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
|
|||
|
||||
/// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit
|
||||
/// integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
|
||||
/// element of the first 64-bit integer vector of [8 x i8]. If an element of
|
||||
/// the first vector is less than the corresponding element of the second
|
||||
/// vector, the result is saturated to 0. The results are packed into a
|
||||
/// 64-bit integer vector of [8 x i8].
|
||||
/// element of the first 64-bit integer vector of [8 x i8].
|
||||
///
|
||||
/// If an element of the first vector is less than the corresponding element
|
||||
/// of the second vector, the result is saturated to 0. The results are
|
||||
/// packed into a 64-bit integer vector of [8 x i8].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -631,10 +632,11 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
|
|||
|
||||
/// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit
|
||||
/// integer vector of [4 x i16] from the corresponding 16-bit unsigned
|
||||
/// integer element of the first 64-bit integer vector of [4 x i16]. If an
|
||||
/// element of the first vector is less than the corresponding element of the
|
||||
/// second vector, the result is saturated to 0. The results are packed into
|
||||
/// a 64-bit integer vector of [4 x i16].
|
||||
/// integer element of the first 64-bit integer vector of [4 x i16].
|
||||
///
|
||||
/// If an element of the first vector is less than the corresponding element
|
||||
/// of the second vector, the result is saturated to 0. The results are
|
||||
/// packed into a 64-bit integer vector of [4 x i16].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -657,9 +659,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
|
|||
/// element of the second 64-bit integer vector of [4 x i16] and get four
|
||||
/// 32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
|
||||
/// The lower 32 bits of these two sums are packed into a 64-bit integer
|
||||
/// vector of [2 x i32]. For example, bits [15:0] of both parameters are
|
||||
/// multiplied, bits [31:16] of both parameters are multiplied, and the sum
|
||||
/// of both results is written to bits [31:0] of the result.
|
||||
/// vector of [2 x i32].
|
||||
///
|
||||
/// For example, bits [15:0] of both parameters are multiplied, bits [31:16]
|
||||
/// of both parameters are multiplied, and the sum of both results is written
|
||||
/// to bits [31:0] of the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -851,10 +855,11 @@ _mm_slli_si64(__m64 __m, int __count)
|
|||
|
||||
/// \brief Right-shifts each 16-bit integer element of the first parameter,
|
||||
/// which is a 64-bit integer vector of [4 x i16], by the number of bits
|
||||
/// specified by the second parameter, which is a 64-bit integer. High-order
|
||||
/// bits are filled with the sign bit of the initial value of each 16-bit
|
||||
/// element. The 16-bit results are packed into a 64-bit integer vector of
|
||||
/// [4 x i16].
|
||||
/// specified by the second parameter, which is a 64-bit integer.
|
||||
///
|
||||
/// High-order bits are filled with the sign bit of the initial value of each
|
||||
/// 16-bit element. The 16-bit results are packed into a 64-bit integer
|
||||
/// vector of [4 x i16].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -874,6 +879,7 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
|
|||
|
||||
/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
|
||||
/// of [4 x i16] by the number of bits specified by a 32-bit integer.
|
||||
///
|
||||
/// High-order bits are filled with the sign bit of the initial value of each
|
||||
/// 16-bit element. The 16-bit results are packed into a 64-bit integer
|
||||
/// vector of [4 x i16].
|
||||
|
@ -896,10 +902,11 @@ _mm_srai_pi16(__m64 __m, int __count)
|
|||
|
||||
/// \brief Right-shifts each 32-bit integer element of the first parameter,
|
||||
/// which is a 64-bit integer vector of [2 x i32], by the number of bits
|
||||
/// specified by the second parameter, which is a 64-bit integer. High-order
|
||||
/// bits are filled with the sign bit of the initial value of each 32-bit
|
||||
/// element. The 32-bit results are packed into a 64-bit integer vector of
|
||||
/// [2 x i32].
|
||||
/// specified by the second parameter, which is a 64-bit integer.
|
||||
///
|
||||
/// High-order bits are filled with the sign bit of the initial value of each
|
||||
/// 32-bit element. The 32-bit results are packed into a 64-bit integer
|
||||
/// vector of [2 x i32].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -919,6 +926,7 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
|
|||
|
||||
/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
|
||||
/// of [2 x i32] by the number of bits specified by a 32-bit integer.
|
||||
///
|
||||
/// High-order bits are filled with the sign bit of the initial value of each
|
||||
/// 32-bit element. The 32-bit results are packed into a 64-bit integer
|
||||
/// vector of [2 x i32].
|
||||
|
@ -941,9 +949,10 @@ _mm_srai_pi32(__m64 __m, int __count)
|
|||
|
||||
/// \brief Right-shifts each 16-bit integer element of the first parameter,
|
||||
/// which is a 64-bit integer vector of [4 x i16], by the number of bits
|
||||
/// specified by the second parameter, which is a 64-bit integer. High-order
|
||||
/// bits are cleared. The 16-bit results are packed into a 64-bit integer
|
||||
/// vector of [4 x i16].
|
||||
/// specified by the second parameter, which is a 64-bit integer.
|
||||
///
|
||||
/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
|
||||
/// integer vector of [4 x i16].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -963,6 +972,7 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
|
|||
|
||||
/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
|
||||
/// of [4 x i16] by the number of bits specified by a 32-bit integer.
|
||||
///
|
||||
/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
|
||||
/// integer vector of [4 x i16].
|
||||
///
|
||||
|
@ -984,9 +994,10 @@ _mm_srli_pi16(__m64 __m, int __count)
|
|||
|
||||
/// \brief Right-shifts each 32-bit integer element of the first parameter,
|
||||
/// which is a 64-bit integer vector of [2 x i32], by the number of bits
|
||||
/// specified by the second parameter, which is a 64-bit integer. High-order
|
||||
/// bits are cleared. The 32-bit results are packed into a 64-bit integer
|
||||
/// vector of [2 x i32].
|
||||
/// specified by the second parameter, which is a 64-bit integer.
|
||||
///
|
||||
/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
|
||||
/// integer vector of [2 x i32].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1006,6 +1017,7 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
|
|||
|
||||
/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
|
||||
/// of [2 x i32] by the number of bits specified by a 32-bit integer.
|
||||
///
|
||||
/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
|
||||
/// integer vector of [2 x i32].
|
||||
///
|
||||
|
@ -1026,8 +1038,9 @@ _mm_srli_pi32(__m64 __m, int __count)
|
|||
}
|
||||
|
||||
/// \brief Right-shifts the first 64-bit integer parameter by the number of bits
|
||||
/// specified by the second 64-bit integer parameter. High-order bits are
|
||||
/// cleared.
|
||||
/// specified by the second 64-bit integer parameter.
|
||||
///
|
||||
/// High-order bits are cleared.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1046,7 +1059,9 @@ _mm_srl_si64(__m64 __m, __m64 __count)
|
|||
|
||||
/// \brief Right-shifts the first parameter, which is a 64-bit integer, by the
|
||||
/// number of bits specified by the second parameter, which is a 32-bit
|
||||
/// integer. High-order bits are cleared.
|
||||
/// integer.
|
||||
///
|
||||
/// High-order bits are cleared.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1140,8 +1155,9 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
|
|||
|
||||
/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
|
||||
/// [8 x i8] to determine if the element of the first vector is equal to the
|
||||
/// corresponding element of the second vector. The comparison yields 0 for
|
||||
/// false, 0xFF for true.
|
||||
/// corresponding element of the second vector.
|
||||
///
|
||||
/// The comparison yields 0 for false, 0xFF for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1161,8 +1177,9 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
|
|||
|
||||
/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
|
||||
/// [4 x i16] to determine if the element of the first vector is equal to the
|
||||
/// corresponding element of the second vector. The comparison yields 0 for
|
||||
/// false, 0xFFFF for true.
|
||||
/// corresponding element of the second vector.
|
||||
///
|
||||
/// The comparison yields 0 for false, 0xFFFF for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1182,8 +1199,9 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
|
|||
|
||||
/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
|
||||
/// [2 x i32] to determine if the element of the first vector is equal to the
|
||||
/// corresponding element of the second vector. The comparison yields 0 for
|
||||
/// false, 0xFFFFFFFF for true.
|
||||
/// corresponding element of the second vector.
|
||||
///
|
||||
/// The comparison yields 0 for false, 0xFFFFFFFF for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1203,8 +1221,9 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
|
|||
|
||||
/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
|
||||
/// [8 x i8] to determine if the element of the first vector is greater than
|
||||
/// the corresponding element of the second vector. The comparison yields 0
|
||||
/// for false, 0xFF for true.
|
||||
/// the corresponding element of the second vector.
|
||||
///
|
||||
/// The comparison yields 0 for false, 0xFF for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1224,8 +1243,9 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
|
|||
|
||||
/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
|
||||
/// [4 x i16] to determine if the element of the first vector is greater than
|
||||
/// the corresponding element of the second vector. The comparison yields 0
|
||||
/// for false, 0xFFFF for true.
|
||||
/// the corresponding element of the second vector.
|
||||
///
|
||||
/// The comparison yields 0 for false, 0xFFFF for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1245,8 +1265,9 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
|
|||
|
||||
/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
|
||||
/// [2 x i32] to determine if the element of the first vector is greater than
|
||||
/// the corresponding element of the second vector. The comparison yields 0
|
||||
/// for false, 0xFFFFFFFF for true.
|
||||
/// the corresponding element of the second vector.
|
||||
///
|
||||
/// The comparison yields 0 for false, 0xFFFFFFFF for true.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -1268,7 +1289,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
|
|||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the the <c> VXORPS / XORPS </c> instruction.
|
||||
/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
|
||||
///
|
||||
/// \returns An initialized 64-bit integer vector with all elements set to zero.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
|
|
|
@ -61,6 +61,7 @@ module _Builtin_intrinsics [system] [extern_c] {
|
|||
textual header "xopintrin.h"
|
||||
textual header "fma4intrin.h"
|
||||
textual header "mwaitxintrin.h"
|
||||
textual header "clzerointrin.h"
|
||||
|
||||
explicit module mm_malloc {
|
||||
requires !freestanding
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -31,9 +31,11 @@
|
|||
__attribute__((__always_inline__, __nodebug__, __target__("sse3")))
|
||||
|
||||
/// \brief Loads data from an unaligned memory location to elements in a 128-bit
|
||||
/// vector. If the address of the data is not 16-byte aligned, the
|
||||
/// instruction may read two adjacent aligned blocks of memory to retrieve
|
||||
/// the requested data.
|
||||
/// vector.
|
||||
///
|
||||
/// If the address of the data is not 16-byte aligned, the instruction may
|
||||
/// read two adjacent aligned blocks of memory to retrieve the requested
|
||||
/// data.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -115,7 +117,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
|
|||
|
||||
/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
|
||||
/// vector of [4 x float] to float values stored in a 128-bit vector of
|
||||
/// [4 x float].
|
||||
/// [4 x float].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -136,7 +138,7 @@ _mm_movehdup_ps(__m128 __a)
|
|||
}
|
||||
|
||||
/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
|
||||
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
|
||||
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -257,14 +259,6 @@ _mm_movedup_pd(__m128d __a)
|
|||
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
|
||||
}
|
||||
|
||||
#define _MM_DENORMALS_ZERO_ON (0x0040)
|
||||
#define _MM_DENORMALS_ZERO_OFF (0x0000)
|
||||
|
||||
#define _MM_DENORMALS_ZERO_MASK (0x0040)
|
||||
|
||||
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
|
||||
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
|
||||
|
||||
/// \brief Establishes a linear address memory range to be monitored and puts
|
||||
/// the processor in the monitor event pending state. Data stored in the
|
||||
/// monitored address range causes the processor to exit the pending state.
|
||||
|
|
|
@ -29,12 +29,38 @@
|
|||
#define __PRFCHWINTRIN_H
|
||||
|
||||
#if defined(__PRFCHW__) || defined(__3dNOW__)
|
||||
/// \brief Loads a memory sequence containing the specified memory address into
|
||||
/// all data cache levels. The cache-coherency state is set to exclusive.
|
||||
/// Data can be read from and written to the cache line without additional
|
||||
/// delay.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
|
||||
///
|
||||
/// \param __P
|
||||
/// A pointer specifying the memory address to be prefetched.
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
_m_prefetch(void *__P)
|
||||
{
|
||||
__builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
|
||||
}
|
||||
|
||||
/// \brief Loads a memory sequence containing the specified memory address into
|
||||
/// the L1 data cache and sets the cache-coherency to modified. This
|
||||
/// provides a hint to the processor that the cache line will be modified.
|
||||
/// It is intended for use when the cache line will be written to shortly
|
||||
/// after the prefetch is performed.
|
||||
///
|
||||
/// Note that the effect of this intrinsic is dependent on the processor
|
||||
/// implementation.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PREFETCHW instruction.
|
||||
///
|
||||
/// \param __P
|
||||
/// A pointer specifying the memory address to be prefetched.
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
_m_prefetchw(void *__P)
|
||||
{
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -43,14 +43,12 @@ typedef __builtin_va_list va_list;
|
|||
#define va_copy(dest, src) __builtin_va_copy(dest, src)
|
||||
#endif
|
||||
|
||||
/* Hack required to make standard headers work, at least on Ubuntu */
|
||||
#ifndef __GNUC_VA_LIST
|
||||
#define __GNUC_VA_LIST 1
|
||||
#endif
|
||||
typedef __builtin_va_list __gnuc_va_list;
|
||||
|
||||
/* zig: added because glibc stdio.h was duplicately defining va_list
|
||||
*/
|
||||
#define _VA_LIST_DEFINED
|
||||
#endif
|
||||
|
||||
#endif /* __STDARG_H */
|
||||
|
|
|
@ -40,16 +40,16 @@ extern "C" {
|
|||
|
||||
/* 7.17.1 Introduction */
|
||||
|
||||
#define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE
|
||||
#define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE
|
||||
#define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE
|
||||
#define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE
|
||||
#define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE
|
||||
#define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE
|
||||
#define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE
|
||||
#define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE
|
||||
#define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE
|
||||
#define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE
|
||||
#define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE
|
||||
#define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE
|
||||
#define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE
|
||||
#define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE
|
||||
#define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE
|
||||
#define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE
|
||||
#define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE
|
||||
#define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE
|
||||
#define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE
|
||||
#define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE
|
||||
|
||||
/* 7.17.2 Initialization */
|
||||
|
||||
|
|
|
@ -255,19 +255,16 @@ typedef __uint_least8_t uint_fast8_t;
|
|||
*/
|
||||
#define __stdint_join3(a,b,c) a ## b ## c
|
||||
|
||||
#define __intn_t(n) __stdint_join3( int, n, _t)
|
||||
#define __uintn_t(n) __stdint_join3(uint, n, _t)
|
||||
|
||||
#ifndef _INTPTR_T
|
||||
#ifndef __intptr_t_defined
|
||||
typedef __intn_t(__INTPTR_WIDTH__) intptr_t;
|
||||
typedef __INTPTR_TYPE__ intptr_t;
|
||||
#define __intptr_t_defined
|
||||
#define _INTPTR_T
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef _UINTPTR_T
|
||||
typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
|
||||
typedef __UINTPTR_TYPE__ uintptr_t;
|
||||
#define _UINTPTR_T
|
||||
#endif
|
||||
|
||||
|
@ -659,12 +656,12 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||
/* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
|
||||
/* C99 7.18.3 Limits of other integer types. */
|
||||
|
||||
#define INTPTR_MIN __INTN_MIN(__INTPTR_WIDTH__)
|
||||
#define INTPTR_MAX __INTN_MAX(__INTPTR_WIDTH__)
|
||||
#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__)
|
||||
#define PTRDIFF_MIN __INTN_MIN(__PTRDIFF_WIDTH__)
|
||||
#define PTRDIFF_MAX __INTN_MAX(__PTRDIFF_WIDTH__)
|
||||
#define SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__)
|
||||
#define INTPTR_MIN (-__INTPTR_MAX__-1)
|
||||
#define INTPTR_MAX __INTPTR_MAX__
|
||||
#define UINTPTR_MAX __UINTPTR_MAX__
|
||||
#define PTRDIFF_MIN (-__PTRDIFF_MAX__-1)
|
||||
#define PTRDIFF_MAX __PTRDIFF_MAX__
|
||||
#define SIZE_MAX __SIZE_MAX__
|
||||
|
||||
/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
|
||||
* is enabled. */
|
||||
|
@ -673,9 +670,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||
#endif
|
||||
|
||||
/* C99 7.18.2.5 Limits of greatest-width integer types. */
|
||||
#define INTMAX_MIN __INTN_MIN(__INTMAX_WIDTH__)
|
||||
#define INTMAX_MAX __INTN_MAX(__INTMAX_WIDTH__)
|
||||
#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__)
|
||||
#define INTMAX_MIN (-__INTMAX_MAX__-1)
|
||||
#define INTMAX_MAX __INTMAX_MAX__
|
||||
#define UINTMAX_MAX __UINTMAX_MAX__
|
||||
|
||||
/* C99 7.18.3 Limits of other integer types. */
|
||||
#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
|
||||
|
@ -700,8 +697,8 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||
#endif
|
||||
|
||||
/* 7.18.4.2 Macros for greatest-width integer constants. */
|
||||
#define INTMAX_C(v) __INTN_C(__INTMAX_WIDTH__, v)
|
||||
#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v)
|
||||
#define INTMAX_C(v) __int_c(v, __INTMAX_C_SUFFIX__)
|
||||
#define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__)
|
||||
|
||||
#endif /* __STDC_HOSTED__ */
|
||||
#endif /* __CLANG_STDINT_H */
|
||||
|
|
|
@ -22,12 +22,21 @@
|
|||
*
|
||||
\*===----------------------------------------------------------------------===*/
|
||||
|
||||
#ifndef __TGMATH_H
|
||||
#define __TGMATH_H
|
||||
#ifndef __CLANG_TGMATH_H
|
||||
#define __CLANG_TGMATH_H
|
||||
|
||||
/* C99 7.22 Type-generic math <tgmath.h>. */
|
||||
#include <math.h>
|
||||
|
||||
/*
|
||||
* Allow additional definitions and implementation-defined values on Apple
|
||||
* platforms. This is done after #include <math.h> to avoid depcycle conflicts
|
||||
* between libcxx and darwin in C++ modules builds.
|
||||
*/
|
||||
#if defined(__APPLE__) && __STDC_HOSTED__ && __has_include_next(<tgmath.h>)
|
||||
# include_next <tgmath.h>
|
||||
#else
|
||||
|
||||
/* C++ handles type genericity with overloading in math.h. */
|
||||
#ifndef __cplusplus
|
||||
#include <complex.h>
|
||||
|
@ -1371,4 +1380,5 @@ static long double
|
|||
#undef _TG_ATTRS
|
||||
|
||||
#endif /* __cplusplus */
|
||||
#endif /* __TGMATH_H */
|
||||
#endif /* __has_include_next */
|
||||
#endif /* __CLANG_TGMATH_H */
|
||||
|
|
|
@ -469,10 +469,11 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
|
|||
/// values contained in the first source operand and packed 8-bit signed
|
||||
/// integer values contained in the second source operand, adds pairs of
|
||||
/// contiguous products with signed saturation, and writes the 16-bit sums to
|
||||
/// the corresponding bits in the destination. For example, bits [7:0] of
|
||||
/// both operands are multiplied, bits [15:8] of both operands are
|
||||
/// multiplied, and the sum of both results is written to bits [15:0] of the
|
||||
/// destination.
|
||||
/// the corresponding bits in the destination.
|
||||
///
|
||||
/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
|
||||
/// both operands are multiplied, and the sum of both results is written to
|
||||
/// bits [15:0] of the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -502,10 +503,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
|
|||
/// values contained in the first source operand and packed 8-bit signed
|
||||
/// integer values contained in the second source operand, adds pairs of
|
||||
/// contiguous products with signed saturation, and writes the 16-bit sums to
|
||||
/// the corresponding bits in the destination. For example, bits [7:0] of
|
||||
/// both operands are multiplied, bits [15:8] of both operands are
|
||||
/// multiplied, and the sum of both results is written to bits [15:0] of the
|
||||
/// destination.
|
||||
/// the corresponding bits in the destination.
|
||||
///
|
||||
/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
|
||||
/// both operands are multiplied, and the sum of both results is written to
|
||||
/// bits [15:0] of the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -619,13 +621,14 @@ _mm_shuffle_pi8(__m64 __a, __m64 __b)
|
|||
}
|
||||
|
||||
/// \brief For each 8-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// byte in the second source is negative, calculate the two's complement of
|
||||
/// the corresponding byte in the first source, and write that value to the
|
||||
/// destination. If the byte in the second source is positive, copy the
|
||||
/// corresponding byte from the first source to the destination. If the byte
|
||||
/// in the second source is zero, clear the corresponding byte in the
|
||||
/// destination.
|
||||
/// the following actions as specified by the second source operand.
|
||||
///
|
||||
/// If the byte in the second source is negative, calculate the two's
|
||||
/// complement of the corresponding byte in the first source, and write that
|
||||
/// value to the destination. If the byte in the second source is positive,
|
||||
/// copy the corresponding byte from the first source to the destination. If
|
||||
/// the byte in the second source is zero, clear the corresponding byte in
|
||||
/// the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -644,13 +647,14 @@ _mm_sign_epi8(__m128i __a, __m128i __b)
|
|||
}
|
||||
|
||||
/// \brief For each 16-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// word in the second source is negative, calculate the two's complement of
|
||||
/// the corresponding word in the first source, and write that value to the
|
||||
/// destination. If the word in the second source is positive, copy the
|
||||
/// corresponding word from the first source to the destination. If the word
|
||||
/// in the second source is zero, clear the corresponding word in the
|
||||
/// destination.
|
||||
/// the following actions as specified by the second source operand.
|
||||
///
|
||||
/// If the word in the second source is negative, calculate the two's
|
||||
/// complement of the corresponding word in the first source, and write that
|
||||
/// value to the destination. If the word in the second source is positive,
|
||||
/// copy the corresponding word from the first source to the destination. If
|
||||
/// the word in the second source is zero, clear the corresponding word in
|
||||
/// the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -669,8 +673,9 @@ _mm_sign_epi16(__m128i __a, __m128i __b)
|
|||
}
|
||||
|
||||
/// \brief For each 32-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// doubleword in the second source is negative, calculate the two's
|
||||
/// the following actions as specified by the second source operand.
|
||||
///
|
||||
/// If the doubleword in the second source is negative, calculate the two's
|
||||
/// complement of the corresponding doubleword in the first source, and write that
|
||||
/// value to the destination. If the doubleword in the second source is
|
||||
/// positive, copy the corresponding doubleword from the first source to the
|
||||
|
@ -694,13 +699,14 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
|
|||
}
|
||||
|
||||
/// \brief For each 8-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// byte in the second source is negative, calculate the two's complement of
|
||||
/// the corresponding byte in the first source, and write that value to the
|
||||
/// destination. If the byte in the second source is positive, copy the
|
||||
/// corresponding byte from the first source to the destination. If the byte
|
||||
/// in the second source is zero, clear the corresponding byte in the
|
||||
/// destination.
|
||||
/// the following actions as specified by the second source operand.
|
||||
///
|
||||
/// If the byte in the second source is negative, calculate the two's
|
||||
/// complement of the corresponding byte in the first source, and write that
|
||||
/// value to the destination. If the byte in the second source is positive,
|
||||
/// copy the corresponding byte from the first source to the destination. If
|
||||
/// the byte in the second source is zero, clear the corresponding byte in
|
||||
/// the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -719,13 +725,14 @@ _mm_sign_pi8(__m64 __a, __m64 __b)
|
|||
}
|
||||
|
||||
/// \brief For each 16-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// word in the second source is negative, calculate the two's complement of
|
||||
/// the corresponding word in the first source, and write that value to the
|
||||
/// destination. If the word in the second source is positive, copy the
|
||||
/// corresponding word from the first source to the destination. If the word
|
||||
/// in the second source is zero, clear the corresponding word in the
|
||||
/// destination.
|
||||
/// the following actions as specified by the second source operand.
|
||||
///
|
||||
/// If the word in the second source is negative, calculate the two's
|
||||
/// complement of the corresponding word in the first source, and write that
|
||||
/// value to the destination. If the word in the second source is positive,
|
||||
/// copy the corresponding word from the first source to the destination. If
|
||||
/// the word in the second source is zero, clear the corresponding word in
|
||||
/// the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -744,8 +751,9 @@ _mm_sign_pi16(__m64 __a, __m64 __b)
|
|||
}
|
||||
|
||||
/// \brief For each 32-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// doubleword in the second source is negative, calculate the two's
|
||||
/// the following actions as specified by the second source operand.
|
||||
///
|
||||
/// If the doubleword in the second source is negative, calculate the two's
|
||||
/// complement of the corresponding doubleword in the first source, and
|
||||
/// write that value to the destination. If the doubleword in the second
|
||||
/// source is positive, copy the corresponding doubleword from the first
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -72,6 +72,10 @@
|
|||
#include <tbmintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LWP__)
|
||||
#include <lwpintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
|
||||
#include <f16cintrin.h>
|
||||
#endif
|
||||
|
@ -80,6 +84,8 @@
|
|||
#include <mwaitxintrin.h>
|
||||
#endif
|
||||
|
||||
/* FIXME: LWP */
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLZERO__)
|
||||
#include <clzerointrin.h>
|
||||
#endif
|
||||
|
||||
#endif /* __X86INTRIN_H */
|
||||
|
|
|
@ -2067,7 +2067,7 @@ _mm_storer_ps(float *__p, __m128 __a)
|
|||
/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
|
||||
/// be generated. \n
|
||||
/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
|
||||
/// be generated.
|
||||
/// be generated.
|
||||
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
|
||||
#endif
|
||||
|
||||
|
@ -2099,7 +2099,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a)
|
|||
///
|
||||
/// \param __p
|
||||
/// A pointer to a 128-bit aligned memory location that will receive the
|
||||
/// integer values.
|
||||
/// single-precision floating-point values.
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [4 x float] containing the values to be moved.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
|
@ -2133,7 +2133,7 @@ void _mm_sfence(void);
|
|||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// void _mm_extract_pi(__m64 a, int n);
|
||||
/// int _mm_extract_pi16(__m64 a, int n);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
|
||||
|
@ -2157,7 +2157,7 @@ void _mm_sfence(void);
|
|||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// void _mm_insert_pi(__m64 a, int d, int n);
|
||||
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
|
||||
|
@ -2331,8 +2331,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
|
|||
/// \brief Conditionally copies the values from each 8-bit element in the first
|
||||
/// 64-bit integer vector operand to the specified memory location, as
|
||||
/// specified by the most significant bit in the corresponding element in the
|
||||
/// second 64-bit integer vector operand. To minimize caching, the data is
|
||||
/// flagged as non-temporal (unlikely to be used again soon).
|
||||
/// second 64-bit integer vector operand.
|
||||
///
|
||||
/// To minimize caching, the data is flagged as non-temporal
|
||||
/// (unlikely to be used again soon).
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
|
@ -2435,17 +2437,17 @@ extern "C" {
|
|||
/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
|
||||
/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
|
||||
/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
|
||||
/// </li>
|
||||
/// </li>
|
||||
/// <li>
|
||||
/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
|
||||
/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
|
||||
/// _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
|
||||
/// </li>
|
||||
/// <li>
|
||||
/// <li>
|
||||
/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
|
||||
/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
|
||||
/// </li>
|
||||
/// <li>
|
||||
/// <li>
|
||||
/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
|
||||
/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
|
||||
/// _MM_GET_DENORMALS_ZERO_MODE().
|
||||
|
@ -2468,11 +2470,11 @@ extern "C" {
|
|||
unsigned int _mm_getcsr(void);
|
||||
|
||||
/// \brief Sets the MXCSR register with the 32-bit unsigned integer value.
|
||||
///
|
||||
///
|
||||
/// There are several groups of macros associated with this intrinsic,
|
||||
/// including:
|
||||
/// <ul>
|
||||
/// <li>
|
||||
/// <li>
|
||||
/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
|
||||
/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
|
||||
/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
|
||||
|
@ -2517,7 +2519,7 @@ unsigned int _mm_getcsr(void);
|
|||
///
|
||||
/// \param __i
|
||||
/// A 32-bit unsigned integer value to be written to the MXCSR register.
|
||||
void _mm_setcsr(unsigned int);
|
||||
void _mm_setcsr(unsigned int __i);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
|
@ -2540,7 +2542,7 @@ void _mm_setcsr(unsigned int);
|
|||
/// A 128-bit vector of [4 x float].
|
||||
/// \param mask
|
||||
/// An immediate value containing an 8-bit value specifying which elements to
|
||||
/// copy from \ a and \a b. \n
|
||||
/// copy from \a a and \a b. \n
|
||||
/// Bits [3:0] specify the values copied from operand \a a. \n
|
||||
/// Bits [7:4] specify the values copied from operand \a b. \n
|
||||
/// The destinations within the 128-bit destination are assigned values as
|
||||
|
@ -2678,8 +2680,7 @@ _mm_movelh_ps(__m128 __a, __m128 __b)
|
|||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
|
||||
/// instruction.
|
||||
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
|
||||
|
@ -2709,8 +2710,7 @@ _mm_cvtpi16_ps(__m64 __a)
|
|||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
|
||||
/// instruction.
|
||||
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
|
||||
|
@ -2739,8 +2739,7 @@ _mm_cvtpu16_ps(__m64 __a)
|
|||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
|
||||
/// instruction.
|
||||
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
|
||||
|
@ -2764,8 +2763,7 @@ _mm_cvtpi8_ps(__m64 __a)
|
|||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
|
||||
/// instruction.
|
||||
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
|
||||
|
@ -2789,8 +2787,7 @@ _mm_cvtpu8_ps(__m64 __a)
|
|||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
|
||||
/// instruction.
|
||||
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
|
||||
|
@ -2815,16 +2812,16 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
|
|||
|
||||
/// \brief Converts each single-precision floating-point element of a 128-bit
|
||||
/// floating-point vector of [4 x float] into a 16-bit signed integer, and
|
||||
/// packs the results into a 64-bit integer vector of [4 x i16]. If the
|
||||
/// floating-point element is NaN or infinity, or if the floating-point
|
||||
/// element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
|
||||
/// to 0x8000. Otherwise if the floating-point element is greater than
|
||||
/// 0x7FFF, it is converted to 0x7FFF.
|
||||
/// packs the results into a 64-bit integer vector of [4 x i16].
|
||||
///
|
||||
/// If the floating-point element is NaN or infinity, or if the
|
||||
/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
|
||||
/// it is converted to 0x8000. Otherwise if the floating-point element is
|
||||
/// greater than 0x7FFF, it is converted to 0x7FFF.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c>
|
||||
/// instruction.
|
||||
/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit floating-point vector of [4 x float].
|
||||
|
@ -2845,16 +2842,16 @@ _mm_cvtps_pi16(__m128 __a)
|
|||
/// \brief Converts each single-precision floating-point element of a 128-bit
|
||||
/// floating-point vector of [4 x float] into an 8-bit signed integer, and
|
||||
/// packs the results into the lower 32 bits of a 64-bit integer vector of
|
||||
/// [8 x i8]. The upper 32 bits of the vector are set to 0. If the
|
||||
/// floating-point element is NaN or infinity, or if the floating-point
|
||||
/// element is greater than 0x7FFFFFFF or less than -0x80, it is converted
|
||||
/// to 0x80. Otherwise if the floating-point element is greater than 0x7F,
|
||||
/// it is converted to 0x7F.
|
||||
/// [8 x i8]. The upper 32 bits of the vector are set to 0.
|
||||
///
|
||||
/// If the floating-point element is NaN or infinity, or if the
|
||||
/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
|
||||
/// is converted to 0x80. Otherwise if the floating-point element is greater
|
||||
/// than 0x7F, it is converted to 0x7F.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c>
|
||||
/// instruction.
|
||||
/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// 128-bit floating-point vector of [4 x float].
|
||||
|
|
|
@ -198,13 +198,13 @@ _mm_hsubq_epi32(__m128i __A)
|
|||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpcmov((__v2di)__A, (__v2di)__B, (__v2di)__C);
|
||||
return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C));
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpcmov_256((__v4di)__A, (__v4di)__B, (__v4di)__C);
|
||||
return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C));
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
Create bootstrap code in std/bootstrap.zig and add conditional compilation
|
||||
logic. This code is responsible for the real executable entry point, calling
|
||||
main(argc, argv, env) and making the exit syscall when main returns.
|
||||
main() and making the exit syscall when main returns.
|
||||
|
||||
How to pass a byvalue struct parameter in the C calling convention is
|
||||
target-specific. Add logic for how to do function prototypes and function calls
|
||||
|
|
Loading…
Reference in New Issue