update C headers to clang 5.0.0

master
Andrew Kelley 2017-09-30 18:20:12 -04:00
parent ba3d21ca67
commit cd58b40011
37 changed files with 5700 additions and 1830 deletions

View File

@ -410,10 +410,12 @@ install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlbwintrin.h" DESTINATION "${
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlcdintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vldqintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vpopcntdqintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avxintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/bmi2intrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/bmiintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/clflushoptintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/clzerointrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cpuid.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/algorithm" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/complex" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
@ -432,6 +434,7 @@ install(FILES "${CMAKE_SOURCE_DIR}/c_headers/intrin.h" DESTINATION "${C_HEADERS_
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/inttypes.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/iso646.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/limits.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/lwpintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/lzcntintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mm3dnow.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mm_malloc.h" DESTINATION "${C_HEADERS_DEST}")

View File

@ -2887,87 +2887,79 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
/* vec_ctf */
static __inline__ vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
return __builtin_altivec_vcfsx(__a, __b);
}
static __inline__ vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a,
int __b) {
return __builtin_altivec_vcfux((vector int)__a, __b);
}
#ifdef __VSX__
static __inline__ vector double __ATTRS_o_ai
vec_ctf(vector unsigned long long __a, int __b) {
vector double __ret = __builtin_convertvector(__a, vector double);
__ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
return __ret;
}
static __inline__ vector double __ATTRS_o_ai
vec_ctf(vector signed long long __a, int __b) {
vector double __ret = __builtin_convertvector(__a, vector double);
__ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
return __ret;
}
#define vec_ctf(__a, __b) \
_Generic((__a), vector int \
: (vector float)__builtin_altivec_vcfsx((__a), (__b)), \
vector unsigned int \
: (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)), \
vector unsigned long long \
: (__builtin_convertvector((vector unsigned long long)(__a), \
vector double) * \
(vector double)(vector unsigned long long)((0x3ffULL - (__b)) \
<< 52)), \
vector signed long long \
: (__builtin_convertvector((vector signed long long)(__a), \
vector double) * \
(vector double)(vector unsigned long long)((0x3ffULL - (__b)) \
<< 52)))
#else
#define vec_ctf(__a, __b) \
_Generic((__a), vector int \
: (vector float)__builtin_altivec_vcfsx((__a), (__b)), \
vector unsigned int \
: (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)))
#endif
/* vec_vcfsx */
static __inline__ vector float __attribute__((__always_inline__))
vec_vcfsx(vector int __a, int __b) {
return __builtin_altivec_vcfsx(__a, __b);
}
#define vec_vcfux __builtin_altivec_vcfux
/* vec_vcfux */
static __inline__ vector float __attribute__((__always_inline__))
vec_vcfux(vector unsigned int __a, int __b) {
return __builtin_altivec_vcfux((vector int)__a, __b);
}
#define vec_vcfsx(__a, __b) __builtin_altivec_vcfsx((vector int)(__a), (__b))
/* vec_cts */
static __inline__ vector int __ATTRS_o_ai vec_cts(vector float __a, int __b) {
return __builtin_altivec_vctsxs(__a, __b);
}
#ifdef __VSX__
static __inline__ vector signed long long __ATTRS_o_ai
vec_cts(vector double __a, int __b) {
__a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
return __builtin_convertvector(__a, vector signed long long);
}
#define vec_cts(__a, __b) \
_Generic((__a), vector float \
: __builtin_altivec_vctsxs((__a), (__b)), vector double \
: __extension__({ \
vector double __ret = \
(__a) * \
(vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
<< 52); \
__builtin_convertvector(__ret, vector signed long long); \
}))
#else
#define vec_cts __builtin_altivec_vctsxs
#endif
/* vec_vctsxs */
static __inline__ vector int __attribute__((__always_inline__))
vec_vctsxs(vector float __a, int __b) {
return __builtin_altivec_vctsxs(__a, __b);
}
#define vec_vctsxs __builtin_altivec_vctsxs
/* vec_ctu */
static __inline__ vector unsigned int __ATTRS_o_ai vec_ctu(vector float __a,
int __b) {
return __builtin_altivec_vctuxs(__a, __b);
}
#ifdef __VSX__
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_ctu(vector double __a, int __b) {
__a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
return __builtin_convertvector(__a, vector unsigned long long);
}
#define vec_ctu(__a, __b) \
_Generic((__a), vector float \
: __builtin_altivec_vctuxs((__a), (__b)), vector double \
: __extension__({ \
vector double __ret = \
(__a) * \
(vector double)(vector unsigned long long)((0x3ffULL + __b) \
<< 52); \
__builtin_convertvector(__ret, vector unsigned long long); \
}))
#else
#define vec_ctu __builtin_altivec_vctuxs
#endif
/* vec_vctuxs */
static __inline__ vector unsigned int __attribute__((__always_inline__))
vec_vctuxs(vector float __a, int __b) {
return __builtin_altivec_vctuxs(__a, __b);
}
#define vec_vctuxs __builtin_altivec_vctuxs
/* vec_signed */
@ -8045,45 +8037,51 @@ static __inline__ vector float __ATTRS_o_ai vec_vsel(vector float __a,
/* vec_sl */
static __inline__ vector signed char __ATTRS_o_ai
vec_sl(vector signed char __a, vector unsigned char __b) {
return __a << (vector signed char)__b;
}
// vec_sl does modulo arithmetic on __b first, so __b is allowed to be more
// than the length of __a.
static __inline__ vector unsigned char __ATTRS_o_ai
vec_sl(vector unsigned char __a, vector unsigned char __b) {
return __a << __b;
return __a << (__b %
(vector unsigned char)(sizeof(unsigned char) * __CHAR_BIT__));
}
static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
vector unsigned short __b) {
return __a << (vector short)__b;
static __inline__ vector signed char __ATTRS_o_ai
vec_sl(vector signed char __a, vector unsigned char __b) {
return (vector signed char)vec_sl((vector unsigned char)__a, __b);
}
static __inline__ vector unsigned short __ATTRS_o_ai
vec_sl(vector unsigned short __a, vector unsigned short __b) {
return __a << __b;
return __a << (__b % (vector unsigned short)(sizeof(unsigned short) *
__CHAR_BIT__));
}
static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
vector unsigned int __b) {
return __a << (vector int)__b;
static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
vector unsigned short __b) {
return (vector short)vec_sl((vector unsigned short)__a, __b);
}
static __inline__ vector unsigned int __ATTRS_o_ai
vec_sl(vector unsigned int __a, vector unsigned int __b) {
return __a << __b;
return __a << (__b %
(vector unsigned int)(sizeof(unsigned int) * __CHAR_BIT__));
}
static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
vector unsigned int __b) {
return (vector int)vec_sl((vector unsigned int)__a, __b);
}
#ifdef __POWER8_VECTOR__
static __inline__ vector signed long long __ATTRS_o_ai
vec_sl(vector signed long long __a, vector unsigned long long __b) {
return __a << (vector long long)__b;
}
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_sl(vector unsigned long long __a, vector unsigned long long __b) {
return __a << __b;
return __a << (__b % (vector unsigned long long)(sizeof(unsigned long long) *
__CHAR_BIT__));
}
static __inline__ vector long long __ATTRS_o_ai
vec_sl(vector long long __a, vector unsigned long long __b) {
return (vector long long)vec_sl((vector unsigned long long)__a, __b);
}
#endif
@ -12150,6 +12148,11 @@ static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned char __a,
#endif
#ifdef __VSX__
#define vec_xxpermdi __builtin_vsx_xxpermdi
#define vec_xxsldwi __builtin_vsx_xxsldwi
#endif
/* vec_xor */
#define __builtin_altivec_vxor vec_xor

View File

@ -224,6 +224,36 @@ __rbitl(unsigned long __t) {
#endif
}
/*
* 9.3 16-bit multiplications
*/
#if __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbt(int32_t __a, int32_t __b) {
return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultb(int32_t __a, int32_t __b) {
return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultt(int32_t __a, int32_t __b) {
return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwb(int32_t __a, int32_t __b) {
return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwt(int32_t __a, int32_t __b) {
return __builtin_arm_smulwt(__a, __b);
}
#endif
/*
* 9.4 Saturating intrinsics
*
@ -231,13 +261,13 @@ __rbitl(unsigned long __t) {
* intrinsics are implemented and the flag is enabled.
*/
/* 9.4.1 Width-specified saturation intrinsics */
#if __ARM_32BIT_STATE
#if __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif
/* 9.4.2 Saturating addition and subtraction intrinsics */
#if __ARM_32BIT_STATE
#if __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
return __builtin_arm_qadd(__t, __v);
@ -254,6 +284,290 @@ __qdbl(int32_t __t) {
}
#endif
/* 9.4.3 Accumultating multiplications */
#if __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlawt(__a, __b, __c);
}
#endif
/* 9.5.4 Parallel 16-bit saturation */
#if __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif
/* 9.5.5 Packing and unpacking */
#if __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
typedef uint32_t uint16x2_t;
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
return __builtin_arm_sxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
return __builtin_arm_sxtb16(__a);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
return __builtin_arm_uxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
return __builtin_arm_uxtb16(__a);
}
#endif
/* 9.5.6 Parallel selection */
#if __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_sel(__a, __b);
}
#endif
/* 9.5.7 Parallel 8-bit addition and subtraction */
#if __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_qadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_qsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_sadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_shadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_shsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_ssub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uhadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uhsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uqadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uqsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_usub8(__a, __b);
}
#endif
/* 9.5.8 Sum of 8-bit absolute differences */
#if __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_usad8(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
return __builtin_arm_usada8(__a, __b, __c);
}
#endif
/* 9.5.9 Parallel 16-bit addition and subtraction */
#if __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_sadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_sasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_ssax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_ssub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_usax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_usub16(__a, __b);
}
#endif
/* 9.5.10 Parallel 16-bit multiplications */
#if __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smlad(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smladx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlald(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlaldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smlsd(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smlsdx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlsld(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlsldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smuad(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smuadx(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smusd(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smusdx(__a, __b);
}
#endif
/* 9.7 CRC32 intrinsics */
#if __ARM_FEATURE_CRC32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))

View File

@ -832,7 +832,8 @@ _mm256_xor_si256(__m256i __a, __m256i __b)
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_stream_load_si256(__m256i const *__V)
{
return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
typedef __v4di __v4di_aligned __attribute__((aligned(32)));
return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS

View File

@ -504,115 +504,91 @@ _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_packs_epi32 (__m512i __A, __m512i __B)
_mm512_packs_epi32(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
(__v16si) __B,
(__v32hi) _mm512_setzero_hi(),
(__mmask32) -1);
return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
(__v16si) __B,
(__v32hi) _mm512_setzero_hi(),
__M);
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
(__v32hi)_mm512_packs_epi32(__A, __B),
(__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
__m512i __B)
_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
(__v16si) __B,
(__v32hi) __W,
__M);
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
(__v32hi)_mm512_packs_epi32(__A, __B),
(__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_packs_epi16 (__m512i __A, __m512i __B)
_mm512_packs_epi16(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
(__v32hi) __B,
(__v64qi) _mm512_setzero_qi(),
(__mmask64) -1);
return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
__m512i __B)
_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
(__v32hi) __B,
(__v64qi) __W,
(__mmask64) __M);
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_packs_epi16(__A, __B),
(__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
(__v32hi) __B,
(__v64qi) _mm512_setzero_qi(),
__M);
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_packs_epi16(__A, __B),
(__v64qi)_mm512_setzero_qi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_packus_epi32 (__m512i __A, __m512i __B)
_mm512_packus_epi32(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
(__v16si) __B,
(__v32hi) _mm512_setzero_hi(),
(__mmask32) -1);
return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
(__v16si) __B,
(__v32hi) _mm512_setzero_hi(),
__M);
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
(__v32hi)_mm512_packus_epi32(__A, __B),
(__v32hi)_mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
__m512i __B)
_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
(__v16si) __B,
(__v32hi) __W,
__M);
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
(__v32hi)_mm512_packus_epi32(__A, __B),
(__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_packus_epi16 (__m512i __A, __m512i __B)
_mm512_packus_epi16(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
(__v32hi) __B,
(__v64qi) _mm512_setzero_qi(),
(__mmask64) -1);
return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
__m512i __B)
_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
(__v32hi) __B,
(__v64qi) __W,
(__mmask64) __M);
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_packus_epi16(__A, __B),
(__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
(__v32hi) __B,
(__v64qi) _mm512_setzero_qi(),
(__mmask64) __M);
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_packus_epi16(__A, __B),
(__v64qi)_mm512_setzero_qi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS

View File

@ -995,51 +995,50 @@ _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_broadcast_f32x8 (__m256 __A)
_mm512_broadcast_f32x8(__m256 __A)
{
return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
_mm512_undefined_ps(),
(__mmask16) -1);
return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A,
0, 1, 2, 3, 4, 5, 6, 7,
0, 1, 2, 3, 4, 5, 6, 7);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A)
{
return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
(__v16sf)__O,
__M);
return (__m512)__builtin_ia32_selectps_512((__mmask8)__M,
(__v16sf)_mm512_broadcast_f32x8(__A),
(__v16sf)__O);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
{
return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
(__v16sf)_mm512_setzero_ps (),
__M);
return (__m512)__builtin_ia32_selectps_512((__mmask8)__M,
(__v16sf)_mm512_broadcast_f32x8(__A),
(__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_broadcast_f64x2 (__m128d __A)
_mm512_broadcast_f64x2(__m128d __A)
{
return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
(__v8df)_mm512_undefined_pd(),
(__mmask8) -1);
return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
0, 1, 0, 1, 0, 1, 0, 1);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A)
{
return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
(__v8df)
__O, __M);
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
(__v8df)_mm512_broadcast_f64x2(__A),
(__v8df)__O);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
{
return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
(__v8df)_mm512_setzero_ps (),
__M);
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
(__v8df)_mm512_broadcast_f64x2(__A),
(__v8df)_mm512_setzero_pd());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@ -1067,52 +1066,50 @@ _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcast_i32x8 (__m256i __A)
_mm512_broadcast_i32x8(__m256i __A)
{
return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
(__v16si)_mm512_setzero_si512(),
(__mmask16) -1);
return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A,
0, 1, 2, 3, 4, 5, 6, 7,
0, 1, 2, 3, 4, 5, 6, 7);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A)
{
return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
(__v16si)__O,
__M);
return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M,
(__v16si)_mm512_broadcast_i32x8(__A),
(__v16si)__O);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
{
return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
(__v16si)
_mm512_setzero_si512 (),
__M);
return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M,
(__v16si)_mm512_broadcast_i32x8(__A),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcast_i64x2 (__m128i __A)
_mm512_broadcast_i64x2(__m128i __A)
{
return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
(__v8di)_mm512_setzero_si512(),
(__mmask8) -1);
return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
0, 1, 0, 1, 0, 1, 0, 1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A)
{
return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
(__v8di)
__O, __M);
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
(__v8di)_mm512_broadcast_i64x2(__A),
(__v8di)__O);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
{
return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
(__v8di)_mm512_setzero_si512 (),
__M);
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
(__v8di)_mm512_broadcast_i64x2(__A),
(__v8di)_mm512_setzero_si512());
}
#define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \

View File

@ -528,6 +528,116 @@ _mm512_mask2int(__mmask16 __a)
return (int)__a;
}
/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a
/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
/// contain the value of the source vector. The upper 384 bits are set
/// to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
/// contain the value of the parameter. The upper 384 bits are set to zero.
static __inline __m512d __DEFAULT_FN_ATTRS
_mm512_zextpd128_pd512(__m128d __a)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
}
/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a
/// 256-bit floating-point vector of [4 x double]. The lower 256 bits
/// contain the value of the source vector. The upper 256 bits are set
/// to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double].
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
/// contain the value of the parameter. The upper 256 bits are set to zero.
static __inline __m512d __DEFAULT_FN_ATTRS
_mm512_zextpd256_pd512(__m256d __a)
{
return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
}
/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a
/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
/// the value of the source vector. The upper 384 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
/// contain the value of the parameter. The upper 384 bits are set to zero.
static __inline __m512 __DEFAULT_FN_ATTRS
_mm512_zextps128_ps512(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
}
/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a
/// 256-bit floating-point vector of [8 x float]. The lower 256 bits contain
/// the value of the source vector. The upper 256 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
/// contain the value of the parameter. The upper 256 bits are set to zero.
static __inline __m512 __DEFAULT_FN_ATTRS
_mm512_zextps256_ps512(__m256 __a)
{
return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}
/// \brief Constructs a 512-bit integer vector from a 128-bit integer vector.
/// The lower 128 bits contain the value of the source vector. The upper
/// 384 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
/// the parameter. The upper 384 bits are set to zero.
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_zextsi128_si512(__m128i __a)
{
return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
}
/// \brief Constructs a 512-bit integer vector from a 256-bit integer vector.
/// The lower 256 bits contain the value of the source vector. The upper
/// 256 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
/// A 256-bit integer vector.
/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
/// the parameter. The upper 256 bits are set to zero.
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_zextsi256_si512(__m256i __a)
{
return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
}
/* Bitwise operators */
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_and_epi32(__m512i __a, __m512i __b)
@ -4179,7 +4289,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
{
return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
(__v16si)
(__v16si)
_mm512_setzero_si512 (),
(__mmask16) __U ,
_MM_FROUND_CUR_DIRECTION);
@ -4229,6 +4339,18 @@ _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
_MM_FROUND_CUR_DIRECTION);
}
static __inline__ double __DEFAULT_FN_ATTRS
_mm512_cvtsd_f64(__m512d __a)
{
return __a[0];
}
static __inline__ float __DEFAULT_FN_ATTRS
_mm512_cvtss_f32(__m512 __a)
{
return __a[0];
}
/* Unpack and Interleave */
static __inline __m512d __DEFAULT_FN_ATTRS
@ -4540,7 +4662,7 @@ _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
}
static __inline __m512d __DEFAULT_FN_ATTRS
_mm512_loadu_pd(double const *__p)
_mm512_loadu_pd(void const *__p)
{
struct __loadu_pd {
__m512d __v;
@ -4549,7 +4671,7 @@ _mm512_loadu_pd(double const *__p)
}
static __inline __m512 __DEFAULT_FN_ATTRS
_mm512_loadu_ps(float const *__p)
_mm512_loadu_ps(void const *__p)
{
struct __loadu_ps {
__m512 __v;
@ -4558,7 +4680,7 @@ _mm512_loadu_ps(float const *__p)
}
static __inline __m512 __DEFAULT_FN_ATTRS
_mm512_load_ps(float const *__p)
_mm512_load_ps(void const *__p)
{
return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
(__v16sf)
@ -4584,7 +4706,7 @@ _mm512_maskz_load_ps(__mmask16 __U, void const *__P)
}
static __inline __m512d __DEFAULT_FN_ATTRS
_mm512_load_pd(double const *__p)
_mm512_load_pd(void const *__p)
{
return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
(__v8df)
@ -7278,107 +7400,97 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
(__mmask8)(U), (int)(R)); })
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_broadcast_f32x4 (__m128 __A)
_mm512_broadcast_f32x4(__m128 __A)
{
return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
(__v16sf)
_mm512_undefined_ps (),
(__mmask16) -1);
return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
0, 1, 2, 3, 0, 1, 2, 3,
0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
{
return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
(__v16sf) __O,
__M);
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x4(__A),
(__v16sf)__O);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
{
return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
(__v16sf)
_mm512_setzero_ps (),
__M);
return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
(__v16sf)_mm512_broadcast_f32x4(__A),
(__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_broadcast_f64x4 (__m256d __A)
_mm512_broadcast_f64x4(__m256d __A)
{
return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
(__v8df)
_mm512_undefined_pd (),
(__mmask8) -1);
return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
{
return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
(__v8df) __O,
__M);
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
(__v8df)_mm512_broadcast_f64x4(__A),
(__v8df)__O);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
{
return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
(__v8df)
_mm512_setzero_pd (),
__M);
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
(__v8df)_mm512_broadcast_f64x4(__A),
(__v8df)_mm512_setzero_pd());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcast_i32x4 (__m128i __A)
_mm512_broadcast_i32x4(__m128i __A)
{
return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
(__v16si)
_mm512_undefined_epi32 (),
(__mmask16) -1);
return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
0, 1, 2, 3, 0, 1, 2, 3,
0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
{
return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
(__v16si) __O,
__M);
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x4(__A),
(__v16si)__O);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
{
return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
(__v16si)
_mm512_setzero_si512 (),
__M);
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
(__v16si)_mm512_broadcast_i32x4(__A),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcast_i64x4 (__m256i __A)
_mm512_broadcast_i64x4(__m256i __A)
{
return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
(__v8di)
_mm512_undefined_epi32 (),
(__mmask8) -1);
return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
{
return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
(__v8di) __O,
__M);
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
(__v8di)_mm512_broadcast_i64x4(__A),
(__v8di)__O);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
{
return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
(__v8di)
_mm512_setzero_si512 (),
__M);
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
(__v8di)_mm512_broadcast_i64x4(__A),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
@ -7860,12 +7972,12 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
3 + ((imm) & 0x3) * 4); })
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
(__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
(__v4si)__W); })
(__v4si)(W)); })
#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
(__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
(__v4si)_mm_setzero_si128()); })
@ -7878,12 +7990,12 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
((imm) & 1) ? 7 : 3); })
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
(__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
(__v4di)__W); })
(__v4di)(W)); })
#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
(__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
(__v4di)_mm256_setzero_si256()); })
@ -8159,11 +8271,11 @@ _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
(__v8di)(__m512i)(index), (__mmask8)-1, \
(int)(scale)); })
#define _mm512_mask_i64gather_ps( __v1_old, __mask, __index,\
__addr, __scale) __extension__({\
__builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\
__addr,(__v8di) __index, __mask, __scale);\
})
#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__({\
(__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
(float const *)(addr), \
(__v8di)(__m512i)(index), \
(__mmask8)(mask), (int)(scale)); })
#define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\
(__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_ps(), \
@ -8858,6 +8970,8 @@ _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
(__mmask16) -1);
}
#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
__m512i __Y)
@ -8868,6 +8982,8 @@ _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
__M);
}
#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kand (__mmask16 __A, __mmask16 __B)
{
@ -8919,25 +9035,29 @@ _mm512_kxor (__mmask16 __A, __mmask16 __B)
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_stream_si512 (__m512i * __P, __m512i __A)
{
__builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
typedef __v8di __v8di_aligned __attribute__((aligned(64)));
__builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_stream_load_si512 (void *__P)
{
return __builtin_ia32_movntdqa512 ((__v8di *)__P);
typedef __v8di __v8di_aligned __attribute__((aligned(64)));
return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_stream_pd (double *__P, __m512d __A)
{
__builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
typedef __v8df __v8df_aligned __attribute__((aligned(64)));
__builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_stream_ps (float *__P, __m512 __A)
{
__builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
__builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
@ -9101,39 +9221,39 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
__m128 res = __A;
__m128 res = __A;
res[0] = (__U & 1) ? __B[0] : __W[0];
return res;
return res;
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
__m128 res = __A;
res[0] = (__U & 1) ? __B[0] : 0;
return res;
__m128 res = __A;
res[0] = (__U & 1) ? __B[0] : 0;
return res;
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
__m128d res = __A;
__m128d res = __A;
res[0] = (__U & 1) ? __B[0] : __W[0];
return res;
return res;
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
__m128d res = __A;
res[0] = (__U & 1) ? __B[0] : 0;
return res;
__m128d res = __A;
res[0] = (__U & 1) ? __B[0] : 0;
return res;
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
{
__builtin_ia32_storess128_mask ((__v16sf *)__W,
__builtin_ia32_storess128_mask ((__v16sf *)__W,
(__v16sf) _mm512_castps128_ps512(__A),
(__mmask16) __U & (__mmask16)1);
}
@ -9141,7 +9261,7 @@ _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
{
__builtin_ia32_storesd128_mask ((__v8df *)__W,
__builtin_ia32_storesd128_mask ((__v8df *)__W,
(__v8df) _mm512_castpd128_pd512(__A),
(__mmask8) __U & 1);
}
@ -9490,7 +9610,7 @@ _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
{
return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
(__v2df)(__B),
(__v4sf)(__W),
(__v4sf)(__W),
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}
@ -9499,7 +9619,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
{
return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
(__v2df)(__B),
(__v4sf)_mm_setzero_ps(),
(__v4sf)_mm_setzero_ps(),
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}
@ -9564,7 +9684,7 @@ _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
(__v4sf)(__B),
(__v2df)(__W),
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
@ -9572,8 +9692,8 @@ _mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
{
return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
(__v4sf)(__B),
(__v2df)_mm_setzero_pd(),
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
(__v2df)_mm_setzero_pd(),
(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
@ -9635,6 +9755,45 @@ _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
}
#endif
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
char __e4, char __e3, char __e2, char __e1, char __e0) {
return __extension__ (__m512i)(__v64qi)
{__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
__e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
__e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
__e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
__e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
__e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
__e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
__e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
}
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
short __e27, short __e26, short __e25, short __e24, short __e23,
short __e22, short __e21, short __e20, short __e19, short __e18,
short __e17, short __e16, short __e15, short __e14, short __e13,
short __e12, short __e11, short __e10, short __e9, short __e8,
short __e7, short __e6, short __e5, short __e4, short __e3,
short __e2, short __e1, short __e0) {
return __extension__ (__m512i)(__v32hi)
{__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
__e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
__e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
__e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
}
static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_set_epi32 (int __A, int __B, int __C, int __D,
int __E, int __F, int __G, int __H,
@ -9780,7 +9939,7 @@ static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
}
// Vec512 - Vector with size 512.
// Vec512Neutral - All vector elements set to the identity element.
// Vec512Neutral - All vector elements set to the identity element.
// Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0}
// Operator - Can be one of following: +,*,&,|
// Mask - Intrinsic Mask
@ -9810,19 +9969,19 @@ _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
static __inline__ long long __DEFAULT_FN_ATTRS
_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
&, __M, i, i, q);
}
static __inline__ long long __DEFAULT_FN_ATTRS
_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M,
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M,
i, i, q);
}
static __inline__ double __DEFAULT_FN_ATTRS
_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
f, d, pd);
}
@ -9884,17 +10043,17 @@ _mm512_reduce_add_epi32(__m512i __W) {
_mm512_reduce_operator_32bit(__W, +, i, i);
}
static __inline__ int __DEFAULT_FN_ATTRS
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_reduce_mul_epi32(__m512i __W) {
_mm512_reduce_operator_32bit(__W, *, i, i);
}
static __inline__ int __DEFAULT_FN_ATTRS
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_reduce_and_epi32(__m512i __W) {
_mm512_reduce_operator_32bit(__W, &, i, i);
}
static __inline__ int __DEFAULT_FN_ATTRS
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_reduce_or_epi32(__m512i __W) {
_mm512_reduce_operator_32bit(__W, |, i, i);
}
@ -9910,7 +10069,7 @@ _mm512_reduce_mul_ps(__m512 __W) {
}
// Vec512 - Vector with size 512.
// Vec512Neutral - All vector elements set to the identity element.
// Vec512Neutral - All vector elements set to the identity element.
// Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0}
// Operator - Can be one of following: +,*,&,|
// Mask - Intrinsic Mask
@ -9940,7 +10099,7 @@ _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
i, i, d);
}
@ -10003,7 +10162,7 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
return Vec512[0]; \
})
static __inline__ long long __DEFAULT_FN_ATTRS
static __inline__ long long __DEFAULT_FN_ATTRS
_mm512_reduce_max_epi64(__m512i __V) {
_mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
}
@ -10013,7 +10172,7 @@ _mm512_reduce_max_epu64(__m512i __V) {
_mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
}
static __inline__ double __DEFAULT_FN_ATTRS
static __inline__ double __DEFAULT_FN_ATTRS
_mm512_reduce_max_pd(__m512d __V) {
_mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
}
@ -10028,7 +10187,7 @@ _mm512_reduce_min_epu64(__m512i __V) {
_mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
}
static __inline__ double __DEFAULT_FN_ATTRS
static __inline__ double __DEFAULT_FN_ATTRS
_mm512_reduce_min_pd(__m512d __V) {
_mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
}

View File

@ -1000,27 +1000,26 @@ _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_f64x2 (__m128d __A)
_mm256_broadcast_f64x2(__m128d __A)
{
return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
(__v4df)_mm256_undefined_pd(),
(__mmask8) -1);
return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
0, 1, 0, 1);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A)
_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
{
return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
(__v4df) __O,
__M);
return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
(__v4df)_mm256_broadcast_f64x2(__A),
(__v4df)__O);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
{
return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
(__v4df) _mm256_setzero_ps (),
__M);
return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
(__v4df)_mm256_broadcast_f64x2(__A),
(__v4df)_mm256_setzero_pd());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@ -1072,27 +1071,26 @@ _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcast_i64x2 (__m128i __A)
_mm256_broadcast_i64x2(__m128i __A)
{
return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
(__v4di)_mm256_undefined_si256(),
(__mmask8) -1);
return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
0, 1, 0, 1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A)
_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
{
return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
(__v4di) __O,
__M);
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
(__v4di)_mm256_broadcast_i64x2(__A),
(__v4di)__O);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
{
return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
(__v4di) _mm256_setzero_si256 (),
__M);
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
(__v4di)_mm256_broadcast_i64x2(__A),
(__v4di)_mm256_setzero_si256());
}
#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \

View File

@ -7189,52 +7189,49 @@ _mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_f32x4 (__m128 __A)
_mm256_broadcast_f32x4(__m128 __A)
{
return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
(__v8sf)_mm256_undefined_pd (),
(__mmask8) -1);
return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A)
_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A)
{
return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
(__v8sf) __O,
__M);
return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
(__v8sf)_mm256_broadcast_f32x4(__A),
(__v8sf)__O);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
{
return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
(__v8sf) _mm256_setzero_ps (),
__M);
return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
(__v8sf)_mm256_broadcast_f32x4(__A),
(__v8sf)_mm256_setzero_ps());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcast_i32x4 (__m128i __A)
_mm256_broadcast_i32x4(__m128i __A)
{
return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
(__v8si)_mm256_undefined_si256 (),
(__mmask8) -1);
return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
0, 1, 2, 3, 0, 1, 2, 3);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A)
_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A)
{
return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
(__v8si)
__O, __M);
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
(__v8si)_mm256_broadcast_i32x4(__A),
(__v8si)__O);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A)
_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A)
{
return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
__A,
(__v8si) _mm256_setzero_si256 (),
__M);
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
(__v8si)_mm256_broadcast_i32x4(__A),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS

View File

@ -0,0 +1,70 @@
/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics
*------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VPOPCNTDQINTRIN_H
#define __AVX512VPOPCNTDQINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntd" \
"q")))
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
return (__m512i)__builtin_ia32_selectq_512(
(__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
}
#undef __DEFAULT_FN_ATTRS
#endif

View File

@ -1458,12 +1458,13 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \brief Computes two dot products in parallel, using the lower and upper
/// halves of two [8 x float] vectors as input to the two computations, and
/// returning the two dot products in the lower and upper halves of the
/// [8 x float] result. The immediate integer operand controls which input
/// elements will contribute to the dot product, and where the final results
/// are returned. In general, for each dot product, the four corresponding
/// elements of the input vectors are multiplied; the first two and second
/// two products are summed, then the two sums are added to form the final
/// result.
/// [8 x float] result.
///
/// The immediate integer operand controls which input elements will
/// contribute to the dot product, and where the final results are returned.
/// In general, for each dot product, the four corresponding elements of the
/// input vectors are multiplied; the first two and second two products are
/// summed, then the two sums are added to form the final result.
///
/// \headerfile <x86intrin.h>
///
@ -1497,15 +1498,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/* Vector shuffle */
/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
/// specified by the immediate value operand. The four selected elements in
/// each operand are copied to the destination according to the bits
/// specified in the immediate operand. The selected elements from the first
/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the
/// destination, and the selected elements from the second 256-bit operand
/// are copied to bits [127:64] and bits [255:192] of the destination. For
/// example, if bits [7:0] of the immediate operand contain a value of 0xFF,
/// the 256-bit destination vector would contain the following values: b[7],
/// b[7], a[7], a[7], b[3], b[3], a[3], a[3].
/// specified by the immediate value operand.
///
/// The four selected elements in each operand are copied to the destination
/// according to the bits specified in the immediate operand. The selected
/// elements from the first 256-bit operand are copied to bits [63:0] and
/// bits [191:128] of the destination, and the selected elements from the
/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
/// the destination. For example, if bits [7:0] of the immediate operand
/// contain a value of 0xFF, the 256-bit destination vector would contain the
/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
///
/// \headerfile <x86intrin.h>
///
@ -1557,13 +1559,14 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
12 + (((mask) >> 6) & 0x3)); })
/// \brief Selects four double-precision values from the 256-bit operands of
/// [4 x double], as specified by the immediate value operand. The selected
/// elements from the first 256-bit operand are copied to bits [63:0] and
/// bits [191:128] in the destination, and the selected elements from the
/// second 256-bit operand are copied to bits [127:64] and bits [255:192] in
/// the destination. For example, if bits [3:0] of the immediate operand
/// contain a value of 0xF, the 256-bit destination vector would contain the
/// following values: b[3], a[3], b[1], a[1].
/// [4 x double], as specified by the immediate value operand.
///
/// The selected elements from the first 256-bit operand are copied to bits
/// [63:0] and bits [191:128] in the destination, and the selected elements
/// from the second 256-bit operand are copied to bits [127:64] and bits
/// [255:192] in the destination. For example, if bits [3:0] of the immediate
/// operand contain a value of 0xF, the 256-bit destination vector would
/// contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
@ -1613,9 +1616,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */
#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */
#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
@ -1628,10 +1631,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */
#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */
#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
@ -1641,9 +1644,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \brief Compares each of the corresponding double-precision values of two
/// 128-bit vectors of [2 x double], using the operation specified by the
/// immediate integer operand. Returns a [2 x double] vector consisting of
/// two doubles corresponding to the two comparison results: zero if the
/// comparison is false, and all 1's if the comparison is true.
/// immediate integer operand.
///
/// Returns a [2 x double] vector consisting of two doubles corresponding to
/// the two comparison results: zero if the comparison is false, and all 1's
/// if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
@ -1660,17 +1665,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
/// 00h, 08h, 10h, 18h: Equal \n
/// 01h, 09h, 11h, 19h: Less than \n
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
/// (swapped operands) \n
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// 0x00 : Equal (ordered, non-signaling)
/// 0x01 : Less-than (ordered, signaling)
/// 0x02 : Less-than-or-equal (ordered, signaling)
/// 0x03 : Unordered (non-signaling)
/// 0x04 : Not-equal (unordered, non-signaling)
/// 0x05 : Not-less-than (unordered, signaling)
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
/// 0x07 : Ordered (non-signaling)
/// 0x08 : Equal (unordered, non-signaling)
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
/// 0x0a : Not-greater-than (unordered, signaling)
/// 0x0b : False (ordered, non-signaling)
/// 0x0c : Not-equal (ordered, non-signaling)
/// 0x0d : Greater-than-or-equal (ordered, signaling)
/// 0x0e : Greater-than (ordered, signaling)
/// 0x0f : True (unordered, non-signaling)
/// 0x10 : Equal (ordered, signaling)
/// 0x11 : Less-than (ordered, non-signaling)
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
/// 0x13 : Unordered (signaling)
/// 0x14 : Not-equal (unordered, signaling)
/// 0x15 : Not-less-than (unordered, non-signaling)
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
/// 0x17 : Ordered (signaling)
/// 0x18 : Equal (unordered, signaling)
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
/// 0x1a : Not-greater-than (unordered, non-signaling)
/// 0x1b : False (ordered, signaling)
/// 0x1c : Not-equal (ordered, signaling)
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
/// 0x1e : Greater-than (ordered, non-signaling)
/// 0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
@ -1678,9 +1704,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \brief Compares each of the corresponding values of two 128-bit vectors of
/// [4 x float], using the operation specified by the immediate integer
/// operand. Returns a [4 x float] vector consisting of four floats
/// corresponding to the four comparison results: zero if the comparison is
/// false, and all 1's if the comparison is true.
/// operand.
///
/// Returns a [4 x float] vector consisting of four floats corresponding to
/// the four comparison results: zero if the comparison is false, and all 1's
/// if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
@ -1697,17 +1725,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
/// 00h, 08h, 10h, 18h: Equal \n
/// 01h, 09h, 11h, 19h: Less than \n
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
/// (swapped operands) \n
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// 0x00 : Equal (ordered, non-signaling)
/// 0x01 : Less-than (ordered, signaling)
/// 0x02 : Less-than-or-equal (ordered, signaling)
/// 0x03 : Unordered (non-signaling)
/// 0x04 : Not-equal (unordered, non-signaling)
/// 0x05 : Not-less-than (unordered, signaling)
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
/// 0x07 : Ordered (non-signaling)
/// 0x08 : Equal (unordered, non-signaling)
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
/// 0x0a : Not-greater-than (unordered, signaling)
/// 0x0b : False (ordered, non-signaling)
/// 0x0c : Not-equal (ordered, non-signaling)
/// 0x0d : Greater-than-or-equal (ordered, signaling)
/// 0x0e : Greater-than (ordered, signaling)
/// 0x0f : True (unordered, non-signaling)
/// 0x10 : Equal (ordered, signaling)
/// 0x11 : Less-than (ordered, non-signaling)
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
/// 0x13 : Unordered (signaling)
/// 0x14 : Not-equal (unordered, signaling)
/// 0x15 : Not-less-than (unordered, non-signaling)
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
/// 0x17 : Ordered (signaling)
/// 0x18 : Equal (unordered, signaling)
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
/// 0x1a : Not-greater-than (unordered, non-signaling)
/// 0x1b : False (ordered, signaling)
/// 0x1c : Not-equal (ordered, signaling)
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
/// 0x1e : Greater-than (ordered, non-signaling)
/// 0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
@ -1715,9 +1764,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \brief Compares each of the corresponding double-precision values of two
/// 256-bit vectors of [4 x double], using the operation specified by the
/// immediate integer operand. Returns a [4 x double] vector consisting of
/// four doubles corresponding to the four comparison results: zero if the
/// comparison is false, and all 1's if the comparison is true.
/// immediate integer operand.
///
/// Returns a [4 x double] vector consisting of four doubles corresponding to
/// the four comparison results: zero if the comparison is false, and all 1's
/// if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
@ -1734,17 +1785,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
/// 00h, 08h, 10h, 18h: Equal \n
/// 01h, 09h, 11h, 19h: Less than \n
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
/// (swapped operands) \n
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// 0x00 : Equal (ordered, non-signaling)
/// 0x01 : Less-than (ordered, signaling)
/// 0x02 : Less-than-or-equal (ordered, signaling)
/// 0x03 : Unordered (non-signaling)
/// 0x04 : Not-equal (unordered, non-signaling)
/// 0x05 : Not-less-than (unordered, signaling)
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
/// 0x07 : Ordered (non-signaling)
/// 0x08 : Equal (unordered, non-signaling)
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
/// 0x0a : Not-greater-than (unordered, signaling)
/// 0x0b : False (ordered, non-signaling)
/// 0x0c : Not-equal (ordered, non-signaling)
/// 0x0d : Greater-than-or-equal (ordered, signaling)
/// 0x0e : Greater-than (ordered, signaling)
/// 0x0f : True (unordered, non-signaling)
/// 0x10 : Equal (ordered, signaling)
/// 0x11 : Less-than (ordered, non-signaling)
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
/// 0x13 : Unordered (signaling)
/// 0x14 : Not-equal (unordered, signaling)
/// 0x15 : Not-less-than (unordered, non-signaling)
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
/// 0x17 : Ordered (signaling)
/// 0x18 : Equal (unordered, signaling)
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
/// 0x1a : Not-greater-than (unordered, non-signaling)
/// 0x1b : False (ordered, signaling)
/// 0x1c : Not-equal (ordered, signaling)
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
/// 0x1e : Greater-than (ordered, non-signaling)
/// 0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
@ -1752,9 +1824,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \brief Compares each of the corresponding values of two 256-bit vectors of
/// [8 x float], using the operation specified by the immediate integer
/// operand. Returns a [8 x float] vector consisting of eight floats
/// corresponding to the eight comparison results: zero if the comparison is
/// false, and all 1's if the comparison is true.
/// operand.
///
/// Returns a [8 x float] vector consisting of eight floats corresponding to
/// the eight comparison results: zero if the comparison is false, and all
/// 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
@ -1771,17 +1845,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
/// 00h, 08h, 10h, 18h: Equal \n
/// 01h, 09h, 11h, 19h: Less than \n
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
/// (swapped operands) \n
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// 0x00 : Equal (ordered, non-signaling)
/// 0x01 : Less-than (ordered, signaling)
/// 0x02 : Less-than-or-equal (ordered, signaling)
/// 0x03 : Unordered (non-signaling)
/// 0x04 : Not-equal (unordered, non-signaling)
/// 0x05 : Not-less-than (unordered, signaling)
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
/// 0x07 : Ordered (non-signaling)
/// 0x08 : Equal (unordered, non-signaling)
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
/// 0x0a : Not-greater-than (unordered, signaling)
/// 0x0b : False (ordered, non-signaling)
/// 0x0c : Not-equal (ordered, non-signaling)
/// 0x0d : Greater-than-or-equal (ordered, signaling)
/// 0x0e : Greater-than (ordered, signaling)
/// 0x0f : True (unordered, non-signaling)
/// 0x10 : Equal (ordered, signaling)
/// 0x11 : Less-than (ordered, non-signaling)
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
/// 0x13 : Unordered (signaling)
/// 0x14 : Not-equal (unordered, signaling)
/// 0x15 : Not-less-than (unordered, non-signaling)
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
/// 0x17 : Ordered (signaling)
/// 0x18 : Equal (unordered, signaling)
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
/// 0x1a : Not-greater-than (unordered, non-signaling)
/// 0x1b : False (ordered, signaling)
/// 0x1c : Not-equal (ordered, signaling)
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
/// 0x1e : Greater-than (ordered, non-signaling)
/// 0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
@ -1789,8 +1884,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \brief Compares each of the corresponding scalar double-precision values of
/// two 128-bit vectors of [2 x double], using the operation specified by the
/// immediate integer operand. If the result is true, all 64 bits of the
/// destination vector are set; otherwise they are cleared.
/// immediate integer operand.
///
/// If the result is true, all 64 bits of the destination vector are set;
/// otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
@ -1807,17 +1904,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
/// 00h, 08h, 10h, 18h: Equal \n
/// 01h, 09h, 11h, 19h: Less than \n
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
/// (swapped operands) \n
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// 0x00 : Equal (ordered, non-signaling)
/// 0x01 : Less-than (ordered, signaling)
/// 0x02 : Less-than-or-equal (ordered, signaling)
/// 0x03 : Unordered (non-signaling)
/// 0x04 : Not-equal (unordered, non-signaling)
/// 0x05 : Not-less-than (unordered, signaling)
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
/// 0x07 : Ordered (non-signaling)
/// 0x08 : Equal (unordered, non-signaling)
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
/// 0x0a : Not-greater-than (unordered, signaling)
/// 0x0b : False (ordered, non-signaling)
/// 0x0c : Not-equal (ordered, non-signaling)
/// 0x0d : Greater-than-or-equal (ordered, signaling)
/// 0x0e : Greater-than (ordered, signaling)
/// 0x0f : True (unordered, non-signaling)
/// 0x10 : Equal (ordered, signaling)
/// 0x11 : Less-than (ordered, non-signaling)
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
/// 0x13 : Unordered (signaling)
/// 0x14 : Not-equal (unordered, signaling)
/// 0x15 : Not-less-than (unordered, non-signaling)
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
/// 0x17 : Ordered (signaling)
/// 0x18 : Equal (unordered, signaling)
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
/// 0x1a : Not-greater-than (unordered, non-signaling)
/// 0x1b : False (ordered, signaling)
/// 0x1c : Not-equal (ordered, signaling)
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
/// 0x1e : Greater-than (ordered, non-signaling)
/// 0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) __extension__ ({ \
(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
@ -1825,8 +1943,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \brief Compares each of the corresponding scalar values of two 128-bit
/// vectors of [4 x float], using the operation specified by the immediate
/// integer operand. If the result is true, all 32 bits of the destination
/// vector are set; otherwise they are cleared.
/// integer operand.
///
/// If the result is true, all 32 bits of the destination vector are set;
/// otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
@ -1843,17 +1963,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \param c
/// An immediate integer operand, with bits [4:0] specifying which comparison
/// operation to use: \n
/// 00h, 08h, 10h, 18h: Equal \n
/// 01h, 09h, 11h, 19h: Less than \n
/// 02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
/// (swapped operands) \n
/// 03h, 0Bh, 13h, 1Bh: Unordered \n
/// 04h, 0Ch, 14h, 1Ch: Not equal \n
/// 05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
/// (swapped operands) \n
/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
/// (swapped operands) \n
/// 07h, 0Fh, 17h, 1Fh: Ordered
/// 0x00 : Equal (ordered, non-signaling)
/// 0x01 : Less-than (ordered, signaling)
/// 0x02 : Less-than-or-equal (ordered, signaling)
/// 0x03 : Unordered (non-signaling)
/// 0x04 : Not-equal (unordered, non-signaling)
/// 0x05 : Not-less-than (unordered, signaling)
/// 0x06 : Not-less-than-or-equal (unordered, signaling)
/// 0x07 : Ordered (non-signaling)
/// 0x08 : Equal (unordered, non-signaling)
/// 0x09 : Not-greater-than-or-equal (unordered, signaling)
/// 0x0a : Not-greater-than (unordered, signaling)
/// 0x0b : False (ordered, non-signaling)
/// 0x0c : Not-equal (ordered, non-signaling)
/// 0x0d : Greater-than-or-equal (ordered, signaling)
/// 0x0e : Greater-than (ordered, signaling)
/// 0x0f : True (unordered, non-signaling)
/// 0x10 : Equal (ordered, signaling)
/// 0x11 : Less-than (ordered, non-signaling)
/// 0x12 : Less-than-or-equal (ordered, non-signaling)
/// 0x13 : Unordered (signaling)
/// 0x14 : Not-equal (unordered, signaling)
/// 0x15 : Not-less-than (unordered, non-signaling)
/// 0x16 : Not-less-than-or-equal (unordered, non-signaling)
/// 0x17 : Ordered (signaling)
/// 0x18 : Equal (unordered, signaling)
/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling)
/// 0x1a : Not-greater-than (unordered, non-signaling)
/// 0x1b : False (ordered, signaling)
/// 0x1c : Not-equal (ordered, signaling)
/// 0x1d : Greater-than-or-equal (ordered, non-signaling)
/// 0x1e : Greater-than (ordered, non-signaling)
/// 0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) __extension__ ({ \
(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
@ -2184,12 +2325,32 @@ _mm256_cvttps_epi32(__m256 __a)
return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
}
/// \brief Returns the first element of the input vector of [4 x double].
///
/// \headerfile <avxintrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __a
/// A 256-bit vector of [4 x double].
/// \returns A 64 bit double containing the first element of the input vector.
static __inline double __DEFAULT_FN_ATTRS
_mm256_cvtsd_f64(__m256d __a)
{
return __a[0];
}
/// \brief Returns the first element of the input vector of [8 x i32].
///
/// \headerfile <avxintrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __a
/// A 256-bit vector of [8 x i32].
/// \returns A 32 bit integer containing the first element of the input vector.
static __inline int __DEFAULT_FN_ATTRS
_mm256_cvtsi256_si32(__m256i __a)
{
@ -2197,6 +2358,16 @@ _mm256_cvtsi256_si32(__m256i __a)
return __b[0];
}
/// \brief Returns the first element of the input vector of [8 x float].
///
/// \headerfile <avxintrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __a
/// A 256-bit vector of [8 x float].
/// \returns A 32 bit float containing the first element of the input vector.
static __inline float __DEFAULT_FN_ATTRS
_mm256_cvtss_f32(__m256 __a)
{
@ -2380,7 +2551,9 @@ _mm256_unpacklo_ps(__m256 __a, __m256 __b)
/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
/// element-by-element comparison of the double-precision element in the
/// first source vector and the corresponding element in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of double-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2407,7 +2580,9 @@ _mm_testz_pd(__m128d __a, __m128d __b)
/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
/// element-by-element comparison of the double-precision element in the
/// first source vector and the corresponding element in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of double-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2434,7 +2609,9 @@ _mm_testc_pd(__m128d __a, __m128d __b)
/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
/// element-by-element comparison of the double-precision element in the
/// first source vector and the corresponding element in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of double-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2462,7 +2639,9 @@ _mm_testnzc_pd(__m128d __a, __m128d __b)
/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
/// element-by-element comparison of the single-precision element in the
/// first source vector and the corresponding element in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of single-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2489,7 +2668,9 @@ _mm_testz_ps(__m128 __a, __m128 __b)
/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
/// element-by-element comparison of the single-precision element in the
/// first source vector and the corresponding element in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of single-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2516,7 +2697,9 @@ _mm_testc_ps(__m128 __a, __m128 __b)
/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
/// element-by-element comparison of the single-precision element in the
/// first source vector and the corresponding element in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of single-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2544,7 +2727,9 @@ _mm_testnzc_ps(__m128 __a, __m128 __b)
/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
/// element-by-element comparison of the double-precision elements in the
/// first source vector and the corresponding elements in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of double-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2571,7 +2756,9 @@ _mm256_testz_pd(__m256d __a, __m256d __b)
/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
/// element-by-element comparison of the double-precision elements in the
/// first source vector and the corresponding elements in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of double-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2598,7 +2785,9 @@ _mm256_testc_pd(__m256d __a, __m256d __b)
/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
/// element-by-element comparison of the double-precision elements in the
/// first source vector and the corresponding elements in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of double-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2626,7 +2815,9 @@ _mm256_testnzc_pd(__m256d __a, __m256d __b)
/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
/// element-by-element comparison of the single-precision element in the
/// first source vector and the corresponding element in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of single-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2653,7 +2844,9 @@ _mm256_testz_ps(__m256 __a, __m256 __b)
/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
/// element-by-element comparison of the single-precision element in the
/// first source vector and the corresponding element in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of single-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2680,7 +2873,9 @@ _mm256_testc_ps(__m256 __a, __m256 __b)
/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
/// element-by-element comparison of the single-precision elements in the
/// first source vector and the corresponding elements in the second source
/// vector. The EFLAGS register is updated as follows: \n
/// vector.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of single-precision elements where the
/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
/// ZF flag is set to 1. \n
@ -2706,7 +2901,9 @@ _mm256_testnzc_ps(__m256 __a, __m256 __b)
}
/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
/// of the two source vectors and update the EFLAGS register as follows: \n
/// of the two source vectors.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of bits where both bits are 1, the ZF flag
/// is set to 0. Otherwise the ZF flag is set to 1. \n
/// If there is at least one pair of bits where the bit from the first source
@ -2730,7 +2927,9 @@ _mm256_testz_si256(__m256i __a, __m256i __b)
}
/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
/// of the two source vectors and update the EFLAGS register as follows: \n
/// of the two source vectors.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of bits where both bits are 1, the ZF flag
/// is set to 0. Otherwise the ZF flag is set to 1. \n
/// If there is at least one pair of bits where the bit from the first source
@ -2754,7 +2953,9 @@ _mm256_testc_si256(__m256i __a, __m256i __b)
}
/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
/// of the two source vectors and update the EFLAGS register as follows: \n
/// of the two source vectors.
///
/// The EFLAGS register is updated as follows: \n
/// If there is at least one pair of bits where both bits are 1, the ZF flag
/// is set to 0. Otherwise the ZF flag is set to 1. \n
/// If there is at least one pair of bits where the bit from the first source
@ -3389,7 +3590,8 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_si256(__m256i *__a, __m256i __b)
{
__builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
typedef __v4di __v4di_aligned __attribute__((aligned(32)));
__builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
}
/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
@ -3402,13 +3604,14 @@ _mm256_stream_si256(__m256i *__a, __m256i __b)
///
/// \param __a
/// A pointer to a 32-byte aligned memory location that will receive the
/// integer values.
/// double-precision floating-point values.
/// \param __b
/// A 256-bit vector of [4 x double] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_pd(double *__a, __m256d __b)
{
__builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
typedef __v4df __v4df_aligned __attribute__((aligned(32)));
__builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
}
/// \brief Moves single-precision floating point values from a 256-bit vector
@ -3428,7 +3631,8 @@ _mm256_stream_pd(double *__a, __m256d __b)
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_ps(float *__p, __m256 __a)
{
__builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
__builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
}
/* Create vectors */
@ -4310,9 +4514,10 @@ _mm256_castsi256_si128(__m256i __a)
}
/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
/// contain the value of the source vector. The contents of the upper 128
/// bits are undefined.
/// 128-bit floating-point vector of [2 x double].
///
/// The lower 128 bits contain the value of the source vector. The contents
/// of the upper 128 bits are undefined.
///
/// \headerfile <x86intrin.h>
///
@ -4330,9 +4535,10 @@ _mm256_castpd128_pd256(__m128d __a)
}
/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
/// the value of the source vector. The contents of the upper 128 bits are
/// undefined.
/// 128-bit floating-point vector of [4 x float].
///
/// The lower 128 bits contain the value of the source vector. The contents
/// of the upper 128 bits are undefined.
///
/// \headerfile <x86intrin.h>
///
@ -4350,6 +4556,7 @@ _mm256_castps128_ps256(__m128 __a)
}
/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
///
/// The lower 128 bits contain the value of the source vector. The contents
/// of the upper 128 bits are undefined.
///
@ -4367,6 +4574,61 @@ _mm256_castsi128_si256(__m128i __a)
return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
}
/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
/// contain the value of the source vector. The upper 128 bits are set
/// to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
/// contain the value of the parameter. The upper 128 bits are set to zero.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_zextpd128_pd256(__m128d __a)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
}
/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
/// the value of the source vector. The upper 128 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
/// contain the value of the parameter. The upper 128 bits are set to zero.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_zextps128_ps256(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
}
/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
/// The lower 128 bits contain the value of the source vector. The upper
/// 128 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
/// A 128-bit integer vector.
/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
/// the parameter. The upper 128 bits are set to zero.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_zextsi128_si256(__m128i __a)
{
return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
}
/*
Vector insert.
We use macros rather than inlines because we only want to accept
@ -4375,8 +4637,10 @@ _mm256_castsi128_si256(__m128i __a)
/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
/// a 256-bit vector of [8 x float] given in the first parameter, and then
/// replacing either the upper or the lower 128 bits with the contents of a
/// 128-bit vector of [4 x float] in the second parameter. The immediate
/// integer parameter determines between the upper or the lower 128 bits.
/// 128-bit vector of [4 x float] in the second parameter.
///
/// The immediate integer parameter determines between the upper or the lower
/// 128 bits.
///
/// \headerfile <x86intrin.h>
///
@ -4420,8 +4684,10 @@ _mm256_castsi128_si256(__m128i __a)
/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
/// a 256-bit vector of [4 x double] given in the first parameter, and then
/// replacing either the upper or the lower 128 bits with the contents of a
/// 128-bit vector of [2 x double] in the second parameter. The immediate
/// integer parameter determines between the upper or the lower 128 bits.
/// 128-bit vector of [2 x double] in the second parameter.
///
/// The immediate integer parameter determines between the upper or the lower
/// 128 bits.
///
/// \headerfile <x86intrin.h>
///
@ -4461,8 +4727,10 @@ _mm256_castsi128_si256(__m128i __a)
/// \brief Constructs a new 256-bit integer vector by first duplicating a
/// 256-bit integer vector given in the first parameter, and then replacing
/// either the upper or the lower 128 bits with the contents of a 128-bit
/// integer vector in the second parameter. The immediate integer parameter
/// determines between the upper or the lower 128 bits.
/// integer vector in the second parameter.
///
/// The immediate integer parameter determines between the upper or the lower
/// 128 bits.
///
/// \headerfile <x86intrin.h>
///

View File

@ -28,107 +28,17 @@
#ifndef __BMIINTRIN_H
#define __BMIINTRIN_H
/// \brief Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _tzcnt_u16(unsigned short a);
/// \endcode
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 16-bit integer containing the number of trailing zero
/// bits in the operand.
#define _tzcnt_u16(a) (__tzcnt_u16((a)))
/// \brief Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _andn_u32(unsigned int a, unsigned int b);
/// \endcode
///
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param a
/// An unsigned integer containing one of the operands.
/// \param b
/// An unsigned integer containing one of the operands.
/// \returns An unsigned integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
#define _andn_u32(a, b) (__andn_u32((a), (b)))
/* _bextr_u32 != __bextr_u32 */
/// \brief Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsi_u32(unsigned int a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param a
/// An unsigned integer whose bits are to be cleared.
/// \returns An unsigned integer containing the result of clearing the bits from
/// the source operand.
#define _blsi_u32(a) (__blsi_u32((a)))
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least siginificant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsmsk_u32(unsigned int a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param a
/// An unsigned integer used to create the mask.
/// \returns An unsigned integer containing the newly created mask.
#define _blsmsk_u32(a) (__blsmsk_u32((a)))
/// \brief Clears the least siginificant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsr_u32(unsigned int a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param a
/// An unsigned integer containing the operand to be cleared.
/// \returns An unsigned integer containing the result of clearing the source
/// operand.
#define _blsr_u32(a) (__blsr_u32((a)))
/// \brief Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _tzcnt_u32(unsigned int a);
/// \endcode
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of trailing zero
/// bits in the operand.
#define _tzcnt_u32(a) (__tzcnt_u32((a)))
/* Define the default attributes for the functions in this file. */
@ -238,7 +148,7 @@ __blsi_u32(unsigned int __X)
}
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least siginificant bit that is set to 1 in the source
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
@ -254,7 +164,7 @@ __blsmsk_u32(unsigned int __X)
return __X ^ (__X - 1);
}
/// \brief Clears the least siginificant bit that is set to 1 in the source
/// \brief Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
@ -305,91 +215,15 @@ _mm_tzcnt_32(unsigned int __X)
#ifdef __x86_64__
/// \brief Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b);
/// \endcode
///
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer containing one of the operands.
/// \param b
/// An unsigned 64-bit integer containing one of the operands.
/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
#define _andn_u64(a, b) (__andn_u64((a), (b)))
/* _bextr_u64 != __bextr_u64 */
/// \brief Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _blsi_u64(unsigned long long a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer whose bits are to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// bits from the source operand.
#define _blsi_u64(a) (__blsi_u64((a)))
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least siginificant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _blsmsk_u64(unsigned long long a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer used to create the mask.
/// \returns A unsigned 64-bit integer containing the newly created mask.
#define _blsmsk_u64(a) (__blsmsk_u64((a)))
/// \brief Clears the least siginificant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _blsr_u64(unsigned long long a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer containing the operand to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// source operand.
#define _blsr_u64(a) (__blsr_u64((a)))
/// \brief Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _tzcnt_u64(unsigned long long a);
/// \endcode
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of trailing zero
/// bits in the operand.
#define _tzcnt_u64(a) (__tzcnt_u64((a)))
/// \brief Performs a bitwise AND of the second operand with the one's
@ -475,7 +309,7 @@ __blsi_u64(unsigned long long __X)
}
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least siginificant bit that is set to 1 in the source
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
@ -484,14 +318,14 @@ __blsi_u64(unsigned long long __X)
///
/// \param __X
/// An unsigned 64-bit integer used to create the mask.
/// \returns A unsigned 64-bit integer containing the newly created mask.
/// \returns An unsigned 64-bit integer containing the newly created mask.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsmsk_u64(unsigned long long __X)
{
return __X ^ (__X - 1);
}
/// \brief Clears the least siginificant bit that is set to 1 in the source
/// \brief Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>

50
c_headers/clzerointrin.h Normal file
View File

@ -0,0 +1,50 @@
/*===----------------------- clzerointrin.h - CLZERO ----------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _CLZEROINTRIN_H
#define _CLZEROINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("clzero")))
/// \brief Loads the cache line address and zero's out the cacheline
///
/// \headerfile <clzerointrin.h>
///
/// This intrinsic corresponds to the <c> CLZERO </c> instruction.
///
/// \param __line
/// A pointer to a cacheline which needs to be zeroed out.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clzero (void * __line)
{
__builtin_ia32_clzero ((void *)__line);
}
#undef __DEFAULT_FN_ATTRS
#endif /* _CLZEROINTRIN_H */

View File

@ -79,7 +79,7 @@
#define signature_VORTEX_edx 0x36387865
#define signature_VORTEX_ecx 0x436f5320
/* Features in %ecx for level 1 */
/* Features in %ecx for leaf 1 */
#define bit_SSE3 0x00000001
#define bit_PCLMULQDQ 0x00000002
#define bit_PCLMUL bit_PCLMULQDQ /* for gcc compat */
@ -114,7 +114,7 @@
#define bit_F16C 0x20000000
#define bit_RDRND 0x40000000
/* Features in %edx for level 1 */
/* Features in %edx for leaf 1 */
#define bit_FPU 0x00000001
#define bit_VME 0x00000002
#define bit_DE 0x00000004
@ -147,44 +147,95 @@
#define bit_TM 0x20000000
#define bit_PBE 0x80000000
/* Features in %ebx for level 7 sub-leaf 0 */
/* Features in %ebx for leaf 7 sub-leaf 0 */
#define bit_FSGSBASE 0x00000001
#define bit_SGX 0x00000004
#define bit_BMI 0x00000008
#define bit_HLE 0x00000010
#define bit_AVX2 0x00000020
#define bit_SMEP 0x00000080
#define bit_BMI2 0x00000100
#define bit_ENH_MOVSB 0x00000200
#define bit_RTM 0x00000800
#define bit_MPX 0x00004000
#define bit_AVX512F 0x00010000
#define bit_AVX512DQ 0x00020000
#define bit_RDSEED 0x00040000
#define bit_ADX 0x00080000
#define bit_AVX512IFMA 0x00200000
#define bit_CLFLUSHOPT 0x00800000
#define bit_CLWB 0x01000000
#define bit_AVX512PF 0x04000000
#define bit_AVX51SER 0x08000000
#define bit_AVX512CD 0x10000000
#define bit_SHA 0x20000000
#define bit_AVX512BW 0x40000000
#define bit_AVX512VL 0x80000000
/* Features in %ecx for leaf 7 sub-leaf 0 */
#define bit_PREFTCHWT1 0x00000001
#define bit_AVX512VBMI 0x00000002
#define bit_PKU 0x00000004
#define bit_OSPKE 0x00000010
#define bit_AVX512VPOPCNTDQ 0x00004000
#define bit_RDPID 0x00400000
/* Features in %edx for leaf 7 sub-leaf 0 */
#define bit_AVX5124VNNIW 0x00000004
#define bit_AVX5124FMAPS 0x00000008
/* Features in %eax for leaf 13 sub-leaf 1 */
#define bit_XSAVEOPT 0x00000001
#define bit_XSAVEC 0x00000002
#define bit_XSAVES 0x00000008
/* Features in %ecx for leaf 0x80000001 */
#define bit_LAHF_LM 0x00000001
#define bit_ABM 0x00000020
#define bit_SSE4a 0x00000040
#define bit_PRFCHW 0x00000100
#define bit_XOP 0x00000800
#define bit_LWP 0x00008000
#define bit_FMA4 0x00010000
#define bit_TBM 0x00200000
#define bit_MWAITX 0x20000000
/* Features in %edx for leaf 0x80000001 */
#define bit_MMXEXT 0x00400000
#define bit_LM 0x20000000
#define bit_3DNOWP 0x40000000
#define bit_3DNOW 0x80000000
/* Features in %ebx for leaf 0x80000001 */
#define bit_CLZERO 0x00000001
#if __i386__
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__level))
: "0"(__leaf))
#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__level), "2"(__count))
: "0"(__leaf), "2"(__count))
#else
/* x86-64 uses %rbx as the base register, so preserve it. */
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
__asm(" xchgq %%rbx,%q1\n" \
" cpuid\n" \
" xchgq %%rbx,%q1" \
: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__level))
: "0"(__leaf))
#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm(" xchgq %%rbx,%q1\n" \
" cpuid\n" \
" xchgq %%rbx,%q1" \
: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__level), "2"(__count))
: "0"(__leaf), "2"(__count))
#endif
static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
unsigned int *__ebx, unsigned int *__ecx,
unsigned int *__edx) {
__cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
return 1;
}
static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig)
{
unsigned int __eax, __ebx, __ecx, __edx;
#if __i386__
@ -208,8 +259,35 @@ static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
return 0;
#endif
__cpuid(__level, __eax, __ebx, __ecx, __edx);
__cpuid(__leaf, __eax, __ebx, __ecx, __edx);
if (__sig)
*__sig = __ebx;
return __eax;
}
static __inline int __get_cpuid (unsigned int __leaf, unsigned int *__eax,
unsigned int *__ebx, unsigned int *__ecx,
unsigned int *__edx)
{
unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
if (__max_leaf == 0 || __max_leaf < __leaf)
return 0;
__cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx);
return 1;
}
static __inline int __get_cpuid_count (unsigned int __leaf,
unsigned int __subleaf,
unsigned int *__eax, unsigned int *__ebx,
unsigned int *__ecx, unsigned int *__edx)
{
unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
if (__max_leaf == 0 || __max_leaf < __leaf)
return 0;
__cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
return 1;
}

View File

@ -302,7 +302,7 @@ _mm_min_pd(__m128d __a, __m128d __b)
return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
}
/// \brief Compares lower 64-bits double-precision values of both operands, and
/// \brief Compares lower 64-bit double-precision values of both operands, and
/// returns the greater of the pair of values in the lower 64-bits of the
/// result. The upper 64 bits of the result are copied from the upper double-
/// precision value of the first operand.
@ -462,8 +462,9 @@ _mm_cmplt_pd(__m128d __a, __m128d __b)
/// \brief Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are less than or equal to those in the second operand. Each
/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
/// operand are less than or equal to those in the second operand.
///
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -482,8 +483,9 @@ _mm_cmple_pd(__m128d __a, __m128d __b)
/// \brief Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are greater than those in the second operand. Each comparison
/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
/// operand are greater than those in the second operand.
///
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -502,8 +504,9 @@ _mm_cmpgt_pd(__m128d __a, __m128d __b)
/// \brief Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are greater than or equal to those in the second operand. Each
/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
/// operand are greater than or equal to those in the second operand.
///
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -522,9 +525,10 @@ _mm_cmpge_pd(__m128d __a, __m128d __b)
/// \brief Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are ordered with respect to those in the second operand. A pair
/// of double-precision values are "ordered" with respect to each other if
/// neither value is a NaN. Each comparison yields 0h for false,
/// operand are ordered with respect to those in the second operand.
///
/// A pair of double-precision values are "ordered" with respect to each
/// other if neither value is a NaN. Each comparison yields 0h for false,
/// FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
@ -544,9 +548,10 @@ _mm_cmpord_pd(__m128d __a, __m128d __b)
/// \brief Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are unordered with respect to those in the second operand. A pair
/// of double-precision values are "unordered" with respect to each other if
/// one or both values are NaN. Each comparison yields 0h for false,
/// operand are unordered with respect to those in the second operand.
///
/// A pair of double-precision values are "unordered" with respect to each
/// other if one or both values are NaN. Each comparison yields 0h for false,
/// FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
@ -567,8 +572,9 @@ _mm_cmpunord_pd(__m128d __a, __m128d __b)
/// \brief Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are unequal to those in the second operand. Each comparison
/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
/// operand are unequal to those in the second operand.
///
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -587,8 +593,9 @@ _mm_cmpneq_pd(__m128d __a, __m128d __b)
/// \brief Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not less than those in the second operand. Each comparison
/// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
/// operand are not less than those in the second operand.
///
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -607,8 +614,9 @@ _mm_cmpnlt_pd(__m128d __a, __m128d __b)
/// \brief Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not less than or equal to those in the second operand. Each
/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
/// operand are not less than or equal to those in the second operand.
///
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -627,8 +635,9 @@ _mm_cmpnle_pd(__m128d __a, __m128d __b)
/// \brief Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not greater than those in the second operand. Each
/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
/// operand are not greater than those in the second operand.
///
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -648,6 +657,7 @@ _mm_cmpngt_pd(__m128d __a, __m128d __b)
/// \brief Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not greater than or equal to those in the second operand.
///
/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
@ -666,8 +676,9 @@ _mm_cmpnge_pd(__m128d __a, __m128d __b)
}
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality. The
/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
/// the two 128-bit floating-point vectors of [2 x double] for equality.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -690,8 +701,9 @@ _mm_cmpeq_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than the corresponding value in
/// the second parameter. The comparison yields 0h for false,
/// FFFFFFFFFFFFFFFFh for true.
/// the second parameter.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -714,8 +726,9 @@ _mm_cmplt_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter. The comparison yields 0h for
/// false, FFFFFFFFFFFFFFFFh for true.
/// corresponding value in the second parameter.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -738,8 +751,9 @@ _mm_cmple_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter. The comparison yields 0h for false,
/// FFFFFFFFFFFFFFFFh for true.
/// in the second parameter.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -763,8 +777,9 @@ _mm_cmpgt_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter. The comparison yields 0h for
/// false, FFFFFFFFFFFFFFFFh for true.
/// corresponding value in the second parameter.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -788,9 +803,11 @@ _mm_cmpge_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is "ordered" with respect to the
/// corresponding value in the second parameter. The comparison yields 0h for
/// false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are
/// "ordered" with respect to each other if neither value is a NaN.
/// corresponding value in the second parameter.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of
/// double-precision values are "ordered" with respect to each other if
/// neither value is a NaN.
///
/// \headerfile <x86intrin.h>
///
@ -813,9 +830,11 @@ _mm_cmpord_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is "unordered" with respect to the
/// corresponding value in the second parameter. The comparison yields 0h
/// for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values
/// are "unordered" with respect to each other if one or both values are NaN.
/// corresponding value in the second parameter.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of
/// double-precision values are "unordered" with respect to each other if one
/// or both values are NaN.
///
/// \headerfile <x86intrin.h>
///
@ -839,8 +858,9 @@ _mm_cmpunord_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter. The comparison yields 0h for false,
/// FFFFFFFFFFFFFFFFh for true.
/// the second parameter.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -863,8 +883,9 @@ _mm_cmpneq_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not less than the corresponding
/// value in the second parameter. The comparison yields 0h for false,
/// FFFFFFFFFFFFFFFFh for true.
/// value in the second parameter.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -887,8 +908,9 @@ _mm_cmpnlt_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not less than or equal to the
/// corresponding value in the second parameter. The comparison yields 0h
/// for false, FFFFFFFFFFFFFFFFh for true.
/// corresponding value in the second parameter.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -911,8 +933,9 @@ _mm_cmpnle_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not greater than the corresponding
/// value in the second parameter. The comparison yields 0h for false,
/// FFFFFFFFFFFFFFFFh for true.
/// value in the second parameter.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -936,8 +959,9 @@ _mm_cmpngt_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not greater than or equal to the
/// corresponding value in the second parameter. The comparison yields 0h
/// for false, FFFFFFFFFFFFFFFFh for true.
/// corresponding value in the second parameter.
///
/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -982,7 +1006,9 @@ _mm_comieq_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than the corresponding value in
/// the second parameter. The comparison yields 0 for false, 1 for true.
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
///
/// \headerfile <x86intrin.h>
///
@ -1004,8 +1030,9 @@ _mm_comilt_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter. The comparison yields 0 for
/// false, 1 for true.
/// corresponding value in the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
///
/// \headerfile <x86intrin.h>
///
@ -1027,7 +1054,9 @@ _mm_comile_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter. The comparison yields 0 for false, 1 for true.
/// in the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
///
/// \headerfile <x86intrin.h>
///
@ -1049,8 +1078,9 @@ _mm_comigt_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter. The comparison yields 0 for
/// false, 1 for true.
/// corresponding value in the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
///
/// \headerfile <x86intrin.h>
///
@ -1072,7 +1102,9 @@ _mm_comige_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter. The comparison yields 0 for false, 1 for true.
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
///
/// \headerfile <x86intrin.h>
///
@ -1093,8 +1125,9 @@ _mm_comineq_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality. The
/// comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 1 is returned.
/// comparison yields 0 for false, 1 for true.
///
/// If either of the two lower double-precision values is NaN, 1 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1117,8 +1150,10 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than the corresponding value in
/// the second parameter. The comparison yields 0 for false, 1 for true. If
/// either of the two lower double-precision values is NaN, 1 is returned.
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 1 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1141,9 +1176,10 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter. The comparison yields 0 for
/// false, 1 for true. If either of the two lower double-precision values is
/// NaN, 1 is returned.
/// corresponding value in the second parameter.
///
/// The comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 1 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1166,8 +1202,10 @@ _mm_ucomile_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter. The comparison yields 0 for false, 1 for true.
/// If either of the two lower double-precision values is NaN, 0 is returned.
/// in the second parameter.
///
/// The comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1190,9 +1228,10 @@ _mm_ucomigt_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter. The comparison yields 0 for
/// false, 1 for true. If either of the two lower double-precision values
/// is NaN, 0 is returned.
/// corresponding value in the second parameter.
///
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1215,8 +1254,10 @@ _mm_ucomige_sd(__m128d __a, __m128d __b)
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter. The comparison yields 0 for false, 1 for true. If
/// either of the two lower double-precision values is NaN, 0 is returned.
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1278,8 +1319,9 @@ _mm_cvtps_pd(__m128 __a)
/// \brief Converts the lower two integer elements of a 128-bit vector of
/// [4 x i32] into two double-precision floating-point values, returned in a
/// 128-bit vector of [2 x double]. The upper two elements of the input
/// vector are unused.
/// 128-bit vector of [2 x double].
///
/// The upper two elements of the input vector are unused.
///
/// \headerfile <x86intrin.h>
///
@ -1287,7 +1329,9 @@ _mm_cvtps_pd(__m128 __a)
///
/// \param __a
/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
/// converted to double-precision values. The upper two elements are unused.
/// converted to double-precision values.
///
/// The upper two elements are unused.
/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepi32_pd(__m128i __a)
@ -1409,10 +1453,11 @@ _mm_cvtss_sd(__m128d __a, __m128 __b)
/// \brief Converts the two double-precision floating-point elements of a
/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the
/// result of either conversion is inexact, the result is truncated (rounded
/// towards zero) regardless of the current MXCSR setting. The upper 64 bits
/// of the result vector are set to zero.
/// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
///
/// If the result of either conversion is inexact, the result is truncated
/// (rounded towards zero) regardless of the current MXCSR setting. The upper
/// 64 bits of the result vector are set to zero.
///
/// \headerfile <x86intrin.h>
///
@ -1466,9 +1511,10 @@ _mm_cvtpd_pi32(__m128d __a)
/// \brief Converts the two double-precision floating-point elements of a
/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
/// returned in a 64-bit vector of [2 x i32]. If the result of either
/// conversion is inexact, the result is truncated (rounded towards zero)
/// regardless of the current MXCSR setting.
/// returned in a 64-bit vector of [2 x i32].
///
/// If the result of either conversion is inexact, the result is truncated
/// (rounded towards zero) regardless of the current MXCSR setting.
///
/// \headerfile <x86intrin.h>
///
@ -1599,6 +1645,17 @@ _mm_loadu_pd(double const *__dp)
return ((struct __loadu_pd*)__dp)->__v;
}
/// \brief Loads a 64-bit integer value to the low element of a 128-bit integer
/// vector and clears the upper element.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __a
/// A pointer to a 64-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si64(void const *__a)
{
@ -1609,6 +1666,17 @@ _mm_loadu_si64(void const *__a)
return (__m128i){__u, 0L};
}
/// \brief Loads a 64-bit double-precision value to the low element of a
/// 128-bit integer vector and clears the upper element.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
///
/// \param __dp
/// A pointer to a memory location containing a double-precision value.
/// The address of the memory location does not have to be aligned.
/// \returns A 128-bit vector of [2 x double] containing the loaded value.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_sd(double const *__dp)
{
@ -1728,6 +1796,24 @@ _mm_set1_pd(double __w)
return (__m128d){ __w, __w };
}
/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
/// of the two double-precision floating-point vector elements set to the
/// specified double-precision floating-point value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
///
/// \param __w
/// A double-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd1(double __w)
{
return _mm_set1_pd(__w);
}
/// \brief Constructs a 128-bit floating-point vector of [2 x double]
/// initialized with the specified double-precision floating-point values.
///
@ -1787,7 +1873,7 @@ _mm_setzero_pd(void)
/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
/// 64 bits are set to the lower 64 bits of the second parameter. The upper
/// 64 bits are set to the upper 64 bits of the first parameter.
//
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
@ -1825,12 +1911,38 @@ _mm_store_sd(double *__dp, __m128d __a)
((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
}
/// \brief Moves packed double-precision values from a 128-bit vector of
/// [2 x double] to a memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
///
/// \param __dp
/// A pointer to an aligned memory location that can store two
/// double-precision values.
/// \param __a
/// A packed 128-bit vector of [2 x double] containing the values to be
/// moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd(double *__dp, __m128d __a)
{
*(__m128d*)__dp = __a;
}
/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
/// the upper and lower 64 bits of a memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c>VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
///
/// \param __dp
/// A pointer to a memory location that can store two double-precision
/// values.
/// \param __a
/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
/// of the values in \a dp.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_pd(double *__dp, __m128d __a)
{
@ -1940,8 +2052,9 @@ _mm_storel_pd(double *__dp, __m128d __a)
/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
/// saving the lower 8 bits of each sum in the corresponding element of a
/// 128-bit result vector of [16 x i8]. The integer elements of both
/// parameters can be either signed or unsigned.
/// 128-bit result vector of [16 x i8].
///
/// The integer elements of both parameters can be either signed or unsigned.
///
/// \headerfile <x86intrin.h>
///
@ -1961,8 +2074,9 @@ _mm_add_epi8(__m128i __a, __m128i __b)
/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
/// saving the lower 16 bits of each sum in the corresponding element of a
/// 128-bit result vector of [8 x i16]. The integer elements of both
/// parameters can be either signed or unsigned.
/// 128-bit result vector of [8 x i16].
///
/// The integer elements of both parameters can be either signed or unsigned.
///
/// \headerfile <x86intrin.h>
///
@ -1982,8 +2096,9 @@ _mm_add_epi16(__m128i __a, __m128i __b)
/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
/// saving the lower 32 bits of each sum in the corresponding element of a
/// 128-bit result vector of [4 x i32]. The integer elements of both
/// parameters can be either signed or unsigned.
/// 128-bit result vector of [4 x i32].
///
/// The integer elements of both parameters can be either signed or unsigned.
///
/// \headerfile <x86intrin.h>
///
@ -2021,8 +2136,9 @@ _mm_add_si64(__m64 __a, __m64 __b)
/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
/// saving the lower 64 bits of each sum in the corresponding element of a
/// 128-bit result vector of [2 x i64]. The integer elements of both
/// parameters can be either signed or unsigned.
/// 128-bit result vector of [2 x i64].
///
/// The integer elements of both parameters can be either signed or unsigned.
///
/// \headerfile <x86intrin.h>
///
@ -2168,10 +2284,12 @@ _mm_avg_epu16(__m128i __a, __m128i __b)
/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
/// vectors, producing eight intermediate 32-bit signed integer products, and
/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
/// [4 x i32] vector. For example, bits [15:0] of both parameters are
/// multiplied producing a 32-bit product, bits [31:16] of both parameters
/// are multiplied producing a 32-bit product, and the sum of those two
/// products becomes bits [31:0] of the result.
/// [4 x i32] vector.
///
/// For example, bits [15:0] of both parameters are multiplied producing a
/// 32-bit product, bits [31:16] of both parameters are multiplied producing
/// a 32-bit product, and the sum of those two products becomes bits [31:0]
/// of the result.
///
/// \headerfile <x86intrin.h>
///
@ -2369,7 +2487,7 @@ _mm_mul_epu32(__m128i __a, __m128i __b)
/// \brief Computes the absolute differences of corresponding 8-bit integer
/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
/// separately sums the second 8 absolute differences. Packss these two
/// separately sums the second 8 absolute differences. Packs these two
/// unsigned 16-bit integer sums into the upper and lower elements of a
/// [2 x i64] vector.
///
@ -3106,8 +3224,9 @@ _mm_cmpgt_epi8(__m128i __a, __m128i __b)
/// \brief Compares each of the corresponding signed 16-bit values of the
/// 128-bit integer vectors to determine if the values in the first operand
/// are greater than those in the second operand. Each comparison yields 0h
/// for false, FFFFh for true.
/// are greater than those in the second operand.
///
/// Each comparison yields 0h for false, FFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -3126,8 +3245,9 @@ _mm_cmpgt_epi16(__m128i __a, __m128i __b)
/// \brief Compares each of the corresponding signed 32-bit values of the
/// 128-bit integer vectors to determine if the values in the first operand
/// are greater than those in the second operand. Each comparison yields 0h
/// for false, FFFFFFFFh for true.
/// are greater than those in the second operand.
///
/// Each comparison yields 0h for false, FFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -3146,8 +3266,9 @@ _mm_cmpgt_epi32(__m128i __a, __m128i __b)
/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
/// integer vectors to determine if the values in the first operand are less
/// than those in the second operand. Each comparison yields 0h for false,
/// FFh for true.
/// than those in the second operand.
///
/// Each comparison yields 0h for false, FFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -3166,8 +3287,9 @@ _mm_cmplt_epi8(__m128i __a, __m128i __b)
/// \brief Compares each of the corresponding signed 16-bit values of the
/// 128-bit integer vectors to determine if the values in the first operand
/// are less than those in the second operand. Each comparison yields 0h for
/// false, FFFFh for true.
/// are less than those in the second operand.
///
/// Each comparison yields 0h for false, FFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -3186,8 +3308,9 @@ _mm_cmplt_epi16(__m128i __a, __m128i __b)
/// \brief Compares each of the corresponding signed 32-bit values of the
/// 128-bit integer vectors to determine if the values in the first operand
/// are less than those in the second operand. Each comparison yields 0h for
/// false, FFFFFFFFh for true.
/// are less than those in the second operand.
///
/// Each comparison yields 0h for false, FFFFFFFFh for true.
///
/// \headerfile <x86intrin.h>
///
@ -3885,10 +4008,11 @@ _mm_storeu_si128(__m128i *__p, __m128i __b)
/// \brief Moves bytes selected by the mask from the first operand to the
/// specified unaligned memory location. When a mask bit is 1, the
/// corresponding byte is written, otherwise it is not written. To minimize
/// caching, the date is flagged as non-temporal (unlikely to be used again
/// soon). Exception and trap behavior for elements not selected for storage
/// to memory are implementation dependent.
/// corresponding byte is written, otherwise it is not written.
///
/// To minimize caching, the date is flagged as non-temporal (unlikely to be
/// used again soon). Exception and trap behavior for elements not selected
/// for storage to memory are implementation dependent.
///
/// \headerfile <x86intrin.h>
///
@ -3932,8 +4056,10 @@ _mm_storel_epi64(__m128i *__p, __m128i __a)
}
/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
/// aligned memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon).
/// aligned memory location.
///
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// \headerfile <x86intrin.h>
///
@ -3950,6 +4076,7 @@ _mm_stream_pd(double *__p, __m128d __a)
}
/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
///
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
@ -3967,8 +4094,9 @@ _mm_stream_si128(__m128i *__p, __m128i __a)
__builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
}
/// \brief Stores a 32-bit integer value in the specified memory location. To
/// minimize caching, the data is flagged as non-temporal (unlikely to be
/// \brief Stores a 32-bit integer value in the specified memory location.
///
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// \headerfile <x86intrin.h>
@ -3986,8 +4114,9 @@ _mm_stream_si32(int *__p, int __a)
}
#ifdef __x86_64__
/// \brief Stores a 64-bit integer value in the specified memory location. To
/// minimize caching, the data is flagged as non-temporal (unlikely to be
/// \brief Stores a 64-bit integer value in the specified memory location.
///
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// \headerfile <x86intrin.h>
@ -4019,7 +4148,7 @@ extern "C" {
/// \param __p
/// A pointer to the memory location used to identify the cache line to be
/// flushed.
void _mm_clflush(void const *);
void _mm_clflush(void const * __p);
/// \brief Forces strong memory ordering (serialization) between load
/// instructions preceding this instruction and load instructions following
@ -4141,7 +4270,7 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
/// \param __a
/// A 128-bit integer vector.
/// \param __imm
/// An immediate value. Bits [3:0] selects values from \a __a to be assigned
/// An immediate value. Bits [2:0] selects values from \a __a to be assigned
/// to bits[15:0] of the result. \n
/// 000: assign values from bits [15:0] of \a __a. \n
/// 001: assign values from bits [31:16] of \a __a. \n
@ -4788,4 +4917,12 @@ void _mm_pause(void);
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
#define _MM_DENORMALS_ZERO_ON (0x0040)
#define _MM_DENORMALS_ZERO_OFF (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif /* __EMMINTRIN_H */

View File

@ -72,9 +72,9 @@ _cvtsh_ss(unsigned short __a)
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) \
((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]))
#define _cvtss_sh(a, imm) __extension__ ({ \
(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]); })
/// \brief Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
@ -99,8 +99,8 @@ _cvtsh_ss(unsigned short __a)
/// \returns A 128-bit vector containing converted 16-bit half-precision float
/// values. The lower 64 bits are used to store the converted 16-bit
/// half-precision floating-point values.
#define _mm_cvtps_ph(a, imm) \
((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
#define _mm_cvtps_ph(a, imm) __extension__ ({ \
(__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
/// \brief Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 128-bit vector containing 32-bit float values.

View File

@ -33,6 +33,15 @@
*/
#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
__STDC_HOSTED__ && __has_include_next(<float.h>)
/* Prior to Apple's 10.7 SDK, float.h SDK header used to apply an extra level
* of #include_next<float.h> to keep Metrowerks compilers happy. Avoid this
* extra indirection.
*/
#ifdef __APPLE__
#define _FLOAT_H_
#endif
# include_next <float.h>
/* Undefine anything that we'll be redefining below. */

View File

@ -35,14 +35,10 @@
extern "C" {
#endif
#define _TEXASR_PTR(TM_BUF) \
((texasr_t *)((TM_BUF)+0))
#define _TEXASRU_PTR(TM_BUF) \
((texasru_t *)((TM_BUF)+0))
#define _TEXASRL_PTR(TM_BUF) \
((texasrl_t *)((TM_BUF)+4))
#define _TFIAR_PTR(TM_BUF) \
((tfiar_t *)((TM_BUF)+8))
#define _TEXASR_PTR(TM_BUF) ((texasr_t *)((char *)(TM_BUF) + 0))
#define _TEXASRU_PTR(TM_BUF) ((texasru_t *)((char *)(TM_BUF) + 0))
#define _TEXASRL_PTR(TM_BUF) ((texasrl_t *)((char *)(TM_BUF) + 4))
#define _TFIAR_PTR(TM_BUF) ((tfiar_t *)((char *)(TM_BUF) + 8))
typedef char TM_buff_type[16];
@ -178,7 +174,7 @@ extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_is_conflict(void* const __TM_buff)
{
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
/* Return TEXASR bits 11 (Self-Induced Conflict) through
14 (Translation Invalidation Conflict). */
return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;

View File

@ -146,6 +146,10 @@ _mm256_cvtph_ps(__m128i __a)
#include <avx512cdintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
#include <avx512vpopcntdqintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
#include <avx512dqintrin.h>
#endif
@ -208,6 +212,15 @@ _rdrand32_step(unsigned int *__p)
return __builtin_ia32_rdrand32_step(__p);
}
#ifdef __x86_64__
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
return __builtin_ia32_rdrand64_step(__p);
}
#endif
#endif /* __RDRND__ */
/* __bit_scan_forward */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_bit_scan_forward(int __A) {
@ -220,15 +233,6 @@ _bit_scan_reverse(int __A) {
return 31 - __builtin_clz(__A);
}
#ifdef __x86_64__
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
return __builtin_ia32_rdrand64_step(__p);
}
#endif
#endif /* __RDRND__ */
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
#ifdef __x86_64__
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))

View File

@ -69,7 +69,6 @@ static __inline__
__int64 __emul(int, int);
static __inline__
unsigned __int64 __emulu(unsigned int, unsigned int);
void __cdecl __fastfail(unsigned int);
unsigned int __getcallerseflags(void);
static __inline__
void __halt(void);
@ -80,16 +79,12 @@ void __incfsdword(unsigned long);
void __incfsword(unsigned long);
unsigned long __indword(unsigned short);
void __indwordstring(unsigned short, unsigned long *, unsigned long);
void __int2c(void);
void __invlpg(void *);
unsigned short __inword(unsigned short);
void __inwordstring(unsigned short, unsigned short *, unsigned long);
void __lidt(void *);
unsigned __int64 __ll_lshift(unsigned __int64, int);
__int64 __ll_rshift(__int64, int);
void __llwpcb(void *);
unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
void __lwpval32(unsigned int, unsigned int, unsigned int);
unsigned int __lzcnt(unsigned int);
unsigned short __lzcnt16(unsigned short);
static __inline__
@ -128,7 +123,6 @@ unsigned __int64 __readmsr(unsigned long);
unsigned __int64 __readpmc(unsigned long);
unsigned long __segmentlimit(unsigned long);
void __sidt(void *);
void *__slwpcb(void);
static __inline__
void __stosb(unsigned char *, unsigned char, size_t);
static __inline__
@ -142,7 +136,6 @@ void __svm_stgi(void);
void __svm_vmload(size_t);
void __svm_vmrun(size_t);
void __svm_vmsave(size_t);
void __ud2(void);
unsigned __int64 __ull_rshift(unsigned __int64, int);
void __vmx_off(void);
void __vmx_vmptrst(unsigned __int64 *);
@ -176,7 +169,6 @@ void __cdecl _disable(void);
void __cdecl _enable(void);
long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
unsigned char _interlockedbittestandreset(long volatile *, long);
static __inline__
unsigned char _interlockedbittestandset(long volatile *, long);
long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
@ -231,8 +223,6 @@ void __incgsbyte(unsigned long);
void __incgsdword(unsigned long);
void __incgsqword(unsigned long);
void __incgsword(unsigned long);
unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
void __lwpval64(unsigned __int64, unsigned int, unsigned int);
unsigned __int64 __lzcnt64(unsigned __int64);
static __inline__
void __movsq(unsigned long long *, unsigned long long const *, size_t);
@ -372,11 +362,6 @@ _bittestandset(long *_BitBase, long _BitPos) {
*_BitBase = *_BitBase | (1 << _BitPos);
return _Res;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
return (_PrevVal >> _BitPos) & 1;
}
#if defined(__arm__) || defined(__aarch64__)
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
@ -872,48 +857,7 @@ _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
return _Comparand;
}
#endif
/*----------------------------------------------------------------------------*\
|* readfs, readgs
|* (Pointers in address space #256 and #257 are relative to the GS and FS
|* segment registers, respectively.)
\*----------------------------------------------------------------------------*/
#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset) \
((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
(__offset))
#ifdef __i386__
static __inline__ unsigned char __DEFAULT_FN_ATTRS
__readfsbyte(unsigned long __offset) {
return *__ptr_to_addr_space(257, unsigned char, __offset);
}
static __inline__ unsigned short __DEFAULT_FN_ATTRS
__readfsword(unsigned long __offset) {
return *__ptr_to_addr_space(257, unsigned short, __offset);
}
static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
__readfsqword(unsigned long __offset) {
return *__ptr_to_addr_space(257, unsigned __int64, __offset);
}
#endif
#ifdef __x86_64__
static __inline__ unsigned char __DEFAULT_FN_ATTRS
__readgsbyte(unsigned long __offset) {
return *__ptr_to_addr_space(256, unsigned char, __offset);
}
static __inline__ unsigned short __DEFAULT_FN_ATTRS
__readgsword(unsigned long __offset) {
return *__ptr_to_addr_space(256, unsigned short, __offset);
}
static __inline__ unsigned long __DEFAULT_FN_ATTRS
__readgsdword(unsigned long __offset) {
return *__ptr_to_addr_space(256, unsigned long, __offset);
}
static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
__readgsqword(unsigned long __offset) {
return *__ptr_to_addr_space(256, unsigned __int64, __offset);
}
#endif
#undef __ptr_to_addr_space
/*----------------------------------------------------------------------------*\
|* movs, stos
\*----------------------------------------------------------------------------*/

150
c_headers/lwpintrin.h Normal file
View File

@ -0,0 +1,150 @@
/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __LWPINTRIN_H
#define __LWPINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))
/// \brief Parses the LWPCB at the specified address and enables
/// profiling if valid.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LLWPCB </c> instruction.
///
/// \param __addr
/// Address to the new Lightweight Profiling Control Block (LWPCB). If the
/// LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables
/// Lightweight Profiling.
static __inline__ void __DEFAULT_FN_ATTRS
__llwpcb (void *__addr)
{
__builtin_ia32_llwpcb(__addr);
}
/// \brief Flushes the LWP state to memory and returns the address of the LWPCB.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SLWPCB </c> instruction.
///
/// \return
/// Address to the current Lightweight Profiling Control Block (LWPCB).
/// If LWP is not currently enabled, returns NULL.
static __inline__ void* __DEFAULT_FN_ATTRS
__slwpcb ()
{
return __builtin_ia32_slwpcb();
}
/// \brief Inserts programmed event record into the LWP event ring buffer
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
///
/// \param DATA2
/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
/// \param DATA1
/// A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
/// the event record overwrites the last record in the buffer, the MissedEvents
/// counter in the LWPCB is incremented, the head pointer is not advanced, and
/// 1 is returned. Otherwise 0 is returned.
#define __lwpins32(DATA2, DATA1, FLAGS) \
(__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
(unsigned int) (FLAGS)))
/// \brief Decrements the LWP programmed value sample event counter. If the result is
/// negative, inserts an event record into the LWP event ring buffer in memory
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
///
/// \param DATA2
/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
/// \param DATA1
/// A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
#define __lwpval32(DATA2, DATA1, FLAGS) \
(__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \
(unsigned int) (FLAGS)))
#ifdef __x86_64__
/// \brief Inserts programmed event record into the LWP event ring buffer
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
///
/// \param DATA2
/// A 64-bit value is inserted into the 64-bit Data2 field.
/// \param DATA1
/// A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
/// the event record overwrites the last record in the buffer, the MissedEvents
/// counter in the LWPCB is incremented, the head pointer is not advanced, and
/// 1 is returned. Otherwise 0 is returned.
#define __lwpins64(DATA2, DATA1, FLAGS) \
(__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
(unsigned int) (FLAGS)))
/// \brief Decrements the LWP programmed value sample event counter. If the result is
/// negative, inserts an event record into the LWP event ring buffer in memory
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
///
/// \param DATA2
/// A 64-bit value is and inserted into the 64-bit Data2 field.
/// \param DATA1
/// A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
#define __lwpval64(DATA2, DATA1, FLAGS) \
(__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
(unsigned int) (FLAGS)))
#endif
#undef __DEFAULT_FN_ATTRS
#endif /* __LWPINTRIN_H */

View File

@ -211,7 +211,7 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
///
/// \param __m1
/// A 64-bit integer vector of [8 x i8]. \n
/// A 64-bit integer vector of [8 x i8]. \n
/// Bits [39:32] are written to bits [7:0] of the result. \n
/// Bits [47:40] are written to bits [23:16] of the result. \n
/// Bits [55:48] are written to bits [39:32] of the result. \n
@ -608,10 +608,11 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
/// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit
/// integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
/// element of the first 64-bit integer vector of [8 x i8]. If an element of
/// the first vector is less than the corresponding element of the second
/// vector, the result is saturated to 0. The results are packed into a
/// 64-bit integer vector of [8 x i8].
/// element of the first 64-bit integer vector of [8 x i8].
///
/// If an element of the first vector is less than the corresponding element
/// of the second vector, the result is saturated to 0. The results are
/// packed into a 64-bit integer vector of [8 x i8].
///
/// \headerfile <x86intrin.h>
///
@ -631,10 +632,11 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
/// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit
/// integer vector of [4 x i16] from the corresponding 16-bit unsigned
/// integer element of the first 64-bit integer vector of [4 x i16]. If an
/// element of the first vector is less than the corresponding element of the
/// second vector, the result is saturated to 0. The results are packed into
/// a 64-bit integer vector of [4 x i16].
/// integer element of the first 64-bit integer vector of [4 x i16].
///
/// If an element of the first vector is less than the corresponding element
/// of the second vector, the result is saturated to 0. The results are
/// packed into a 64-bit integer vector of [4 x i16].
///
/// \headerfile <x86intrin.h>
///
@ -657,9 +659,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
/// element of the second 64-bit integer vector of [4 x i16] and get four
/// 32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
/// The lower 32 bits of these two sums are packed into a 64-bit integer
/// vector of [2 x i32]. For example, bits [15:0] of both parameters are
/// multiplied, bits [31:16] of both parameters are multiplied, and the sum
/// of both results is written to bits [31:0] of the result.
/// vector of [2 x i32].
///
/// For example, bits [15:0] of both parameters are multiplied, bits [31:16]
/// of both parameters are multiplied, and the sum of both results is written
/// to bits [31:0] of the result.
///
/// \headerfile <x86intrin.h>
///
@ -851,10 +855,11 @@ _mm_slli_si64(__m64 __m, int __count)
/// \brief Right-shifts each 16-bit integer element of the first parameter,
/// which is a 64-bit integer vector of [4 x i16], by the number of bits
/// specified by the second parameter, which is a 64-bit integer. High-order
/// bits are filled with the sign bit of the initial value of each 16-bit
/// element. The 16-bit results are packed into a 64-bit integer vector of
/// [4 x i16].
/// specified by the second parameter, which is a 64-bit integer.
///
/// High-order bits are filled with the sign bit of the initial value of each
/// 16-bit element. The 16-bit results are packed into a 64-bit integer
/// vector of [4 x i16].
///
/// \headerfile <x86intrin.h>
///
@ -874,6 +879,7 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
/// of [4 x i16] by the number of bits specified by a 32-bit integer.
///
/// High-order bits are filled with the sign bit of the initial value of each
/// 16-bit element. The 16-bit results are packed into a 64-bit integer
/// vector of [4 x i16].
@ -896,10 +902,11 @@ _mm_srai_pi16(__m64 __m, int __count)
/// \brief Right-shifts each 32-bit integer element of the first parameter,
/// which is a 64-bit integer vector of [2 x i32], by the number of bits
/// specified by the second parameter, which is a 64-bit integer. High-order
/// bits are filled with the sign bit of the initial value of each 32-bit
/// element. The 32-bit results are packed into a 64-bit integer vector of
/// [2 x i32].
/// specified by the second parameter, which is a 64-bit integer.
///
/// High-order bits are filled with the sign bit of the initial value of each
/// 32-bit element. The 32-bit results are packed into a 64-bit integer
/// vector of [2 x i32].
///
/// \headerfile <x86intrin.h>
///
@ -919,6 +926,7 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
/// of [2 x i32] by the number of bits specified by a 32-bit integer.
///
/// High-order bits are filled with the sign bit of the initial value of each
/// 32-bit element. The 32-bit results are packed into a 64-bit integer
/// vector of [2 x i32].
@ -941,9 +949,10 @@ _mm_srai_pi32(__m64 __m, int __count)
/// \brief Right-shifts each 16-bit integer element of the first parameter,
/// which is a 64-bit integer vector of [4 x i16], by the number of bits
/// specified by the second parameter, which is a 64-bit integer. High-order
/// bits are cleared. The 16-bit results are packed into a 64-bit integer
/// vector of [4 x i16].
/// specified by the second parameter, which is a 64-bit integer.
///
/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
/// integer vector of [4 x i16].
///
/// \headerfile <x86intrin.h>
///
@ -963,6 +972,7 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
/// of [4 x i16] by the number of bits specified by a 32-bit integer.
///
/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
/// integer vector of [4 x i16].
///
@ -984,9 +994,10 @@ _mm_srli_pi16(__m64 __m, int __count)
/// \brief Right-shifts each 32-bit integer element of the first parameter,
/// which is a 64-bit integer vector of [2 x i32], by the number of bits
/// specified by the second parameter, which is a 64-bit integer. High-order
/// bits are cleared. The 32-bit results are packed into a 64-bit integer
/// vector of [2 x i32].
/// specified by the second parameter, which is a 64-bit integer.
///
/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
/// integer vector of [2 x i32].
///
/// \headerfile <x86intrin.h>
///
@ -1006,6 +1017,7 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
/// of [2 x i32] by the number of bits specified by a 32-bit integer.
///
/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
/// integer vector of [2 x i32].
///
@ -1026,8 +1038,9 @@ _mm_srli_pi32(__m64 __m, int __count)
}
/// \brief Right-shifts the first 64-bit integer parameter by the number of bits
/// specified by the second 64-bit integer parameter. High-order bits are
/// cleared.
/// specified by the second 64-bit integer parameter.
///
/// High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
@ -1046,7 +1059,9 @@ _mm_srl_si64(__m64 __m, __m64 __count)
/// \brief Right-shifts the first parameter, which is a 64-bit integer, by the
/// number of bits specified by the second parameter, which is a 32-bit
/// integer. High-order bits are cleared.
/// integer.
///
/// High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
@ -1140,8 +1155,9 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
/// [8 x i8] to determine if the element of the first vector is equal to the
/// corresponding element of the second vector. The comparison yields 0 for
/// false, 0xFF for true.
/// corresponding element of the second vector.
///
/// The comparison yields 0 for false, 0xFF for true.
///
/// \headerfile <x86intrin.h>
///
@ -1161,8 +1177,9 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
/// [4 x i16] to determine if the element of the first vector is equal to the
/// corresponding element of the second vector. The comparison yields 0 for
/// false, 0xFFFF for true.
/// corresponding element of the second vector.
///
/// The comparison yields 0 for false, 0xFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@ -1182,8 +1199,9 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
/// [2 x i32] to determine if the element of the first vector is equal to the
/// corresponding element of the second vector. The comparison yields 0 for
/// false, 0xFFFFFFFF for true.
/// corresponding element of the second vector.
///
/// The comparison yields 0 for false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@ -1203,8 +1221,9 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
/// [8 x i8] to determine if the element of the first vector is greater than
/// the corresponding element of the second vector. The comparison yields 0
/// for false, 0xFF for true.
/// the corresponding element of the second vector.
///
/// The comparison yields 0 for false, 0xFF for true.
///
/// \headerfile <x86intrin.h>
///
@ -1224,8 +1243,9 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
/// [4 x i16] to determine if the element of the first vector is greater than
/// the corresponding element of the second vector. The comparison yields 0
/// for false, 0xFFFF for true.
/// the corresponding element of the second vector.
///
/// The comparison yields 0 for false, 0xFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@ -1245,8 +1265,9 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
/// [2 x i32] to determine if the element of the first vector is greater than
/// the corresponding element of the second vector. The comparison yields 0
/// for false, 0xFFFFFFFF for true.
/// the corresponding element of the second vector.
///
/// The comparison yields 0 for false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@ -1268,7 +1289,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the the <c> VXORPS / XORPS </c> instruction.
/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
///
/// \returns An initialized 64-bit integer vector with all elements set to zero.
static __inline__ __m64 __DEFAULT_FN_ATTRS

View File

@ -61,6 +61,7 @@ module _Builtin_intrinsics [system] [extern_c] {
textual header "xopintrin.h"
textual header "fma4intrin.h"
textual header "mwaitxintrin.h"
textual header "clzerointrin.h"
explicit module mm_malloc {
requires !freestanding

File diff suppressed because it is too large Load Diff

View File

@ -31,9 +31,11 @@
__attribute__((__always_inline__, __nodebug__, __target__("sse3")))
/// \brief Loads data from an unaligned memory location to elements in a 128-bit
/// vector. If the address of the data is not 16-byte aligned, the
/// instruction may read two adjacent aligned blocks of memory to retrieve
/// the requested data.
/// vector.
///
/// If the address of the data is not 16-byte aligned, the instruction may
/// read two adjacent aligned blocks of memory to retrieve the requested
/// data.
///
/// \headerfile <x86intrin.h>
///
@ -115,7 +117,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
/// vector of [4 x float] to float values stored in a 128-bit vector of
/// [4 x float].
/// [4 x float].
///
/// \headerfile <x86intrin.h>
///
@ -136,7 +138,7 @@ _mm_movehdup_ps(__m128 __a)
}
/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
@ -257,14 +259,6 @@ _mm_movedup_pd(__m128d __a)
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}
#define _MM_DENORMALS_ZERO_ON (0x0040)
#define _MM_DENORMALS_ZERO_OFF (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
/// \brief Establishes a linear address memory range to be monitored and puts
/// the processor in the monitor event pending state. Data stored in the
/// monitored address range causes the processor to exit the pending state.

View File

@ -29,12 +29,38 @@
#define __PRFCHWINTRIN_H
#if defined(__PRFCHW__) || defined(__3dNOW__)
/// \brief Loads a memory sequence containing the specified memory address into
/// all data cache levels. The cache-coherency state is set to exclusive.
/// Data can be read from and written to the cache line without additional
/// delay.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
///
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetch(void *__P)
{
__builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
}
/// \brief Loads a memory sequence containing the specified memory address into
/// the L1 data cache and sets the cache-coherency to modified. This
/// provides a hint to the processor that the cache line will be modified.
/// It is intended for use when the cache line will be written to shortly
/// after the prefetch is performed.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHW instruction.
///
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetchw(void *__P)
{

File diff suppressed because it is too large Load Diff

View File

@ -43,14 +43,12 @@ typedef __builtin_va_list va_list;
#define va_copy(dest, src) __builtin_va_copy(dest, src)
#endif
/* Hack required to make standard headers work, at least on Ubuntu */
#ifndef __GNUC_VA_LIST
#define __GNUC_VA_LIST 1
#endif
typedef __builtin_va_list __gnuc_va_list;
/* zig: added because glibc stdio.h was duplicately defining va_list
*/
#define _VA_LIST_DEFINED
#endif
#endif /* __STDARG_H */

View File

@ -40,16 +40,16 @@ extern "C" {
/* 7.17.1 Introduction */
#define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE
#define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE
#define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE
#define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE
#define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE
#define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE
#define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE
#define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE
#define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE
#define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE
#define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE
#define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE
#define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE
#define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE
#define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE
#define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE
#define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE
#define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE
#define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE
#define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE
/* 7.17.2 Initialization */

View File

@ -255,19 +255,16 @@ typedef __uint_least8_t uint_fast8_t;
*/
#define __stdint_join3(a,b,c) a ## b ## c
#define __intn_t(n) __stdint_join3( int, n, _t)
#define __uintn_t(n) __stdint_join3(uint, n, _t)
#ifndef _INTPTR_T
#ifndef __intptr_t_defined
typedef __intn_t(__INTPTR_WIDTH__) intptr_t;
typedef __INTPTR_TYPE__ intptr_t;
#define __intptr_t_defined
#define _INTPTR_T
#endif
#endif
#ifndef _UINTPTR_T
typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
typedef __UINTPTR_TYPE__ uintptr_t;
#define _UINTPTR_T
#endif
@ -659,12 +656,12 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
/* C99 7.18.3 Limits of other integer types. */
#define INTPTR_MIN __INTN_MIN(__INTPTR_WIDTH__)
#define INTPTR_MAX __INTN_MAX(__INTPTR_WIDTH__)
#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__)
#define PTRDIFF_MIN __INTN_MIN(__PTRDIFF_WIDTH__)
#define PTRDIFF_MAX __INTN_MAX(__PTRDIFF_WIDTH__)
#define SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__)
#define INTPTR_MIN (-__INTPTR_MAX__-1)
#define INTPTR_MAX __INTPTR_MAX__
#define UINTPTR_MAX __UINTPTR_MAX__
#define PTRDIFF_MIN (-__PTRDIFF_MAX__-1)
#define PTRDIFF_MAX __PTRDIFF_MAX__
#define SIZE_MAX __SIZE_MAX__
/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
* is enabled. */
@ -673,9 +670,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#endif
/* C99 7.18.2.5 Limits of greatest-width integer types. */
#define INTMAX_MIN __INTN_MIN(__INTMAX_WIDTH__)
#define INTMAX_MAX __INTN_MAX(__INTMAX_WIDTH__)
#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__)
#define INTMAX_MIN (-__INTMAX_MAX__-1)
#define INTMAX_MAX __INTMAX_MAX__
#define UINTMAX_MAX __UINTMAX_MAX__
/* C99 7.18.3 Limits of other integer types. */
#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
@ -700,8 +697,8 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#endif
/* 7.18.4.2 Macros for greatest-width integer constants. */
#define INTMAX_C(v) __INTN_C(__INTMAX_WIDTH__, v)
#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v)
#define INTMAX_C(v) __int_c(v, __INTMAX_C_SUFFIX__)
#define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__)
#endif /* __STDC_HOSTED__ */
#endif /* __CLANG_STDINT_H */

View File

@ -22,12 +22,21 @@
*
\*===----------------------------------------------------------------------===*/
#ifndef __TGMATH_H
#define __TGMATH_H
#ifndef __CLANG_TGMATH_H
#define __CLANG_TGMATH_H
/* C99 7.22 Type-generic math <tgmath.h>. */
#include <math.h>
/*
* Allow additional definitions and implementation-defined values on Apple
* platforms. This is done after #include <math.h> to avoid depcycle conflicts
* between libcxx and darwin in C++ modules builds.
*/
#if defined(__APPLE__) && __STDC_HOSTED__ && __has_include_next(<tgmath.h>)
# include_next <tgmath.h>
#else
/* C++ handles type genericity with overloading in math.h. */
#ifndef __cplusplus
#include <complex.h>
@ -1371,4 +1380,5 @@ static long double
#undef _TG_ATTRS
#endif /* __cplusplus */
#endif /* __TGMATH_H */
#endif /* __has_include_next */
#endif /* __CLANG_TGMATH_H */

View File

@ -469,10 +469,11 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
/// the corresponding bits in the destination. For example, bits [7:0] of
/// both operands are multiplied, bits [15:8] of both operands are
/// multiplied, and the sum of both results is written to bits [15:0] of the
/// destination.
/// the corresponding bits in the destination.
///
/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
/// both operands are multiplied, and the sum of both results is written to
/// bits [15:0] of the destination.
///
/// \headerfile <x86intrin.h>
///
@ -502,10 +503,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
/// the corresponding bits in the destination. For example, bits [7:0] of
/// both operands are multiplied, bits [15:8] of both operands are
/// multiplied, and the sum of both results is written to bits [15:0] of the
/// destination.
/// the corresponding bits in the destination.
///
/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
/// both operands are multiplied, and the sum of both results is written to
/// bits [15:0] of the destination.
///
/// \headerfile <x86intrin.h>
///
@ -619,13 +621,14 @@ _mm_shuffle_pi8(__m64 __a, __m64 __b)
}
/// \brief For each 8-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// byte in the second source is negative, calculate the two's complement of
/// the corresponding byte in the first source, and write that value to the
/// destination. If the byte in the second source is positive, copy the
/// corresponding byte from the first source to the destination. If the byte
/// in the second source is zero, clear the corresponding byte in the
/// destination.
/// the following actions as specified by the second source operand.
///
/// If the byte in the second source is negative, calculate the two's
/// complement of the corresponding byte in the first source, and write that
/// value to the destination. If the byte in the second source is positive,
/// copy the corresponding byte from the first source to the destination. If
/// the byte in the second source is zero, clear the corresponding byte in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
@ -644,13 +647,14 @@ _mm_sign_epi8(__m128i __a, __m128i __b)
}
/// \brief For each 16-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// word in the second source is negative, calculate the two's complement of
/// the corresponding word in the first source, and write that value to the
/// destination. If the word in the second source is positive, copy the
/// corresponding word from the first source to the destination. If the word
/// in the second source is zero, clear the corresponding word in the
/// destination.
/// the following actions as specified by the second source operand.
///
/// If the word in the second source is negative, calculate the two's
/// complement of the corresponding word in the first source, and write that
/// value to the destination. If the word in the second source is positive,
/// copy the corresponding word from the first source to the destination. If
/// the word in the second source is zero, clear the corresponding word in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
@ -669,8 +673,9 @@ _mm_sign_epi16(__m128i __a, __m128i __b)
}
/// \brief For each 32-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// doubleword in the second source is negative, calculate the two's
/// the following actions as specified by the second source operand.
///
/// If the doubleword in the second source is negative, calculate the two's
/// complement of the corresponding word in the first source, and write that
/// value to the destination. If the doubleword in the second source is
/// positive, copy the corresponding word from the first source to the
@ -694,13 +699,14 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
}
/// \brief For each 8-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// byte in the second source is negative, calculate the two's complement of
/// the corresponding byte in the first source, and write that value to the
/// destination. If the byte in the second source is positive, copy the
/// corresponding byte from the first source to the destination. If the byte
/// in the second source is zero, clear the corresponding byte in the
/// destination.
/// the following actions as specified by the second source operand.
///
/// If the byte in the second source is negative, calculate the two's
/// complement of the corresponding byte in the first source, and write that
/// value to the destination. If the byte in the second source is positive,
/// copy the corresponding byte from the first source to the destination. If
/// the byte in the second source is zero, clear the corresponding byte in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
@ -719,13 +725,14 @@ _mm_sign_pi8(__m64 __a, __m64 __b)
}
/// \brief For each 16-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// word in the second source is negative, calculate the two's complement of
/// the corresponding word in the first source, and write that value to the
/// destination. If the word in the second source is positive, copy the
/// corresponding word from the first source to the destination. If the word
/// in the second source is zero, clear the corresponding word in the
/// destination.
/// the following actions as specified by the second source operand.
///
/// If the word in the second source is negative, calculate the two's
/// complement of the corresponding word in the first source, and write that
/// value to the destination. If the word in the second source is positive,
/// copy the corresponding word from the first source to the destination. If
/// the word in the second source is zero, clear the corresponding word in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
@ -744,8 +751,9 @@ _mm_sign_pi16(__m64 __a, __m64 __b)
}
/// \brief For each 32-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// doubleword in the second source is negative, calculate the two's
/// the following actions as specified by the second source operand.
///
/// If the doubleword in the second source is negative, calculate the two's
/// complement of the corresponding doubleword in the first source, and
/// write that value to the destination. If the doubleword in the second
/// source is positive, copy the corresponding doubleword from the first

File diff suppressed because it is too large Load Diff

View File

@ -72,6 +72,10 @@
#include <tbmintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LWP__)
#include <lwpintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
#include <f16cintrin.h>
#endif
@ -80,6 +84,8 @@
#include <mwaitxintrin.h>
#endif
/* FIXME: LWP */
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLZERO__)
#include <clzerointrin.h>
#endif
#endif /* __X86INTRIN_H */

View File

@ -2067,7 +2067,7 @@ _mm_storer_ps(float *__p, __m128 __a)
/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
/// be generated. \n
/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
/// be generated.
/// be generated.
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
#endif
@ -2099,7 +2099,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a)
///
/// \param __p
/// A pointer to a 128-bit aligned memory location that will receive the
/// integer values.
/// single-precision floating-point values.
/// \param __a
/// A 128-bit vector of [4 x float] containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
@ -2133,7 +2133,7 @@ void _mm_sfence(void);
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_extract_pi(__m64 a, int n);
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
@ -2157,7 +2157,7 @@ void _mm_sfence(void);
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_insert_pi(__m64 a, int d, int n);
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
@ -2331,8 +2331,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
/// \brief Conditionally copies the values from each 8-bit element in the first
/// 64-bit integer vector operand to the specified memory location, as
/// specified by the most significant bit in the corresponding element in the
/// second 64-bit integer vector operand. To minimize caching, the data is
/// flagged as non-temporal (unlikely to be used again soon).
/// second 64-bit integer vector operand.
///
/// To minimize caching, the data is flagged as non-temporal
/// (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
@ -2435,17 +2437,17 @@ extern "C" {
/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
/// </li>
/// </li>
/// <li>
/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
/// _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
/// </li>
/// <li>
/// <li>
/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
/// </li>
/// <li>
/// <li>
/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
/// _MM_GET_DENORMALS_ZERO_MODE().
@ -2468,11 +2470,11 @@ extern "C" {
unsigned int _mm_getcsr(void);
/// \brief Sets the MXCSR register with the 32-bit unsigned integer value.
///
///
/// There are several groups of macros associated with this intrinsic,
/// including:
/// <ul>
/// <li>
/// <li>
/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
@ -2517,7 +2519,7 @@ unsigned int _mm_getcsr(void);
///
/// \param __i
/// A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int);
void _mm_setcsr(unsigned int __i);
#if defined(__cplusplus)
} // extern "C"
@ -2540,7 +2542,7 @@ void _mm_setcsr(unsigned int);
/// A 128-bit vector of [4 x float].
/// \param mask
/// An immediate value containing an 8-bit value specifying which elements to
/// copy from \ a and \a b. \n
/// copy from \a a and \a b. \n
/// Bits [3:0] specify the values copied from operand \a a. \n
/// Bits [7:4] specify the values copied from operand \a b. \n
/// The destinations within the 128-bit destination are assigned values as
@ -2678,8 +2680,7 @@ _mm_movelh_ps(__m128 __a, __m128 __b)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
/// instruction.
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
@ -2709,8 +2710,7 @@ _mm_cvtpi16_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
/// instruction.
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
///
/// \param __a
/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
@ -2739,8 +2739,7 @@ _mm_cvtpu16_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
/// instruction.
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
///
/// \param __a
/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
@ -2764,8 +2763,7 @@ _mm_cvtpi8_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
/// instruction.
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
///
/// \param __a
/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
@ -2789,8 +2787,7 @@ _mm_cvtpu8_ps(__m64 __a)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
/// instruction.
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
///
/// \param __a
/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
@ -2815,16 +2812,16 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
/// \brief Converts each single-precision floating-point element of a 128-bit
/// floating-point vector of [4 x float] into a 16-bit signed integer, and
/// packs the results into a 64-bit integer vector of [4 x i16]. If the
/// floating-point element is NaN or infinity, or if the floating-point
/// element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
/// to 0x8000. Otherwise if the floating-point element is greater than
/// 0x7FFF, it is converted to 0x7FFF.
/// packs the results into a 64-bit integer vector of [4 x i16].
///
/// If the floating-point element is NaN or infinity, or if the
/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
/// it is converted to 0x8000. Otherwise if the floating-point element is
/// greater than 0x7FFF, it is converted to 0x7FFF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c>
/// instruction.
/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
///
/// \param __a
/// A 128-bit floating-point vector of [4 x float].
@ -2845,16 +2842,16 @@ _mm_cvtps_pi16(__m128 __a)
/// \brief Converts each single-precision floating-point element of a 128-bit
/// floating-point vector of [4 x float] into an 8-bit signed integer, and
/// packs the results into the lower 32 bits of a 64-bit integer vector of
/// [8 x i8]. The upper 32 bits of the vector are set to 0. If the
/// floating-point element is NaN or infinity, or if the floating-point
/// element is greater than 0x7FFFFFFF or less than -0x80, it is converted
/// to 0x80. Otherwise if the floating-point element is greater than 0x7F,
/// it is converted to 0x7F.
/// [8 x i8]. The upper 32 bits of the vector are set to 0.
///
/// If the floating-point element is NaN or infinity, or if the
/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
/// is converted to 0x80. Otherwise if the floating-point element is greater
/// than 0x7F, it is converted to 0x7F.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c>
/// instruction.
/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
///
/// \param __a
/// 128-bit floating-point vector of [4 x float].

View File

@ -198,13 +198,13 @@ _mm_hsubq_epi32(__m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpcmov((__v2di)__A, (__v2di)__B, (__v2di)__C);
return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C));
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpcmov_256((__v4di)__A, (__v4di)__B, (__v4di)__C);
return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C));
}
static __inline__ __m128i __DEFAULT_FN_ATTRS

View File

@ -2,7 +2,7 @@
Create bootstrap code in std/bootstrap.zig and add conditional compilation
logic. This code is responsible for the real executable entry point, calling
main(argc, argv, env) and making the exit syscall when main returns.
main() and making the exit syscall when main returns.
How to pass a byvalue struct parameter in the C calling convention is
target-specific. Add logic for how to do function prototypes and function calls