From 2117fbdae35dddf368c4ce5bb39cc73fa0f78d4c Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 19 Jul 2019 16:50:45 -0400 Subject: [PATCH] update C headers to llvm9 upstream commit 1931d3cb20a00da732c5210b123656632982fde0 --- lib/include/__clang_cuda_builtin_vars.h | 20 +- lib/include/__clang_cuda_cmath.h | 49 +- lib/include/__clang_cuda_complex_builtins.h | 20 +- lib/include/__clang_cuda_device_functions.h | 93 +- lib/include/__clang_cuda_intrinsics.h | 20 +- lib/include/__clang_cuda_libdevice_declares.h | 890 ++++--- .../__clang_cuda_math_forward_declares.h | 70 +- lib/include/__clang_cuda_runtime_wrapper.h | 32 +- lib/include/__stddef_max_align_t.h | 22 +- lib/include/__wmmintrin_aes.h | 20 +- lib/include/__wmmintrin_pclmul.h | 20 +- lib/include/adxintrin.h | 20 +- lib/include/altivec.h | 20 +- lib/include/ammintrin.h | 20 +- lib/include/arm64intr.h | 20 +- lib/include/arm_acle.h | 38 +- lib/include/arm_neon.h | 396 +-- lib/include/armintr.h | 20 +- lib/include/avx2intrin.h | 32 +- lib/include/avx512bf16intrin.h | 279 ++ lib/include/avx512bitalgintrin.h | 20 +- lib/include/avx512bwintrin.h | 44 +- lib/include/avx512cdintrin.h | 52 +- lib/include/avx512dqintrin.h | 20 +- lib/include/avx512erintrin.h | 20 +- lib/include/avx512fintrin.h | 115 +- lib/include/avx512ifmaintrin.h | 20 +- lib/include/avx512ifmavlintrin.h | 20 +- lib/include/avx512pfintrin.h | 20 +- lib/include/avx512vbmi2intrin.h | 20 +- lib/include/avx512vbmiintrin.h | 20 +- lib/include/avx512vbmivlintrin.h | 20 +- lib/include/avx512vlbf16intrin.h | 474 ++++ lib/include/avx512vlbitalgintrin.h | 20 +- lib/include/avx512vlbwintrin.h | 36 +- lib/include/avx512vlcdintrin.h | 86 +- lib/include/avx512vldqintrin.h | 52 +- lib/include/avx512vlintrin.h | 77 +- lib/include/avx512vlvbmi2intrin.h | 20 +- lib/include/avx512vlvnniintrin.h | 20 +- lib/include/avx512vlvp2intersectintrin.h | 121 + lib/include/avx512vnniintrin.h | 20 +- lib/include/avx512vp2intersectintrin.h | 77 + lib/include/avx512vpopcntdqintrin.h | 20 +- lib/include/avx512vpopcntdqvlintrin.h | 20 +- lib/include/avxintrin.h | 50 +- lib/include/bmi2intrin.h | 20 +- lib/include/bmiintrin.h | 20 +- lib/include/cetintrin.h | 20 +- lib/include/cldemoteintrin.h | 20 +- lib/include/clflushoptintrin.h | 20 +- lib/include/clwbintrin.h | 20 +- lib/include/clzerointrin.h | 20 +- lib/include/cpuid.h | 24 +- lib/include/emmintrin.h | 55 +- lib/include/enqcmdintrin.h | 63 + lib/include/f16cintrin.h | 26 +- lib/include/float.h | 28 +- lib/include/fma4intrin.h | 20 +- lib/include/fmaintrin.h | 20 +- lib/include/fxsrintrin.h | 20 +- lib/include/gfniintrin.h | 20 +- lib/include/htmintrin.h | 20 +- lib/include/htmxlintrin.h | 20 +- lib/include/ia32intrin.h | 320 ++- lib/include/immintrin.h | 62 +- lib/include/intrin.h | 42 +- lib/include/inttypes.h | 25 +- lib/include/invpcidintrin.h | 20 +- lib/include/iso646.h | 22 +- lib/include/limits.h | 22 +- lib/include/lwpintrin.h | 20 +- lib/include/lzcntintrin.h | 20 +- lib/include/mm3dnow.h | 20 +- lib/include/mm_malloc.h | 20 +- lib/include/mmintrin.h | 22 +- lib/include/module.modulemap | 21 +- lib/include/movdirintrin.h | 20 +- lib/include/msa.h | 20 +- lib/include/mwaitxintrin.h | 20 +- lib/include/nmmintrin.h | 20 +- lib/include/opencl-c-base.h | 578 ++++ lib/include/opencl-c.h | 698 +---- .../openmp_wrappers/__clang_openmp_math.h | 35 + .../__clang_openmp_math_declares.h | 33 + lib/include/openmp_wrappers/cmath | 16 + lib/include/openmp_wrappers/math.h | 17 + lib/include/pconfigintrin.h | 24 +- lib/include/pkuintrin.h | 20 +- 
lib/include/pmmintrin.h | 20 +- lib/include/popcntintrin.h | 52 +- lib/include/ppc_wrappers/emmintrin.h | 2318 +++++++++++++++++ lib/include/ppc_wrappers/mm_malloc.h | 44 + lib/include/ppc_wrappers/mmintrin.h | 1443 ++++++++++ lib/include/ppc_wrappers/xmmintrin.h | 1838 +++++++++++++ lib/include/prfchwintrin.h | 20 +- lib/include/ptwriteintrin.h | 20 +- lib/include/rdseedintrin.h | 20 +- lib/include/rtmintrin.h | 20 +- lib/include/s390intrin.h | 20 +- lib/include/sgxintrin.h | 24 +- lib/include/shaintrin.h | 20 +- lib/include/smmintrin.h | 20 +- lib/include/stdalign.h | 20 +- lib/include/stdarg.h | 22 +- lib/include/stdatomic.h | 20 +- lib/include/stdbool.h | 22 +- lib/include/stddef.h | 22 +- lib/include/stdint.h | 27 +- lib/include/stdnoreturn.h | 20 +- lib/include/tbmintrin.h | 20 +- lib/include/tgmath.h | 22 +- lib/include/tmmintrin.h | 20 +- lib/include/unwind.h | 24 +- lib/include/vadefs.h | 20 +- lib/include/vaesintrin.h | 20 +- lib/include/varargs.h | 20 +- lib/include/vecintrin.h | 426 ++- lib/include/vpclmulqdqintrin.h | 20 +- lib/include/waitpkgintrin.h | 20 +- lib/include/wbnoinvdintrin.h | 20 +- lib/include/wmmintrin.h | 20 +- lib/include/x86intrin.h | 20 +- lib/include/xmmintrin.h | 40 +- lib/include/xopintrin.h | 20 +- lib/include/xsavecintrin.h | 20 +- lib/include/xsaveintrin.h | 39 +- lib/include/xsaveoptintrin.h | 20 +- lib/include/xsavesintrin.h | 20 +- lib/include/xtestintrin.h | 20 +- 130 files changed, 9519 insertions(+), 3542 deletions(-) create mode 100644 lib/include/avx512bf16intrin.h create mode 100644 lib/include/avx512vlbf16intrin.h create mode 100644 lib/include/avx512vlvp2intersectintrin.h create mode 100644 lib/include/avx512vp2intersectintrin.h create mode 100644 lib/include/enqcmdintrin.h create mode 100644 lib/include/opencl-c-base.h create mode 100644 lib/include/openmp_wrappers/__clang_openmp_math.h create mode 100644 lib/include/openmp_wrappers/__clang_openmp_math_declares.h create mode 100644 lib/include/openmp_wrappers/cmath create mode 100644 lib/include/openmp_wrappers/math.h create mode 100644 lib/include/ppc_wrappers/emmintrin.h create mode 100644 lib/include/ppc_wrappers/mm_malloc.h create mode 100644 lib/include/ppc_wrappers/mmintrin.h create mode 100644 lib/include/ppc_wrappers/xmmintrin.h diff --git a/lib/include/__clang_cuda_builtin_vars.h b/lib/include/__clang_cuda_builtin_vars.h index 290c4b298..2ba1521f2 100644 --- a/lib/include/__clang_cuda_builtin_vars.h +++ b/lib/include/__clang_cuda_builtin_vars.h @@ -1,22 +1,8 @@ /*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/__clang_cuda_cmath.h b/lib/include/__clang_cuda_cmath.h index 5331ba401..834a2e3fd 100644 --- a/lib/include/__clang_cuda_cmath.h +++ b/lib/include/__clang_cuda_cmath.h @@ -1,22 +1,8 @@ /*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -44,12 +30,32 @@ // implementation. Declaring in the global namespace and pulling into namespace // std covers all of the known knowns. +#ifdef _OPENMP +#define __DEVICE__ static __attribute__((always_inline)) +#else #define __DEVICE__ static __device__ __inline__ __attribute__((always_inline)) +#endif +// For C++ 17 we need to include noexcept attribute to be compatible +// with the header-defined version. This may be removed once +// variant is supported. +#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L +#define __NOEXCEPT noexcept +#else +#define __NOEXCEPT +#endif + +#if !(defined(_OPENMP) && defined(__cplusplus)) __DEVICE__ long long abs(long long __n) { return ::llabs(__n); } __DEVICE__ long abs(long __n) { return ::labs(__n); } __DEVICE__ float abs(float __x) { return ::fabsf(__x); } __DEVICE__ double abs(double __x) { return ::fabs(__x); } +#endif +// TODO: remove once variant is supported.
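(Before the const-qualified abs overloads that the TODO above refers to, a brief aside on how these cmath wrappers are consumed. The kernel below is a hypothetical sketch, not part of the patch; it assumes clang's CUDA mode, where __clang_cuda_runtime_wrapper.h pulls this header in automatically.)

/* Sketch: device code can call the standard math names directly. Here
 * fabs(float) resolves to the __DEVICE__ float overload defined above,
 * which forwards to ::fabsf and ultimately to libdevice's __nv_fabsf. */
__global__ void abs_kernel(float *out, const float *in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = fabs(in[i]); /* float overload; no silent promotion to double */
}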
+#if defined(_OPENMP) && defined(__cplusplus) +__DEVICE__ const float abs(const float __x) { return ::fabsf((float)__x); } +__DEVICE__ const double abs(const double __x) { return ::fabs((double)__x); } +#endif __DEVICE__ float acos(float __x) { return ::acosf(__x); } __DEVICE__ float asin(float __x) { return ::asinf(__x); } __DEVICE__ float atan(float __x) { return ::atanf(__x); } @@ -58,9 +64,11 @@ __DEVICE__ float ceil(float __x) { return ::ceilf(__x); } __DEVICE__ float cos(float __x) { return ::cosf(__x); } __DEVICE__ float cosh(float __x) { return ::coshf(__x); } __DEVICE__ float exp(float __x) { return ::expf(__x); } -__DEVICE__ float fabs(float __x) { return ::fabsf(__x); } +__DEVICE__ float fabs(float __x) __NOEXCEPT { return ::fabsf(__x); } __DEVICE__ float floor(float __x) { return ::floorf(__x); } __DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); } +// TODO: remove when variant is supported +#ifndef _OPENMP __DEVICE__ int fpclassify(float __x) { return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, __x); @@ -69,6 +77,7 @@ __DEVICE__ int fpclassify(double __x) { return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, __x); } +#endif __DEVICE__ float frexp(float __arg, int *__exp) { return ::frexpf(__arg, __exp); } @@ -448,7 +457,10 @@ using ::remainderf; using ::remquof; using ::rintf; using ::roundf; +// TODO: remove once variant is supported +#ifndef _OPENMP using ::scalblnf; +#endif using ::scalbnf; using ::sinf; using ::sinhf; @@ -467,6 +479,7 @@ _GLIBCXX_END_NAMESPACE_VERSION } // namespace std #endif +#undef __NOEXCEPT #undef __DEVICE__ #endif diff --git a/lib/include/__clang_cuda_complex_builtins.h b/lib/include/__clang_cuda_complex_builtins.h index beef7deff..576a958b1 100644 --- a/lib/include/__clang_cuda_complex_builtins.h +++ b/lib/include/__clang_cuda_complex_builtins.h @@ -1,22 +1,8 @@ /*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/__clang_cuda_device_functions.h b/lib/include/__clang_cuda_device_functions.h index 67bbc68b1..50ad674f9 100644 --- a/lib/include/__clang_cuda_device_functions.h +++ b/lib/include/__clang_cuda_device_functions.h @@ -1,22 +1,8 @@ /*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -24,15 +10,21 @@ #ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__ #define __CLANG_CUDA_DEVICE_FUNCTIONS_H__ +#ifndef _OPENMP #if CUDA_VERSION < 9000 #error This file is intended to be used with CUDA-9+ only. #endif +#endif // __DEVICE__ is a helper macro with common set of attributes for the wrappers // we implement in this file. We need static in order to avoid emitting unused // functions and __forceinline__ helps inlining these wrappers at -O1. #pragma push_macro("__DEVICE__") +#ifdef _OPENMP +#define __DEVICE__ static __attribute__((always_inline)) +#else #define __DEVICE__ static __device__ __forceinline__ +#endif // libdevice provides fast low precision and slow full-precision implementations // for some functions. Which one gets selected depends on @@ -45,6 +37,15 @@ #define __FAST_OR_SLOW(fast, slow) slow #endif +// For C++ 17 we need to include noexcept attribute to be compatible +// with the header-defined version. This may be removed once +// variant is supported.
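(Stepping back briefly to the __FAST_OR_SLOW definition a few lines up: a minimal sketch of the dispatch. The selector macro __CLANG_CUDA_APPROX_TRANSCENDENTALS__ sits in context lines the hunk elides; it is taken from the upstream header, and the logf wrapper shown mirrors one changed later in this patch.)

/* Sketch: clang predefines __CLANG_CUDA_APPROX_TRANSCENDENTALS__ when
 * -ffast-math or -fcuda-approx-transcendentals is in effect, selecting
 * the fast low-precision libdevice entry point; otherwise the fully
 * precise one is used. */
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
#define __FAST_OR_SLOW(fast, slow) fast
#else
#define __FAST_OR_SLOW(fast, slow) slow
#endif

/* A wrapper then expands to either __nv_fast_logf or __nv_logf: */
__DEVICE__ float logf(float __a) {
  return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
}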
+#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L +#define __NOEXCEPT noexcept +#else +#define __NOEXCEPT +#endif + __DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); } __DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); } __DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); } @@ -52,8 +53,13 @@ __DEVICE__ unsigned int __brev(unsigned int __a) { return __nv_brev(__a); } __DEVICE__ unsigned long long __brevll(unsigned long long __a) { return __nv_brevll(__a); } +#if defined(__cplusplus) __DEVICE__ void __brkpt() { asm volatile("brkpt;"); } __DEVICE__ void __brkpt(int __a) { __brkpt(); } +#else +__DEVICE__ void __attribute__((overloadable)) __brkpt(void) { asm volatile("brkpt;"); } +__DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); } +#endif __DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b, unsigned int __c) { return __nv_byte_perm(__a, __b, __c); @@ -237,6 +243,9 @@ __DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); } __DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); } __DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); } __DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); } +#ifdef _MSC_VER +__DEVICE__ int __finitel(long double __a); +#endif __DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); } __DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); } __DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); } @@ -445,8 +454,14 @@ __DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); } __DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); } __DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); } __DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); } +#ifdef _MSC_VER +__DEVICE__ int __isinfl(long double __a); +#endif __DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); } __DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); } +#ifdef _MSC_VER +__DEVICE__ int __isnanl(long double __a); +#endif __DEVICE__ double __ll2double_rd(long long __a) { return __nv_ll2double_rd(__a); } @@ -520,8 +535,8 @@ __DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) { __DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); } __DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); } __DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); } -__DEVICE__ void __sincosf(float __a, float *__sptr, float *__cptr) { - return __nv_fast_sincosf(__a, __sptr, __cptr); +__DEVICE__ void __sincosf(float __a, float *__s, float *__c) { + return __nv_fast_sincosf(__a, __s, __c); } __DEVICE__ float __sinf(float __a) { return __nv_fast_sinf(__a); } __DEVICE__ int __syncthreads_and(int __a) { return __nvvm_bar0_and(__a); } @@ -1468,7 +1483,8 @@ __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { return r; } #endif // CUDA_VERSION >= 9020 -__DEVICE__ int abs(int __a) { return __nv_abs(__a); } +__DEVICE__ int abs(int __a) __NOEXCEPT { return __nv_abs(__a); } +__DEVICE__ double fabs(double __a) __NOEXCEPT { return __nv_fabs(__a); } __DEVICE__ double acos(double __a) { return __nv_acos(__a); } __DEVICE__ float acosf(float __a) { return __nv_acosf(__a); } __DEVICE__ double acosh(double __a) { return __nv_acosh(__a); } @@ -1487,8 +1503,10 @@ __DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); } __DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); } __DEVICE__ double ceil(double __a) 
{ return __nv_ceil(__a); } __DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); } +#ifndef _OPENMP __DEVICE__ int clock() { return __nvvm_read_ptx_sreg_clock(); } __DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); } +#endif __DEVICE__ double copysign(double __a, double __b) { return __nv_copysign(__a, __b); } @@ -1525,7 +1543,6 @@ __DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); } __DEVICE__ float expf(float __a) { return __nv_expf(__a); } __DEVICE__ double expm1(double __a) { return __nv_expm1(__a); } __DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); } -__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); } __DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); } __DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); } __DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); } @@ -1563,16 +1580,16 @@ __DEVICE__ double j1(double __a) { return __nv_j1(__a); } __DEVICE__ float j1f(float __a) { return __nv_j1f(__a); } __DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); } __DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); } -#if defined(__LP64__) -__DEVICE__ long labs(long __a) { return llabs(__a); }; +#if defined(__LP64__) || defined(_WIN64) +__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_llabs(__a); }; #else -__DEVICE__ long labs(long __a) { return __nv_abs(__a); }; +__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_abs(__a); }; #endif __DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); } __DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); } __DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); } __DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); } -__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); } +__DEVICE__ long long llabs(long long __a) __NOEXCEPT { return __nv_llabs(__a); } __DEVICE__ long long llmax(long long __a, long long __b) { return __nv_llmax(__a, __b); } @@ -1597,7 +1614,7 @@ __DEVICE__ float logbf(float __a) { return __nv_logbf(__a); } __DEVICE__ float logf(float __a) { return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a); } -#if defined(__LP64__) +#if defined(__LP64__) || defined(_WIN64) __DEVICE__ long lrint(double __a) { return llrint(__a); } __DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); } __DEVICE__ long lround(double __a) { return llround(__a); } @@ -1609,12 +1626,16 @@ __DEVICE__ long lround(double __a) { return round(__a); } __DEVICE__ long lroundf(float __a) { return roundf(__a); } #endif __DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); } +// These functions shouldn't be declared when including this header +// for math function resolution purposes. 
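(The comment above introduces the #ifndef _OPENMP guard that follows. A short sketch of the reasoning, with explanatory comments added; the wrapper body is copied from the guarded block below.)

/* Sketch: under OpenMP this header is included only so that math calls
 * resolve to device-capable definitions, and the host's <string.h>
 * already declares memcpy/memset. Redeclaring them here as static
 * inline wrappers could clash with those declarations, so the guard
 * compiles them out except in CUDA mode. */
#ifndef _OPENMP
__DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) {
  return __builtin_memcpy(__a, __b, __c);
}
#endif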
+#ifndef _OPENMP __DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) { return __builtin_memcpy(__a, __b, __c); } __DEVICE__ void *memset(void *__a, int __b, size_t __c) { return __builtin_memset(__a, __b, __c); } +#endif __DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); } __DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); } __DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); } @@ -1698,6 +1719,8 @@ __DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); } __DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); } __DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); } __DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); } +// TODO: remove once variant is supported +#ifndef _OPENMP __DEVICE__ double scalbln(double __a, long __b) { if (__b > INT_MAX) return __a > 0 ? HUGE_VAL : -HUGE_VAL; @@ -1712,18 +1735,19 @@ __DEVICE__ float scalblnf(float __a, long __b) { return __a > 0 ? 0.f : -0.f; return scalbnf(__a, (int)__b); } +#endif __DEVICE__ double sin(double __a) { return __nv_sin(__a); } -__DEVICE__ void sincos(double __a, double *__sptr, double *__cptr) { - return __nv_sincos(__a, __sptr, __cptr); +__DEVICE__ void sincos(double __a, double *__s, double *__c) { + return __nv_sincos(__a, __s, __c); } -__DEVICE__ void sincosf(float __a, float *__sptr, float *__cptr) { - return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __sptr, __cptr); +__DEVICE__ void sincosf(float __a, float *__s, float *__c) { + return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c); } -__DEVICE__ void sincospi(double __a, double *__sptr, double *__cptr) { - return __nv_sincospi(__a, __sptr, __cptr); +__DEVICE__ void sincospi(double __a, double *__s, double *__c) { + return __nv_sincospi(__a, __s, __c); } -__DEVICE__ void sincospif(float __a, float *__sptr, float *__cptr) { - return __nv_sincospif(__a, __sptr, __cptr); +__DEVICE__ void sincospif(float __a, float *__s, float *__c) { + return __nv_sincospif(__a, __s, __c); } __DEVICE__ float sinf(float __a) { return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a); @@ -1763,6 +1787,7 @@ __DEVICE__ float y1f(float __a) { return __nv_y1f(__a); } __DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); } __DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); } +#undef __NOEXCEPT #pragma pop_macro("__DEVICE__") #pragma pop_macro("__FAST_OR_SLOW") #endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__ diff --git a/lib/include/__clang_cuda_intrinsics.h b/lib/include/__clang_cuda_intrinsics.h index 3c0cde94e..2970d17f8 100644 --- a/lib/include/__clang_cuda_intrinsics.h +++ b/lib/include/__clang_cuda_intrinsics.h @@ -1,22 +1,8 @@ /*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/__clang_cuda_libdevice_declares.h b/lib/include/__clang_cuda_libdevice_declares.h index 71df7f849..4d7035339 100644 --- a/lib/include/__clang_cuda_libdevice_declares.h +++ b/lib/include/__clang_cuda_libdevice_declares.h @@ -1,22 +1,8 @@ /*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -24,443 +10,453 @@ #ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__ #define __CLANG_CUDA_LIBDEVICE_DECLARES_H__ +#if defined(__cplusplus) extern "C" { +#endif -__device__ int __nv_abs(int __a); -__device__ double __nv_acos(double __a); -__device__ float __nv_acosf(float __a); -__device__ double __nv_acosh(double __a); -__device__ float __nv_acoshf(float __a); -__device__ double __nv_asin(double __a); -__device__ float __nv_asinf(float __a); -__device__ double __nv_asinh(double __a); -__device__ float __nv_asinhf(float __a); -__device__ double __nv_atan2(double __a, double __b); -__device__ float __nv_atan2f(float __a, float __b); -__device__ double __nv_atan(double __a); -__device__ float __nv_atanf(float __a); -__device__ double __nv_atanh(double __a); -__device__ float __nv_atanhf(float __a); -__device__ int __nv_brev(int __a); -__device__ long long __nv_brevll(long long __a); -__device__ int __nv_byte_perm(int __a, int __b, int __c); -__device__ double __nv_cbrt(double __a); -__device__ float __nv_cbrtf(float __a); -__device__ double __nv_ceil(double __a); -__device__ float __nv_ceilf(float __a); -__device__ int __nv_clz(int __a); -__device__ int __nv_clzll(long long __a); -__device__ double __nv_copysign(double __a, double __b); -__device__ float __nv_copysignf(float __a, float __b); -__device__ double __nv_cos(double __a); -__device__ float __nv_cosf(float __a); -__device__ double __nv_cosh(double __a); -__device__ float __nv_coshf(float __a); -__device__ double __nv_cospi(double __a); -__device__ float __nv_cospif(float __a); -__device__ double __nv_cyl_bessel_i0(double __a); -__device__ float __nv_cyl_bessel_i0f(float __a); -__device__ double __nv_cyl_bessel_i1(double __a); -__device__ float __nv_cyl_bessel_i1f(float __a); -__device__ double __nv_dadd_rd(double __a, double __b); -__device__ double __nv_dadd_rn(double __a, double __b); -__device__ double __nv_dadd_ru(double __a, double __b); -__device__ double __nv_dadd_rz(double __a, double __b); -__device__ double __nv_ddiv_rd(double __a, double __b); -__device__ double __nv_ddiv_rn(double __a, double __b); -__device__ double __nv_ddiv_ru(double __a, double __b); -__device__ double __nv_ddiv_rz(double __a, double __b); -__device__ double __nv_dmul_rd(double __a, double __b); -__device__ double __nv_dmul_rn(double __a, double __b); -__device__ double __nv_dmul_ru(double __a, double __b); -__device__ double __nv_dmul_rz(double __a, double __b); -__device__ float __nv_double2float_rd(double __a); -__device__ float __nv_double2float_rn(double __a); -__device__ float __nv_double2float_ru(double __a); -__device__ float __nv_double2float_rz(double __a); -__device__ int __nv_double2hiint(double __a); -__device__ int __nv_double2int_rd(double __a); -__device__ int __nv_double2int_rn(double __a); -__device__ int __nv_double2int_ru(double __a); -__device__ int __nv_double2int_rz(double __a); -__device__ long long __nv_double2ll_rd(double __a); -__device__ long long __nv_double2ll_rn(double __a); -__device__ long long __nv_double2ll_ru(double __a); -__device__ long long __nv_double2ll_rz(double __a); -__device__ int __nv_double2loint(double __a); -__device__ unsigned int __nv_double2uint_rd(double __a); -__device__ unsigned int __nv_double2uint_rn(double __a); -__device__ unsigned int __nv_double2uint_ru(double __a); -__device__ unsigned int __nv_double2uint_rz(double __a); -__device__ unsigned long long 
__nv_double2ull_rd(double __a); -__device__ unsigned long long __nv_double2ull_rn(double __a); -__device__ unsigned long long __nv_double2ull_ru(double __a); -__device__ unsigned long long __nv_double2ull_rz(double __a); -__device__ unsigned long long __nv_double_as_longlong(double __a); -__device__ double __nv_drcp_rd(double __a); -__device__ double __nv_drcp_rn(double __a); -__device__ double __nv_drcp_ru(double __a); -__device__ double __nv_drcp_rz(double __a); -__device__ double __nv_dsqrt_rd(double __a); -__device__ double __nv_dsqrt_rn(double __a); -__device__ double __nv_dsqrt_ru(double __a); -__device__ double __nv_dsqrt_rz(double __a); -__device__ double __nv_dsub_rd(double __a, double __b); -__device__ double __nv_dsub_rn(double __a, double __b); -__device__ double __nv_dsub_ru(double __a, double __b); -__device__ double __nv_dsub_rz(double __a, double __b); -__device__ double __nv_erfc(double __a); -__device__ float __nv_erfcf(float __a); -__device__ double __nv_erfcinv(double __a); -__device__ float __nv_erfcinvf(float __a); -__device__ double __nv_erfcx(double __a); -__device__ float __nv_erfcxf(float __a); -__device__ double __nv_erf(double __a); -__device__ float __nv_erff(float __a); -__device__ double __nv_erfinv(double __a); -__device__ float __nv_erfinvf(float __a); -__device__ double __nv_exp10(double __a); -__device__ float __nv_exp10f(float __a); -__device__ double __nv_exp2(double __a); -__device__ float __nv_exp2f(float __a); -__device__ double __nv_exp(double __a); -__device__ float __nv_expf(float __a); -__device__ double __nv_expm1(double __a); -__device__ float __nv_expm1f(float __a); -__device__ double __nv_fabs(double __a); -__device__ float __nv_fabsf(float __a); -__device__ float __nv_fadd_rd(float __a, float __b); -__device__ float __nv_fadd_rn(float __a, float __b); -__device__ float __nv_fadd_ru(float __a, float __b); -__device__ float __nv_fadd_rz(float __a, float __b); -__device__ float __nv_fast_cosf(float __a); -__device__ float __nv_fast_exp10f(float __a); -__device__ float __nv_fast_expf(float __a); -__device__ float __nv_fast_fdividef(float __a, float __b); -__device__ float __nv_fast_log10f(float __a); -__device__ float __nv_fast_log2f(float __a); -__device__ float __nv_fast_logf(float __a); -__device__ float __nv_fast_powf(float __a, float __b); -__device__ void __nv_fast_sincosf(float __a, float *__sptr, float *__cptr); -__device__ float __nv_fast_sinf(float __a); -__device__ float __nv_fast_tanf(float __a); -__device__ double __nv_fdim(double __a, double __b); -__device__ float __nv_fdimf(float __a, float __b); -__device__ float __nv_fdiv_rd(float __a, float __b); -__device__ float __nv_fdiv_rn(float __a, float __b); -__device__ float __nv_fdiv_ru(float __a, float __b); -__device__ float __nv_fdiv_rz(float __a, float __b); -__device__ int __nv_ffs(int __a); -__device__ int __nv_ffsll(long long __a); -__device__ int __nv_finitef(float __a); -__device__ unsigned short __nv_float2half_rn(float __a); -__device__ int __nv_float2int_rd(float __a); -__device__ int __nv_float2int_rn(float __a); -__device__ int __nv_float2int_ru(float __a); -__device__ int __nv_float2int_rz(float __a); -__device__ long long __nv_float2ll_rd(float __a); -__device__ long long __nv_float2ll_rn(float __a); -__device__ long long __nv_float2ll_ru(float __a); -__device__ long long __nv_float2ll_rz(float __a); -__device__ unsigned int __nv_float2uint_rd(float __a); -__device__ unsigned int __nv_float2uint_rn(float __a); -__device__ unsigned int __nv_float2uint_ru(float __a); 
-__device__ unsigned int __nv_float2uint_rz(float __a); -__device__ unsigned long long __nv_float2ull_rd(float __a); -__device__ unsigned long long __nv_float2ull_rn(float __a); -__device__ unsigned long long __nv_float2ull_ru(float __a); -__device__ unsigned long long __nv_float2ull_rz(float __a); -__device__ int __nv_float_as_int(float __a); -__device__ unsigned int __nv_float_as_uint(float __a); -__device__ double __nv_floor(double __a); -__device__ float __nv_floorf(float __a); -__device__ double __nv_fma(double __a, double __b, double __c); -__device__ float __nv_fmaf(float __a, float __b, float __c); -__device__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c); -__device__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c); -__device__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c); -__device__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c); -__device__ float __nv_fmaf_rd(float __a, float __b, float __c); -__device__ float __nv_fmaf_rn(float __a, float __b, float __c); -__device__ float __nv_fmaf_ru(float __a, float __b, float __c); -__device__ float __nv_fmaf_rz(float __a, float __b, float __c); -__device__ double __nv_fma_rd(double __a, double __b, double __c); -__device__ double __nv_fma_rn(double __a, double __b, double __c); -__device__ double __nv_fma_ru(double __a, double __b, double __c); -__device__ double __nv_fma_rz(double __a, double __b, double __c); -__device__ double __nv_fmax(double __a, double __b); -__device__ float __nv_fmaxf(float __a, float __b); -__device__ double __nv_fmin(double __a, double __b); -__device__ float __nv_fminf(float __a, float __b); -__device__ double __nv_fmod(double __a, double __b); -__device__ float __nv_fmodf(float __a, float __b); -__device__ float __nv_fmul_rd(float __a, float __b); -__device__ float __nv_fmul_rn(float __a, float __b); -__device__ float __nv_fmul_ru(float __a, float __b); -__device__ float __nv_fmul_rz(float __a, float __b); -__device__ float __nv_frcp_rd(float __a); -__device__ float __nv_frcp_rn(float __a); -__device__ float __nv_frcp_ru(float __a); -__device__ float __nv_frcp_rz(float __a); -__device__ double __nv_frexp(double __a, int *__b); -__device__ float __nv_frexpf(float __a, int *__b); -__device__ float __nv_frsqrt_rn(float __a); -__device__ float __nv_fsqrt_rd(float __a); -__device__ float __nv_fsqrt_rn(float __a); -__device__ float __nv_fsqrt_ru(float __a); -__device__ float __nv_fsqrt_rz(float __a); -__device__ float __nv_fsub_rd(float __a, float __b); -__device__ float __nv_fsub_rn(float __a, float __b); -__device__ float __nv_fsub_ru(float __a, float __b); -__device__ float __nv_fsub_rz(float __a, float __b); -__device__ int __nv_hadd(int __a, int __b); -__device__ float __nv_half2float(unsigned short __h); -__device__ double __nv_hiloint2double(int __a, int __b); -__device__ double __nv_hypot(double __a, double __b); -__device__ float __nv_hypotf(float __a, float __b); -__device__ int __nv_ilogb(double __a); -__device__ int __nv_ilogbf(float __a); -__device__ double __nv_int2double_rn(int __a); -__device__ float __nv_int2float_rd(int __a); -__device__ float __nv_int2float_rn(int __a); -__device__ float __nv_int2float_ru(int __a); -__device__ float __nv_int2float_rz(int __a); -__device__ float __nv_int_as_float(int __a); -__device__ int __nv_isfinited(double __a); -__device__ int __nv_isinfd(double __a); -__device__ int __nv_isinff(float __a); -__device__ int __nv_isnand(double __a); -__device__ int __nv_isnanf(float __a); -__device__ double __nv_j0(double __a); 
-__device__ float __nv_j0f(float __a); -__device__ double __nv_j1(double __a); -__device__ float __nv_j1f(float __a); -__device__ float __nv_jnf(int __a, float __b); -__device__ double __nv_jn(int __a, double __b); -__device__ double __nv_ldexp(double __a, int __b); -__device__ float __nv_ldexpf(float __a, int __b); -__device__ double __nv_lgamma(double __a); -__device__ float __nv_lgammaf(float __a); -__device__ double __nv_ll2double_rd(long long __a); -__device__ double __nv_ll2double_rn(long long __a); -__device__ double __nv_ll2double_ru(long long __a); -__device__ double __nv_ll2double_rz(long long __a); -__device__ float __nv_ll2float_rd(long long __a); -__device__ float __nv_ll2float_rn(long long __a); -__device__ float __nv_ll2float_ru(long long __a); -__device__ float __nv_ll2float_rz(long long __a); -__device__ long long __nv_llabs(long long __a); -__device__ long long __nv_llmax(long long __a, long long __b); -__device__ long long __nv_llmin(long long __a, long long __b); -__device__ long long __nv_llrint(double __a); -__device__ long long __nv_llrintf(float __a); -__device__ long long __nv_llround(double __a); -__device__ long long __nv_llroundf(float __a); -__device__ double __nv_log10(double __a); -__device__ float __nv_log10f(float __a); -__device__ double __nv_log1p(double __a); -__device__ float __nv_log1pf(float __a); -__device__ double __nv_log2(double __a); -__device__ float __nv_log2f(float __a); -__device__ double __nv_logb(double __a); -__device__ float __nv_logbf(float __a); -__device__ double __nv_log(double __a); -__device__ float __nv_logf(float __a); -__device__ double __nv_longlong_as_double(long long __a); -__device__ int __nv_max(int __a, int __b); -__device__ int __nv_min(int __a, int __b); -__device__ double __nv_modf(double __a, double *__b); -__device__ float __nv_modff(float __a, float *__b); -__device__ int __nv_mul24(int __a, int __b); -__device__ long long __nv_mul64hi(long long __a, long long __b); -__device__ int __nv_mulhi(int __a, int __b); -__device__ double __nv_nan(const signed char *__a); -__device__ float __nv_nanf(const signed char *__a); -__device__ double __nv_nearbyint(double __a); -__device__ float __nv_nearbyintf(float __a); -__device__ double __nv_nextafter(double __a, double __b); -__device__ float __nv_nextafterf(float __a, float __b); -__device__ double __nv_norm3d(double __a, double __b, double __c); -__device__ float __nv_norm3df(float __a, float __b, float __c); -__device__ double __nv_norm4d(double __a, double __b, double __c, double __d); -__device__ float __nv_norm4df(float __a, float __b, float __c, float __d); -__device__ double __nv_normcdf(double __a); -__device__ float __nv_normcdff(float __a); -__device__ double __nv_normcdfinv(double __a); -__device__ float __nv_normcdfinvf(float __a); -__device__ float __nv_normf(int __a, const float *__b); -__device__ double __nv_norm(int __a, const double *__b); -__device__ int __nv_popc(int __a); -__device__ int __nv_popcll(long long __a); -__device__ double __nv_pow(double __a, double __b); -__device__ float __nv_powf(float __a, float __b); -__device__ double __nv_powi(double __a, int __b); -__device__ float __nv_powif(float __a, int __b); -__device__ double __nv_rcbrt(double __a); -__device__ float __nv_rcbrtf(float __a); -__device__ double __nv_rcp64h(double __a); -__device__ double __nv_remainder(double __a, double __b); -__device__ float __nv_remainderf(float __a, float __b); -__device__ double __nv_remquo(double __a, double __b, int *__c); -__device__ float __nv_remquof(float 
__a, float __b, int *__c); -__device__ int __nv_rhadd(int __a, int __b); -__device__ double __nv_rhypot(double __a, double __b); -__device__ float __nv_rhypotf(float __a, float __b); -__device__ double __nv_rint(double __a); -__device__ float __nv_rintf(float __a); -__device__ double __nv_rnorm3d(double __a, double __b, double __c); -__device__ float __nv_rnorm3df(float __a, float __b, float __c); -__device__ double __nv_rnorm4d(double __a, double __b, double __c, double __d); -__device__ float __nv_rnorm4df(float __a, float __b, float __c, float __d); -__device__ float __nv_rnormf(int __a, const float *__b); -__device__ double __nv_rnorm(int __a, const double *__b); -__device__ double __nv_round(double __a); -__device__ float __nv_roundf(float __a); -__device__ double __nv_rsqrt(double __a); -__device__ float __nv_rsqrtf(float __a); -__device__ int __nv_sad(int __a, int __b, int __c); -__device__ float __nv_saturatef(float __a); -__device__ double __nv_scalbn(double __a, int __b); -__device__ float __nv_scalbnf(float __a, int __b); -__device__ int __nv_signbitd(double __a); -__device__ int __nv_signbitf(float __a); -__device__ void __nv_sincos(double __a, double *__b, double *__c); -__device__ void __nv_sincosf(float __a, float *__b, float *__c); -__device__ void __nv_sincospi(double __a, double *__b, double *__c); -__device__ void __nv_sincospif(float __a, float *__b, float *__c); -__device__ double __nv_sin(double __a); -__device__ float __nv_sinf(float __a); -__device__ double __nv_sinh(double __a); -__device__ float __nv_sinhf(float __a); -__device__ double __nv_sinpi(double __a); -__device__ float __nv_sinpif(float __a); -__device__ double __nv_sqrt(double __a); -__device__ float __nv_sqrtf(float __a); -__device__ double __nv_tan(double __a); -__device__ float __nv_tanf(float __a); -__device__ double __nv_tanh(double __a); -__device__ float __nv_tanhf(float __a); -__device__ double __nv_tgamma(double __a); -__device__ float __nv_tgammaf(float __a); -__device__ double __nv_trunc(double __a); -__device__ float __nv_truncf(float __a); -__device__ int __nv_uhadd(unsigned int __a, unsigned int __b); -__device__ double __nv_uint2double_rn(unsigned int __i); -__device__ float __nv_uint2float_rd(unsigned int __a); -__device__ float __nv_uint2float_rn(unsigned int __a); -__device__ float __nv_uint2float_ru(unsigned int __a); -__device__ float __nv_uint2float_rz(unsigned int __a); -__device__ float __nv_uint_as_float(unsigned int __a); -__device__ double __nv_ull2double_rd(unsigned long long __a); -__device__ double __nv_ull2double_rn(unsigned long long __a); -__device__ double __nv_ull2double_ru(unsigned long long __a); -__device__ double __nv_ull2double_rz(unsigned long long __a); -__device__ float __nv_ull2float_rd(unsigned long long __a); -__device__ float __nv_ull2float_rn(unsigned long long __a); -__device__ float __nv_ull2float_ru(unsigned long long __a); -__device__ float __nv_ull2float_rz(unsigned long long __a); -__device__ unsigned long long __nv_ullmax(unsigned long long __a, +#if defined(_OPENMP) +#define __DEVICE__ +#elif defined(__CUDA__) +#define __DEVICE__ __device__ +#endif + +__DEVICE__ int __nv_abs(int __a); +__DEVICE__ double __nv_acos(double __a); +__DEVICE__ float __nv_acosf(float __a); +__DEVICE__ double __nv_acosh(double __a); +__DEVICE__ float __nv_acoshf(float __a); +__DEVICE__ double __nv_asin(double __a); +__DEVICE__ float __nv_asinf(float __a); +__DEVICE__ double __nv_asinh(double __a); +__DEVICE__ float __nv_asinhf(float __a); +__DEVICE__ double __nv_atan2(double 
__a, double __b); +__DEVICE__ float __nv_atan2f(float __a, float __b); +__DEVICE__ double __nv_atan(double __a); +__DEVICE__ float __nv_atanf(float __a); +__DEVICE__ double __nv_atanh(double __a); +__DEVICE__ float __nv_atanhf(float __a); +__DEVICE__ int __nv_brev(int __a); +__DEVICE__ long long __nv_brevll(long long __a); +__DEVICE__ int __nv_byte_perm(int __a, int __b, int __c); +__DEVICE__ double __nv_cbrt(double __a); +__DEVICE__ float __nv_cbrtf(float __a); +__DEVICE__ double __nv_ceil(double __a); +__DEVICE__ float __nv_ceilf(float __a); +__DEVICE__ int __nv_clz(int __a); +__DEVICE__ int __nv_clzll(long long __a); +__DEVICE__ double __nv_copysign(double __a, double __b); +__DEVICE__ float __nv_copysignf(float __a, float __b); +__DEVICE__ double __nv_cos(double __a); +__DEVICE__ float __nv_cosf(float __a); +__DEVICE__ double __nv_cosh(double __a); +__DEVICE__ float __nv_coshf(float __a); +__DEVICE__ double __nv_cospi(double __a); +__DEVICE__ float __nv_cospif(float __a); +__DEVICE__ double __nv_cyl_bessel_i0(double __a); +__DEVICE__ float __nv_cyl_bessel_i0f(float __a); +__DEVICE__ double __nv_cyl_bessel_i1(double __a); +__DEVICE__ float __nv_cyl_bessel_i1f(float __a); +__DEVICE__ double __nv_dadd_rd(double __a, double __b); +__DEVICE__ double __nv_dadd_rn(double __a, double __b); +__DEVICE__ double __nv_dadd_ru(double __a, double __b); +__DEVICE__ double __nv_dadd_rz(double __a, double __b); +__DEVICE__ double __nv_ddiv_rd(double __a, double __b); +__DEVICE__ double __nv_ddiv_rn(double __a, double __b); +__DEVICE__ double __nv_ddiv_ru(double __a, double __b); +__DEVICE__ double __nv_ddiv_rz(double __a, double __b); +__DEVICE__ double __nv_dmul_rd(double __a, double __b); +__DEVICE__ double __nv_dmul_rn(double __a, double __b); +__DEVICE__ double __nv_dmul_ru(double __a, double __b); +__DEVICE__ double __nv_dmul_rz(double __a, double __b); +__DEVICE__ float __nv_double2float_rd(double __a); +__DEVICE__ float __nv_double2float_rn(double __a); +__DEVICE__ float __nv_double2float_ru(double __a); +__DEVICE__ float __nv_double2float_rz(double __a); +__DEVICE__ int __nv_double2hiint(double __a); +__DEVICE__ int __nv_double2int_rd(double __a); +__DEVICE__ int __nv_double2int_rn(double __a); +__DEVICE__ int __nv_double2int_ru(double __a); +__DEVICE__ int __nv_double2int_rz(double __a); +__DEVICE__ long long __nv_double2ll_rd(double __a); +__DEVICE__ long long __nv_double2ll_rn(double __a); +__DEVICE__ long long __nv_double2ll_ru(double __a); +__DEVICE__ long long __nv_double2ll_rz(double __a); +__DEVICE__ int __nv_double2loint(double __a); +__DEVICE__ unsigned int __nv_double2uint_rd(double __a); +__DEVICE__ unsigned int __nv_double2uint_rn(double __a); +__DEVICE__ unsigned int __nv_double2uint_ru(double __a); +__DEVICE__ unsigned int __nv_double2uint_rz(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rd(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rn(double __a); +__DEVICE__ unsigned long long __nv_double2ull_ru(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rz(double __a); +__DEVICE__ unsigned long long __nv_double_as_longlong(double __a); +__DEVICE__ double __nv_drcp_rd(double __a); +__DEVICE__ double __nv_drcp_rn(double __a); +__DEVICE__ double __nv_drcp_ru(double __a); +__DEVICE__ double __nv_drcp_rz(double __a); +__DEVICE__ double __nv_dsqrt_rd(double __a); +__DEVICE__ double __nv_dsqrt_rn(double __a); +__DEVICE__ double __nv_dsqrt_ru(double __a); +__DEVICE__ double __nv_dsqrt_rz(double __a); +__DEVICE__ double __nv_dsub_rd(double __a, double 
__b); +__DEVICE__ double __nv_dsub_rn(double __a, double __b); +__DEVICE__ double __nv_dsub_ru(double __a, double __b); +__DEVICE__ double __nv_dsub_rz(double __a, double __b); +__DEVICE__ double __nv_erfc(double __a); +__DEVICE__ float __nv_erfcf(float __a); +__DEVICE__ double __nv_erfcinv(double __a); +__DEVICE__ float __nv_erfcinvf(float __a); +__DEVICE__ double __nv_erfcx(double __a); +__DEVICE__ float __nv_erfcxf(float __a); +__DEVICE__ double __nv_erf(double __a); +__DEVICE__ float __nv_erff(float __a); +__DEVICE__ double __nv_erfinv(double __a); +__DEVICE__ float __nv_erfinvf(float __a); +__DEVICE__ double __nv_exp10(double __a); +__DEVICE__ float __nv_exp10f(float __a); +__DEVICE__ double __nv_exp2(double __a); +__DEVICE__ float __nv_exp2f(float __a); +__DEVICE__ double __nv_exp(double __a); +__DEVICE__ float __nv_expf(float __a); +__DEVICE__ double __nv_expm1(double __a); +__DEVICE__ float __nv_expm1f(float __a); +__DEVICE__ double __nv_fabs(double __a); +__DEVICE__ float __nv_fabsf(float __a); +__DEVICE__ float __nv_fadd_rd(float __a, float __b); +__DEVICE__ float __nv_fadd_rn(float __a, float __b); +__DEVICE__ float __nv_fadd_ru(float __a, float __b); +__DEVICE__ float __nv_fadd_rz(float __a, float __b); +__DEVICE__ float __nv_fast_cosf(float __a); +__DEVICE__ float __nv_fast_exp10f(float __a); +__DEVICE__ float __nv_fast_expf(float __a); +__DEVICE__ float __nv_fast_fdividef(float __a, float __b); +__DEVICE__ float __nv_fast_log10f(float __a); +__DEVICE__ float __nv_fast_log2f(float __a); +__DEVICE__ float __nv_fast_logf(float __a); +__DEVICE__ float __nv_fast_powf(float __a, float __b); +__DEVICE__ void __nv_fast_sincosf(float __a, float *__s, float *__c); +__DEVICE__ float __nv_fast_sinf(float __a); +__DEVICE__ float __nv_fast_tanf(float __a); +__DEVICE__ double __nv_fdim(double __a, double __b); +__DEVICE__ float __nv_fdimf(float __a, float __b); +__DEVICE__ float __nv_fdiv_rd(float __a, float __b); +__DEVICE__ float __nv_fdiv_rn(float __a, float __b); +__DEVICE__ float __nv_fdiv_ru(float __a, float __b); +__DEVICE__ float __nv_fdiv_rz(float __a, float __b); +__DEVICE__ int __nv_ffs(int __a); +__DEVICE__ int __nv_ffsll(long long __a); +__DEVICE__ int __nv_finitef(float __a); +__DEVICE__ unsigned short __nv_float2half_rn(float __a); +__DEVICE__ int __nv_float2int_rd(float __a); +__DEVICE__ int __nv_float2int_rn(float __a); +__DEVICE__ int __nv_float2int_ru(float __a); +__DEVICE__ int __nv_float2int_rz(float __a); +__DEVICE__ long long __nv_float2ll_rd(float __a); +__DEVICE__ long long __nv_float2ll_rn(float __a); +__DEVICE__ long long __nv_float2ll_ru(float __a); +__DEVICE__ long long __nv_float2ll_rz(float __a); +__DEVICE__ unsigned int __nv_float2uint_rd(float __a); +__DEVICE__ unsigned int __nv_float2uint_rn(float __a); +__DEVICE__ unsigned int __nv_float2uint_ru(float __a); +__DEVICE__ unsigned int __nv_float2uint_rz(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rd(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rn(float __a); +__DEVICE__ unsigned long long __nv_float2ull_ru(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rz(float __a); +__DEVICE__ int __nv_float_as_int(float __a); +__DEVICE__ unsigned int __nv_float_as_uint(float __a); +__DEVICE__ double __nv_floor(double __a); +__DEVICE__ float __nv_floorf(float __a); +__DEVICE__ double __nv_fma(double __a, double __b, double __c); +__DEVICE__ float __nv_fmaf(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c); +__DEVICE__ float 
__nv_fmaf_ieee_rn(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rd(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rn(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ru(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rz(float __a, float __b, float __c); +__DEVICE__ double __nv_fma_rd(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_rn(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_ru(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_rz(double __a, double __b, double __c); +__DEVICE__ double __nv_fmax(double __a, double __b); +__DEVICE__ float __nv_fmaxf(float __a, float __b); +__DEVICE__ double __nv_fmin(double __a, double __b); +__DEVICE__ float __nv_fminf(float __a, float __b); +__DEVICE__ double __nv_fmod(double __a, double __b); +__DEVICE__ float __nv_fmodf(float __a, float __b); +__DEVICE__ float __nv_fmul_rd(float __a, float __b); +__DEVICE__ float __nv_fmul_rn(float __a, float __b); +__DEVICE__ float __nv_fmul_ru(float __a, float __b); +__DEVICE__ float __nv_fmul_rz(float __a, float __b); +__DEVICE__ float __nv_frcp_rd(float __a); +__DEVICE__ float __nv_frcp_rn(float __a); +__DEVICE__ float __nv_frcp_ru(float __a); +__DEVICE__ float __nv_frcp_rz(float __a); +__DEVICE__ double __nv_frexp(double __a, int *__b); +__DEVICE__ float __nv_frexpf(float __a, int *__b); +__DEVICE__ float __nv_frsqrt_rn(float __a); +__DEVICE__ float __nv_fsqrt_rd(float __a); +__DEVICE__ float __nv_fsqrt_rn(float __a); +__DEVICE__ float __nv_fsqrt_ru(float __a); +__DEVICE__ float __nv_fsqrt_rz(float __a); +__DEVICE__ float __nv_fsub_rd(float __a, float __b); +__DEVICE__ float __nv_fsub_rn(float __a, float __b); +__DEVICE__ float __nv_fsub_ru(float __a, float __b); +__DEVICE__ float __nv_fsub_rz(float __a, float __b); +__DEVICE__ int __nv_hadd(int __a, int __b); +__DEVICE__ float __nv_half2float(unsigned short __h); +__DEVICE__ double __nv_hiloint2double(int __a, int __b); +__DEVICE__ double __nv_hypot(double __a, double __b); +__DEVICE__ float __nv_hypotf(float __a, float __b); +__DEVICE__ int __nv_ilogb(double __a); +__DEVICE__ int __nv_ilogbf(float __a); +__DEVICE__ double __nv_int2double_rn(int __a); +__DEVICE__ float __nv_int2float_rd(int __a); +__DEVICE__ float __nv_int2float_rn(int __a); +__DEVICE__ float __nv_int2float_ru(int __a); +__DEVICE__ float __nv_int2float_rz(int __a); +__DEVICE__ float __nv_int_as_float(int __a); +__DEVICE__ int __nv_isfinited(double __a); +__DEVICE__ int __nv_isinfd(double __a); +__DEVICE__ int __nv_isinff(float __a); +__DEVICE__ int __nv_isnand(double __a); +__DEVICE__ int __nv_isnanf(float __a); +__DEVICE__ double __nv_j0(double __a); +__DEVICE__ float __nv_j0f(float __a); +__DEVICE__ double __nv_j1(double __a); +__DEVICE__ float __nv_j1f(float __a); +__DEVICE__ float __nv_jnf(int __a, float __b); +__DEVICE__ double __nv_jn(int __a, double __b); +__DEVICE__ double __nv_ldexp(double __a, int __b); +__DEVICE__ float __nv_ldexpf(float __a, int __b); +__DEVICE__ double __nv_lgamma(double __a); +__DEVICE__ float __nv_lgammaf(float __a); +__DEVICE__ double __nv_ll2double_rd(long long __a); +__DEVICE__ double __nv_ll2double_rn(long long __a); +__DEVICE__ double __nv_ll2double_ru(long long __a); +__DEVICE__ double __nv_ll2double_rz(long long __a); +__DEVICE__ float __nv_ll2float_rd(long long __a); +__DEVICE__ float __nv_ll2float_rn(long long 
__a); +__DEVICE__ float __nv_ll2float_ru(long long __a); +__DEVICE__ float __nv_ll2float_rz(long long __a); +__DEVICE__ long long __nv_llabs(long long __a); +__DEVICE__ long long __nv_llmax(long long __a, long long __b); +__DEVICE__ long long __nv_llmin(long long __a, long long __b); +__DEVICE__ long long __nv_llrint(double __a); +__DEVICE__ long long __nv_llrintf(float __a); +__DEVICE__ long long __nv_llround(double __a); +__DEVICE__ long long __nv_llroundf(float __a); +__DEVICE__ double __nv_log10(double __a); +__DEVICE__ float __nv_log10f(float __a); +__DEVICE__ double __nv_log1p(double __a); +__DEVICE__ float __nv_log1pf(float __a); +__DEVICE__ double __nv_log2(double __a); +__DEVICE__ float __nv_log2f(float __a); +__DEVICE__ double __nv_logb(double __a); +__DEVICE__ float __nv_logbf(float __a); +__DEVICE__ double __nv_log(double __a); +__DEVICE__ float __nv_logf(float __a); +__DEVICE__ double __nv_longlong_as_double(long long __a); +__DEVICE__ int __nv_max(int __a, int __b); +__DEVICE__ int __nv_min(int __a, int __b); +__DEVICE__ double __nv_modf(double __a, double *__b); +__DEVICE__ float __nv_modff(float __a, float *__b); +__DEVICE__ int __nv_mul24(int __a, int __b); +__DEVICE__ long long __nv_mul64hi(long long __a, long long __b); +__DEVICE__ int __nv_mulhi(int __a, int __b); +__DEVICE__ double __nv_nan(const signed char *__a); +__DEVICE__ float __nv_nanf(const signed char *__a); +__DEVICE__ double __nv_nearbyint(double __a); +__DEVICE__ float __nv_nearbyintf(float __a); +__DEVICE__ double __nv_nextafter(double __a, double __b); +__DEVICE__ float __nv_nextafterf(float __a, float __b); +__DEVICE__ double __nv_norm3d(double __a, double __b, double __c); +__DEVICE__ float __nv_norm3df(float __a, float __b, float __c); +__DEVICE__ double __nv_norm4d(double __a, double __b, double __c, double __d); +__DEVICE__ float __nv_norm4df(float __a, float __b, float __c, float __d); +__DEVICE__ double __nv_normcdf(double __a); +__DEVICE__ float __nv_normcdff(float __a); +__DEVICE__ double __nv_normcdfinv(double __a); +__DEVICE__ float __nv_normcdfinvf(float __a); +__DEVICE__ float __nv_normf(int __a, const float *__b); +__DEVICE__ double __nv_norm(int __a, const double *__b); +__DEVICE__ int __nv_popc(int __a); +__DEVICE__ int __nv_popcll(long long __a); +__DEVICE__ double __nv_pow(double __a, double __b); +__DEVICE__ float __nv_powf(float __a, float __b); +__DEVICE__ double __nv_powi(double __a, int __b); +__DEVICE__ float __nv_powif(float __a, int __b); +__DEVICE__ double __nv_rcbrt(double __a); +__DEVICE__ float __nv_rcbrtf(float __a); +__DEVICE__ double __nv_rcp64h(double __a); +__DEVICE__ double __nv_remainder(double __a, double __b); +__DEVICE__ float __nv_remainderf(float __a, float __b); +__DEVICE__ double __nv_remquo(double __a, double __b, int *__c); +__DEVICE__ float __nv_remquof(float __a, float __b, int *__c); +__DEVICE__ int __nv_rhadd(int __a, int __b); +__DEVICE__ double __nv_rhypot(double __a, double __b); +__DEVICE__ float __nv_rhypotf(float __a, float __b); +__DEVICE__ double __nv_rint(double __a); +__DEVICE__ float __nv_rintf(float __a); +__DEVICE__ double __nv_rnorm3d(double __a, double __b, double __c); +__DEVICE__ float __nv_rnorm3df(float __a, float __b, float __c); +__DEVICE__ double __nv_rnorm4d(double __a, double __b, double __c, double __d); +__DEVICE__ float __nv_rnorm4df(float __a, float __b, float __c, float __d); +__DEVICE__ float __nv_rnormf(int __a, const float *__b); +__DEVICE__ double __nv_rnorm(int __a, const double *__b); +__DEVICE__ double __nv_round(double 
__a); +__DEVICE__ float __nv_roundf(float __a); +__DEVICE__ double __nv_rsqrt(double __a); +__DEVICE__ float __nv_rsqrtf(float __a); +__DEVICE__ int __nv_sad(int __a, int __b, int __c); +__DEVICE__ float __nv_saturatef(float __a); +__DEVICE__ double __nv_scalbn(double __a, int __b); +__DEVICE__ float __nv_scalbnf(float __a, int __b); +__DEVICE__ int __nv_signbitd(double __a); +__DEVICE__ int __nv_signbitf(float __a); +__DEVICE__ void __nv_sincos(double __a, double *__b, double *__c); +__DEVICE__ void __nv_sincosf(float __a, float *__b, float *__c); +__DEVICE__ void __nv_sincospi(double __a, double *__b, double *__c); +__DEVICE__ void __nv_sincospif(float __a, float *__b, float *__c); +__DEVICE__ double __nv_sin(double __a); +__DEVICE__ float __nv_sinf(float __a); +__DEVICE__ double __nv_sinh(double __a); +__DEVICE__ float __nv_sinhf(float __a); +__DEVICE__ double __nv_sinpi(double __a); +__DEVICE__ float __nv_sinpif(float __a); +__DEVICE__ double __nv_sqrt(double __a); +__DEVICE__ float __nv_sqrtf(float __a); +__DEVICE__ double __nv_tan(double __a); +__DEVICE__ float __nv_tanf(float __a); +__DEVICE__ double __nv_tanh(double __a); +__DEVICE__ float __nv_tanhf(float __a); +__DEVICE__ double __nv_tgamma(double __a); +__DEVICE__ float __nv_tgammaf(float __a); +__DEVICE__ double __nv_trunc(double __a); +__DEVICE__ float __nv_truncf(float __a); +__DEVICE__ int __nv_uhadd(unsigned int __a, unsigned int __b); +__DEVICE__ double __nv_uint2double_rn(unsigned int __i); +__DEVICE__ float __nv_uint2float_rd(unsigned int __a); +__DEVICE__ float __nv_uint2float_rn(unsigned int __a); +__DEVICE__ float __nv_uint2float_ru(unsigned int __a); +__DEVICE__ float __nv_uint2float_rz(unsigned int __a); +__DEVICE__ float __nv_uint_as_float(unsigned int __a); +__DEVICE__ double __nv_ull2double_rd(unsigned long long __a); +__DEVICE__ double __nv_ull2double_rn(unsigned long long __a); +__DEVICE__ double __nv_ull2double_ru(unsigned long long __a); +__DEVICE__ double __nv_ull2double_rz(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rd(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rn(unsigned long long __a); +__DEVICE__ float __nv_ull2float_ru(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rz(unsigned long long __a); +__DEVICE__ unsigned long long __nv_ullmax(unsigned long long __a, unsigned long long __b); -__device__ unsigned long long __nv_ullmin(unsigned long long __a, +__DEVICE__ unsigned long long __nv_ullmin(unsigned long long __a, unsigned long long __b); -__device__ unsigned int __nv_umax(unsigned int __a, unsigned int __b); -__device__ unsigned int __nv_umin(unsigned int __a, unsigned int __b); -__device__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b); -__device__ unsigned long long __nv_umul64hi(unsigned long long __a, +__DEVICE__ unsigned int __nv_umax(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_umin(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned long long __nv_umul64hi(unsigned long long __a, unsigned long long __b); -__device__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b); -__device__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b); -__device__ unsigned int __nv_usad(unsigned int __a, unsigned int __b, +__DEVICE__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_usad(unsigned int __a, unsigned int __b, 
unsigned int __c); #if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020 -__device__ int __nv_vabs2(int __a); -__device__ int __nv_vabs4(int __a); -__device__ int __nv_vabsdiffs2(int __a, int __b); -__device__ int __nv_vabsdiffs4(int __a, int __b); -__device__ int __nv_vabsdiffu2(int __a, int __b); -__device__ int __nv_vabsdiffu4(int __a, int __b); -__device__ int __nv_vabsss2(int __a); -__device__ int __nv_vabsss4(int __a); -__device__ int __nv_vadd2(int __a, int __b); -__device__ int __nv_vadd4(int __a, int __b); -__device__ int __nv_vaddss2(int __a, int __b); -__device__ int __nv_vaddss4(int __a, int __b); -__device__ int __nv_vaddus2(int __a, int __b); -__device__ int __nv_vaddus4(int __a, int __b); -__device__ int __nv_vavgs2(int __a, int __b); -__device__ int __nv_vavgs4(int __a, int __b); -__device__ int __nv_vavgu2(int __a, int __b); -__device__ int __nv_vavgu4(int __a, int __b); -__device__ int __nv_vcmpeq2(int __a, int __b); -__device__ int __nv_vcmpeq4(int __a, int __b); -__device__ int __nv_vcmpges2(int __a, int __b); -__device__ int __nv_vcmpges4(int __a, int __b); -__device__ int __nv_vcmpgeu2(int __a, int __b); -__device__ int __nv_vcmpgeu4(int __a, int __b); -__device__ int __nv_vcmpgts2(int __a, int __b); -__device__ int __nv_vcmpgts4(int __a, int __b); -__device__ int __nv_vcmpgtu2(int __a, int __b); -__device__ int __nv_vcmpgtu4(int __a, int __b); -__device__ int __nv_vcmples2(int __a, int __b); -__device__ int __nv_vcmples4(int __a, int __b); -__device__ int __nv_vcmpleu2(int __a, int __b); -__device__ int __nv_vcmpleu4(int __a, int __b); -__device__ int __nv_vcmplts2(int __a, int __b); -__device__ int __nv_vcmplts4(int __a, int __b); -__device__ int __nv_vcmpltu2(int __a, int __b); -__device__ int __nv_vcmpltu4(int __a, int __b); -__device__ int __nv_vcmpne2(int __a, int __b); -__device__ int __nv_vcmpne4(int __a, int __b); -__device__ int __nv_vhaddu2(int __a, int __b); -__device__ int __nv_vhaddu4(int __a, int __b); -__device__ int __nv_vmaxs2(int __a, int __b); -__device__ int __nv_vmaxs4(int __a, int __b); -__device__ int __nv_vmaxu2(int __a, int __b); -__device__ int __nv_vmaxu4(int __a, int __b); -__device__ int __nv_vmins2(int __a, int __b); -__device__ int __nv_vmins4(int __a, int __b); -__device__ int __nv_vminu2(int __a, int __b); -__device__ int __nv_vminu4(int __a, int __b); -__device__ int __nv_vneg2(int __a); -__device__ int __nv_vneg4(int __a); -__device__ int __nv_vnegss2(int __a); -__device__ int __nv_vnegss4(int __a); -__device__ int __nv_vsads2(int __a, int __b); -__device__ int __nv_vsads4(int __a, int __b); -__device__ int __nv_vsadu2(int __a, int __b); -__device__ int __nv_vsadu4(int __a, int __b); -__device__ int __nv_vseteq2(int __a, int __b); -__device__ int __nv_vseteq4(int __a, int __b); -__device__ int __nv_vsetges2(int __a, int __b); -__device__ int __nv_vsetges4(int __a, int __b); -__device__ int __nv_vsetgeu2(int __a, int __b); -__device__ int __nv_vsetgeu4(int __a, int __b); -__device__ int __nv_vsetgts2(int __a, int __b); -__device__ int __nv_vsetgts4(int __a, int __b); -__device__ int __nv_vsetgtu2(int __a, int __b); -__device__ int __nv_vsetgtu4(int __a, int __b); -__device__ int __nv_vsetles2(int __a, int __b); -__device__ int __nv_vsetles4(int __a, int __b); -__device__ int __nv_vsetleu2(int __a, int __b); -__device__ int __nv_vsetleu4(int __a, int __b); -__device__ int __nv_vsetlts2(int __a, int __b); -__device__ int __nv_vsetlts4(int __a, int __b); -__device__ int __nv_vsetltu2(int __a, int __b); -__device__ int __nv_vsetltu4(int __a, 
int __b); -__device__ int __nv_vsetne2(int __a, int __b); -__device__ int __nv_vsetne4(int __a, int __b); -__device__ int __nv_vsub2(int __a, int __b); -__device__ int __nv_vsub4(int __a, int __b); -__device__ int __nv_vsubss2(int __a, int __b); -__device__ int __nv_vsubss4(int __a, int __b); -__device__ int __nv_vsubus2(int __a, int __b); -__device__ int __nv_vsubus4(int __a, int __b); +__DEVICE__ int __nv_vabs2(int __a); +__DEVICE__ int __nv_vabs4(int __a); +__DEVICE__ int __nv_vabsdiffs2(int __a, int __b); +__DEVICE__ int __nv_vabsdiffs4(int __a, int __b); +__DEVICE__ int __nv_vabsdiffu2(int __a, int __b); +__DEVICE__ int __nv_vabsdiffu4(int __a, int __b); +__DEVICE__ int __nv_vabsss2(int __a); +__DEVICE__ int __nv_vabsss4(int __a); +__DEVICE__ int __nv_vadd2(int __a, int __b); +__DEVICE__ int __nv_vadd4(int __a, int __b); +__DEVICE__ int __nv_vaddss2(int __a, int __b); +__DEVICE__ int __nv_vaddss4(int __a, int __b); +__DEVICE__ int __nv_vaddus2(int __a, int __b); +__DEVICE__ int __nv_vaddus4(int __a, int __b); +__DEVICE__ int __nv_vavgs2(int __a, int __b); +__DEVICE__ int __nv_vavgs4(int __a, int __b); +__DEVICE__ int __nv_vavgu2(int __a, int __b); +__DEVICE__ int __nv_vavgu4(int __a, int __b); +__DEVICE__ int __nv_vcmpeq2(int __a, int __b); +__DEVICE__ int __nv_vcmpeq4(int __a, int __b); +__DEVICE__ int __nv_vcmpges2(int __a, int __b); +__DEVICE__ int __nv_vcmpges4(int __a, int __b); +__DEVICE__ int __nv_vcmpgeu2(int __a, int __b); +__DEVICE__ int __nv_vcmpgeu4(int __a, int __b); +__DEVICE__ int __nv_vcmpgts2(int __a, int __b); +__DEVICE__ int __nv_vcmpgts4(int __a, int __b); +__DEVICE__ int __nv_vcmpgtu2(int __a, int __b); +__DEVICE__ int __nv_vcmpgtu4(int __a, int __b); +__DEVICE__ int __nv_vcmples2(int __a, int __b); +__DEVICE__ int __nv_vcmples4(int __a, int __b); +__DEVICE__ int __nv_vcmpleu2(int __a, int __b); +__DEVICE__ int __nv_vcmpleu4(int __a, int __b); +__DEVICE__ int __nv_vcmplts2(int __a, int __b); +__DEVICE__ int __nv_vcmplts4(int __a, int __b); +__DEVICE__ int __nv_vcmpltu2(int __a, int __b); +__DEVICE__ int __nv_vcmpltu4(int __a, int __b); +__DEVICE__ int __nv_vcmpne2(int __a, int __b); +__DEVICE__ int __nv_vcmpne4(int __a, int __b); +__DEVICE__ int __nv_vhaddu2(int __a, int __b); +__DEVICE__ int __nv_vhaddu4(int __a, int __b); +__DEVICE__ int __nv_vmaxs2(int __a, int __b); +__DEVICE__ int __nv_vmaxs4(int __a, int __b); +__DEVICE__ int __nv_vmaxu2(int __a, int __b); +__DEVICE__ int __nv_vmaxu4(int __a, int __b); +__DEVICE__ int __nv_vmins2(int __a, int __b); +__DEVICE__ int __nv_vmins4(int __a, int __b); +__DEVICE__ int __nv_vminu2(int __a, int __b); +__DEVICE__ int __nv_vminu4(int __a, int __b); +__DEVICE__ int __nv_vneg2(int __a); +__DEVICE__ int __nv_vneg4(int __a); +__DEVICE__ int __nv_vnegss2(int __a); +__DEVICE__ int __nv_vnegss4(int __a); +__DEVICE__ int __nv_vsads2(int __a, int __b); +__DEVICE__ int __nv_vsads4(int __a, int __b); +__DEVICE__ int __nv_vsadu2(int __a, int __b); +__DEVICE__ int __nv_vsadu4(int __a, int __b); +__DEVICE__ int __nv_vseteq2(int __a, int __b); +__DEVICE__ int __nv_vseteq4(int __a, int __b); +__DEVICE__ int __nv_vsetges2(int __a, int __b); +__DEVICE__ int __nv_vsetges4(int __a, int __b); +__DEVICE__ int __nv_vsetgeu2(int __a, int __b); +__DEVICE__ int __nv_vsetgeu4(int __a, int __b); +__DEVICE__ int __nv_vsetgts2(int __a, int __b); +__DEVICE__ int __nv_vsetgts4(int __a, int __b); +__DEVICE__ int __nv_vsetgtu2(int __a, int __b); +__DEVICE__ int __nv_vsetgtu4(int __a, int __b); +__DEVICE__ int __nv_vsetles2(int __a, int __b); +__DEVICE__ 
int __nv_vsetles4(int __a, int __b); +__DEVICE__ int __nv_vsetleu2(int __a, int __b); +__DEVICE__ int __nv_vsetleu4(int __a, int __b); +__DEVICE__ int __nv_vsetlts2(int __a, int __b); +__DEVICE__ int __nv_vsetlts4(int __a, int __b); +__DEVICE__ int __nv_vsetltu2(int __a, int __b); +__DEVICE__ int __nv_vsetltu4(int __a, int __b); +__DEVICE__ int __nv_vsetne2(int __a, int __b); +__DEVICE__ int __nv_vsetne4(int __a, int __b); +__DEVICE__ int __nv_vsub2(int __a, int __b); +__DEVICE__ int __nv_vsub4(int __a, int __b); +__DEVICE__ int __nv_vsubss2(int __a, int __b); +__DEVICE__ int __nv_vsubss4(int __a, int __b); +__DEVICE__ int __nv_vsubus2(int __a, int __b); +__DEVICE__ int __nv_vsubus4(int __a, int __b); #endif // CUDA_VERSION -__device__ double __nv_y0(double __a); -__device__ float __nv_y0f(float __a); -__device__ double __nv_y1(double __a); -__device__ float __nv_y1f(float __a); -__device__ float __nv_ynf(int __a, float __b); -__device__ double __nv_yn(int __a, double __b); +__DEVICE__ double __nv_y0(double __a); +__DEVICE__ float __nv_y0f(float __a); +__DEVICE__ double __nv_y1(double __a); +__DEVICE__ float __nv_y1f(float __a); +__DEVICE__ float __nv_ynf(int __a, float __b); +__DEVICE__ double __nv_yn(int __a, double __b); +#if defined(__cplusplus) } // extern "C" +#endif #endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__ diff --git a/lib/include/__clang_cuda_math_forward_declares.h b/lib/include/__clang_cuda_math_forward_declares.h index c31b1f4cd..0afe4db55 100644 --- a/lib/include/__clang_cuda_math_forward_declares.h +++ b/lib/include/__clang_cuda_math_forward_declares.h @@ -1,22 +1,8 @@ /*===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -34,14 +20,37 @@ // would preclude the use of our own __device__ overloads for these functions. 
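The libdevice hunks above make two mechanical changes: every hard-coded __device__ qualifier becomes a __DEVICE__ macro, and the extern "C" wrapper is now closed conditionally on __cplusplus, so the declarations remain usable from plain C and from the OpenMP wrappers. A minimal sketch of how these declarations are presumably meant to expand, assuming the macro setup near the top of this header (not visible in this hunk):

/* Sketch only; the authoritative definitions live in the header itself. */
#if defined(__cplusplus)
extern "C" {
#endif

#if defined(_OPENMP)
#define __DEVICE__            /* plain C prototype for OpenMP offloading */
#elif defined(__CUDA__)
#define __DEVICE__ __device__ /* device-side declaration under CUDA */
#endif

__DEVICE__ float __nv_sqrtf(float __a); /* CUDA: __device__ float __nv_sqrtf(float __a); */

#if defined(__cplusplus)
} /* extern "C" */
#endif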
#pragma push_macro("__DEVICE__") +#ifdef _OPENMP +#define __DEVICE__ static __inline__ __attribute__((always_inline)) +#else #define __DEVICE__ \ static __inline__ __attribute__((always_inline)) __attribute__((device)) +#endif -__DEVICE__ double abs(double); -__DEVICE__ float abs(float); -__DEVICE__ int abs(int); +// For C++ 17 we need to include noexcept attribute to be compatible +// with the header-defined version. This may be removed once +// variant is supported. +#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L +#define __NOEXCEPT noexcept +#else +#define __NOEXCEPT +#endif + +#if !(defined(_OPENMP) && defined(__cplusplus)) __DEVICE__ long abs(long); __DEVICE__ long long abs(long long); +__DEVICE__ double abs(double); +__DEVICE__ float abs(float); +#endif +// While providing the CUDA declarations and definitions for math functions, +// we may manually define additional functions. +// TODO: Once variant is supported the additional functions will have +// to be removed. +#if defined(_OPENMP) && defined(__cplusplus) +__DEVICE__ const double abs(const double); +__DEVICE__ const float abs(const float); +#endif +__DEVICE__ int abs(int) __NOEXCEPT; __DEVICE__ double acos(double); __DEVICE__ float acos(float); __DEVICE__ double acosh(double); @@ -76,8 +85,8 @@ __DEVICE__ double exp(double); __DEVICE__ float exp(float); __DEVICE__ double expm1(double); __DEVICE__ float expm1(float); -__DEVICE__ double fabs(double); -__DEVICE__ float fabs(float); +__DEVICE__ double fabs(double) __NOEXCEPT; +__DEVICE__ float fabs(float) __NOEXCEPT; __DEVICE__ double fdim(double, double); __DEVICE__ float fdim(float, float); __DEVICE__ double floor(double); @@ -98,12 +107,18 @@ __DEVICE__ double hypot(double, double); __DEVICE__ float hypot(float, float); __DEVICE__ int ilogb(double); __DEVICE__ int ilogb(float); +#ifdef _MSC_VER +__DEVICE__ bool isfinite(long double); +#endif __DEVICE__ bool isfinite(double); __DEVICE__ bool isfinite(float); __DEVICE__ bool isgreater(double, double); __DEVICE__ bool isgreaterequal(double, double); __DEVICE__ bool isgreaterequal(float, float); __DEVICE__ bool isgreater(float, float); +#ifdef _MSC_VER +__DEVICE__ bool isinf(long double); +#endif __DEVICE__ bool isinf(double); __DEVICE__ bool isinf(float); __DEVICE__ bool isless(double, double); @@ -112,18 +127,21 @@ __DEVICE__ bool islessequal(float, float); __DEVICE__ bool isless(float, float); __DEVICE__ bool islessgreater(double, double); __DEVICE__ bool islessgreater(float, float); +#ifdef _MSC_VER +__DEVICE__ bool isnan(long double); +#endif __DEVICE__ bool isnan(double); __DEVICE__ bool isnan(float); __DEVICE__ bool isnormal(double); __DEVICE__ bool isnormal(float); __DEVICE__ bool isunordered(double, double); __DEVICE__ bool isunordered(float, float); -__DEVICE__ long labs(long); +__DEVICE__ long labs(long) __NOEXCEPT; __DEVICE__ double ldexp(double, int); __DEVICE__ float ldexp(float, int); __DEVICE__ double lgamma(double); __DEVICE__ float lgamma(float); -__DEVICE__ long long llabs(long long); +__DEVICE__ long long llabs(long long) __NOEXCEPT; __DEVICE__ long long llrint(double); __DEVICE__ long long llrint(float); __DEVICE__ double log10(double); @@ -134,6 +152,9 @@ __DEVICE__ double log2(double); __DEVICE__ float log2(float); __DEVICE__ double logb(double); __DEVICE__ float logb(float); +#if defined(_OPENMP) && defined(__cplusplus) +__DEVICE__ long double log(long double); +#endif __DEVICE__ double log(double); __DEVICE__ float log(float); __DEVICE__ long lrint(double); @@ -281,6 +302,7 @@ 
_GLIBCXX_END_NAMESPACE_VERSION } // namespace std #endif +#undef __NOEXCEPT #pragma pop_macro("__DEVICE__") #endif diff --git a/lib/include/__clang_cuda_runtime_wrapper.h b/lib/include/__clang_cuda_runtime_wrapper.h index f05c0454a..3e362dd96 100644 --- a/lib/include/__clang_cuda_runtime_wrapper.h +++ b/lib/include/__clang_cuda_runtime_wrapper.h @@ -1,22 +1,8 @@ /*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -62,7 +48,7 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 10000 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 10010 #error "Unsupported CUDA version!" #endif @@ -426,5 +412,15 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const { #pragma pop_macro("__USE_FAST_MATH__") #pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__") +// CUDA runtime uses this undocumented function to access kernel launch +// configuration. The declaration is in crt/device_functions.h but that file +// includes a lot of other stuff we don't want. Instead, we'll provide our own +// declaration for it here. 
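Before the guarded declaration below, a word on how it is used: under the <<<grid, block, sharedMem, stream>>> launch syntax, the compiler-generated host code is expected to record the configuration via __cudaPushCallConfiguration before invoking the kernel's host stub, where the runtime retrieves it. A hedged sketch of that sequence; launch_scale and scale_kernel_stub are invented names for illustration, and the real lowering is the compiler's business, not this header's:

/* Assumes dim3 and the __cudaPushCallConfiguration declaration just below. */
extern void scale_kernel_stub(float *__data, unsigned __n); /* hypothetical stub */

static void launch_scale(float *__data, unsigned __n) {
  dim3 __grid((__n + 255) / 256); /* one block per 256 elements */
  dim3 __block(256);
  /* Record the launch configuration for the runtime to pick up inside the
     stub; sharedMem and stream fall back to their defaults (0). */
  __cudaPushCallConfiguration(__grid, __block);
  scale_kernel_stub(__data, __n); /* completes the actual launch */
}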
+#if CUDA_VERSION >= 9020 +extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim, + size_t sharedMem = 0, + void *stream = 0); +#endif + #endif // __CUDA__ #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__ diff --git a/lib/include/__stddef_max_align_t.h b/lib/include/__stddef_max_align_t.h index 1e10ca986..e3b439285 100644 --- a/lib/include/__stddef_max_align_t.h +++ b/lib/include/__stddef_max_align_t.h @@ -1,24 +1,8 @@ /*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---=== * - * Copyright (c) 2014 Chandler Carruth - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/__wmmintrin_aes.h b/lib/include/__wmmintrin_aes.h index 70c355efc..f540319c7 100644 --- a/lib/include/__wmmintrin_aes.h +++ b/lib/include/__wmmintrin_aes.h @@ -1,22 +1,8 @@ /*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/__wmmintrin_pclmul.h b/lib/include/__wmmintrin_pclmul.h index e0f928796..fef4b93db 100644 --- a/lib/include/__wmmintrin_pclmul.h +++ b/lib/include/__wmmintrin_pclmul.h @@ -1,22 +1,8 @@ /*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/adxintrin.h b/lib/include/adxintrin.h index d6c454db8..72b9ed08f 100644 --- a/lib/include/adxintrin.h +++ b/lib/include/adxintrin.h @@ -1,22 +1,8 @@ /*===---- adxintrin.h - ADX intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/altivec.h b/lib/include/altivec.h index 2dc6adb90..4008440b2 100644 --- a/lib/include/altivec.h +++ b/lib/include/altivec.h @@ -1,22 +1,8 @@ /*===---- altivec.h - Standard header for type generic math ---------------===*\ * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ diff --git a/lib/include/ammintrin.h b/lib/include/ammintrin.h index 680b4465e..3806be6eb 100644 --- a/lib/include/ammintrin.h +++ b/lib/include/ammintrin.h @@ -1,22 +1,8 @@ /*===---- ammintrin.h - SSE4a intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/arm64intr.h b/lib/include/arm64intr.h index be5228361..4943b2db6 100644 --- a/lib/include/arm64intr.h +++ b/lib/include/arm64intr.h @@ -1,22 +1,8 @@ /*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/arm_acle.h b/lib/include/arm_acle.h index ab2589798..096cc261a 100644 --- a/lib/include/arm_acle.h +++ b/lib/include/arm_acle.h @@ -1,22 +1,8 @@ /*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -611,6 +597,14 @@ __crc32cd(uint32_t __a, uint64_t __b) { } #endif +/* Armv8.3-A Javascript conversion intrinsic */ +#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_JCVT) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__jcvt(double __a) { + return __builtin_arm_jcvt(__a); +} +#endif + /* 10.1 Special register intrinsics */ #define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg) #define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg) @@ -619,6 +613,16 @@ __crc32cd(uint32_t __a, uint64_t __b) { #define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v) #define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v) +// Memory Tagging Extensions (MTE) Intrinsics +#if __ARM_FEATURE_MEMORY_TAGGING +#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask) +#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset) +#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded) +#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr) +#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr) +#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb) +#endif + #if defined(__cplusplus) } #endif diff --git a/lib/include/arm_neon.h b/lib/include/arm_neon.h index d6765b36d..694bdfc9c 100644 --- a/lib/include/arm_neon.h +++ b/lib/include/arm_neon.h @@ -44247,13 +44247,13 @@ __ai float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) #endif #if defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ -__ai float32x4_t vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlalq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; } #else -__ai float32x4_t vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); @@ -44262,7 +44262,7 @@ __ai float32x4_t vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_ __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } -__ai float32x4_t __noswap_vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t __noswap_vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlalq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; @@ -44270,13 +44270,13 @@ __ai float32x4_t __noswap_vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, f #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x2_t vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlal_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; } #else -__ai float32x2_t vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, 
float16x4_t __p2) { +__ai float32x2_t vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); @@ -44285,7 +44285,7 @@ __ai float32x2_t vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai float32x2_t __noswap_vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t __noswap_vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlal_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; @@ -44293,13 +44293,13 @@ __ai float32x2_t __noswap_vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, fl #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x4_t vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlalq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; } #else -__ai float32x4_t vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); @@ -44308,7 +44308,7 @@ __ai float32x4_t vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } -__ai float32x4_t __noswap_vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t __noswap_vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlalq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; @@ -44316,13 +44316,13 @@ __ai float32x4_t __noswap_vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, fl #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x2_t vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlal_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; } #else -__ai float32x2_t vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); @@ -44331,7 +44331,7 @@ __ai float32x2_t vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai float32x2_t __noswap_vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t __noswap_vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) 
__builtin_neon_vfmlal_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; @@ -44339,13 +44339,13 @@ __ai float32x2_t __noswap_vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, flo #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x4_t vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlslq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; } #else -__ai float32x4_t vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); @@ -44354,7 +44354,7 @@ __ai float32x4_t vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_ __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } -__ai float32x4_t __noswap_vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t __noswap_vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlslq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; @@ -44362,13 +44362,13 @@ __ai float32x4_t __noswap_vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, f #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x2_t vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlsl_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; } #else -__ai float32x2_t vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); @@ -44377,7 +44377,7 @@ __ai float32x2_t vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai float32x2_t __noswap_vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t __noswap_vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlsl_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; @@ -44385,13 +44385,13 @@ __ai float32x2_t __noswap_vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, fl #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x4_t vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlslq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; } #else -__ai float32x4_t vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { 
float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); @@ -44400,7 +44400,7 @@ __ai float32x4_t vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } -__ai float32x4_t __noswap_vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t __noswap_vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlslq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; @@ -44408,13 +44408,13 @@ __ai float32x4_t __noswap_vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, fl #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x2_t vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlsl_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; } #else -__ai float32x2_t vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); @@ -44423,7 +44423,7 @@ __ai float32x2_t vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai float32x2_t __noswap_vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t __noswap_vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlsl_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; @@ -64095,15 +64095,15 @@ __ai uint8x16_t vqtbl1q_u8(uint8x16_t __p0, uint8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, uint8x16_t __p1) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 32); return __ret; } #else -__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, uint8x16_t __p1) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64129,15 +64129,15 @@ __ai uint8x8_t vqtbl1_u8(uint8x16_t __p0, uint8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbl1_s8(int8x16_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl1_s8(int8x16_t __p0, uint8x8_t __p1) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 0); return __ret; } #else 
-__ai int8x8_t vqtbl1_s8(int8x16_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl1_s8(int8x16_t __p0, uint8x8_t __p1) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64203,17 +64203,17 @@ __ai uint8x16_t vqtbl2q_u8(uint8x16x2_t __p0, uint8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, uint8x16_t __p1) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 32); return __ret; } #else -__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, uint8x16_t __p1) { int8x16x2_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64241,17 +64241,17 @@ __ai uint8x8_t vqtbl2_u8(uint8x16x2_t __p0, uint8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, uint8x8_t __p1) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 0); return __ret; } #else -__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, uint8x8_t __p1) { int8x16x2_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64320,18 +64320,18 @@ __ai uint8x16_t vqtbl3q_u8(uint8x16x3_t __p0, uint8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, uint8x16_t __p1) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 32); return __ret; } #else -__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, 
uint8x16_t __p1) { int8x16x3_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64360,18 +64360,18 @@ __ai uint8x8_t vqtbl3_u8(uint8x16x3_t __p0, uint8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, uint8x8_t __p1) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 0); return __ret; } #else -__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, uint8x8_t __p1) { int8x16x3_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64443,19 +64443,19 @@ __ai uint8x16_t vqtbl4q_u8(uint8x16x4_t __p0, uint8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, uint8x16_t __p1) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 32); return __ret; } #else -__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, uint8x16_t __p1) { int8x16x4_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 
0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64485,19 +64485,19 @@ __ai uint8x8_t vqtbl4_u8(uint8x16x4_t __p0, uint8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, uint8x8_t __p1) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 0); return __ret; } #else -__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, uint8x8_t __p1) { int8x16x4_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64560,16 +64560,16 @@ __ai uint8x16_t vqtbx1q_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, uint8x16_t __p2) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 32); return __ret; } #else -__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, uint8x16_t __p2) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64596,16 +64596,16 @@ __ai uint8x8_t vqtbx1_u8(uint8x8_t __p0, uint8x16_t __p1, uint8x8_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, uint8x8_t __p2) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 0); return __ret; } #else -__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, 
int8x8_t __p2) { +__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, uint8x8_t __p2) { int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64674,18 +64674,18 @@ __ai uint8x16_t vqtbx2q_u8(uint8x16_t __p0, uint8x16x2_t __p1, uint8x16_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, uint8x16_t __p2) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 32); return __ret; } #else -__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, uint8x16_t __p2) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x2_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64714,18 +64714,18 @@ __ai uint8x8_t vqtbx2_u8(uint8x8_t __p0, uint8x16x2_t __p1, uint8x8_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, uint8x8_t __p2) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 0); return __ret; } #else -__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, uint8x8_t __p2) { int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x2_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64797,19 +64797,19 @@ __ai 
uint8x16_t vqtbx3q_u8(uint8x16_t __p0, uint8x16x3_t __p1, uint8x16_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, uint8x16_t __p2) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 32); return __ret; } #else -__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, uint8x16_t __p2) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x3_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64839,19 +64839,19 @@ __ai uint8x8_t vqtbx3_u8(uint8x8_t __p0, uint8x16x3_t __p1, uint8x8_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, uint8x8_t __p2) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 0); return __ret; } #else -__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, uint8x8_t __p2) { int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x3_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64926,20 +64926,20 @@ __ai uint8x16_t vqtbx4q_u8(uint8x16_t __p0, uint8x16x4_t __p1, uint8x16_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, uint8x16_t __p2) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, 
(int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 32); return __ret; } #else -__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, uint8x16_t __p2) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x4_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64970,20 +64970,20 @@ __ai uint8x8_t vqtbx4_u8(uint8x8_t __p0, uint8x16x4_t __p1, uint8x8_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, uint8x8_t __p2) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 0); return __ret; } #else -__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, uint8x8_t __p2) { int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x4_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -66293,13 +66293,13 @@ __ai int64_t vshld_s64(int64_t __p0, int64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) { +__ai uint8_t vsqaddb_u8(uint8_t __p0, int8_t __p1) { uint8_t __ret; __ret = (uint8_t) __builtin_neon_vsqaddb_u8(__p0, __p1); return __ret; } #else -__ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) { +__ai uint8_t 
vsqaddb_u8(uint8_t __p0, int8_t __p1) { uint8_t __ret; __ret = (uint8_t) __builtin_neon_vsqaddb_u8(__p0, __p1); return __ret; @@ -66307,13 +66307,13 @@ __ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) { +__ai uint32_t vsqadds_u32(uint32_t __p0, int32_t __p1) { uint32_t __ret; __ret = (uint32_t) __builtin_neon_vsqadds_u32(__p0, __p1); return __ret; } #else -__ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) { +__ai uint32_t vsqadds_u32(uint32_t __p0, int32_t __p1) { uint32_t __ret; __ret = (uint32_t) __builtin_neon_vsqadds_u32(__p0, __p1); return __ret; @@ -66321,13 +66321,13 @@ __ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vsqaddd_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vsqaddd_u64(__p0, __p1); return __ret; } #else -__ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vsqaddd_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vsqaddd_u64(__p0, __p1); return __ret; @@ -66335,13 +66335,13 @@ __ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) { +__ai uint16_t vsqaddh_u16(uint16_t __p0, int16_t __p1) { uint16_t __ret; __ret = (uint16_t) __builtin_neon_vsqaddh_u16(__p0, __p1); return __ret; } #else -__ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) { +__ai uint16_t vsqaddh_u16(uint16_t __p0, int16_t __p1) { uint16_t __ret; __ret = (uint16_t) __builtin_neon_vsqaddh_u16(__p0, __p1); return __ret; @@ -66349,15 +66349,15 @@ __ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) { +__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, int8x16_t __p1) { uint8x16_t __ret; __ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48); return __ret; } #else -__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) { +__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, int8x16_t __p1) { uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); uint8x16_t __ret; __ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -66366,15 +66366,15 @@ __ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) { +__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, int32x4_t __p1) { uint32x4_t __ret; __ret = (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50); return __ret; } #else -__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) { +__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, int32x4_t __p1) { uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + int32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); uint32x4_t __ret; __ret 
= (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50); __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); @@ -66383,15 +66383,15 @@ __ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) { +__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, int64x2_t __p1) { uint64x2_t __ret; __ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51); return __ret; } #else -__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) { +__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, int64x2_t __p1) { uint64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - uint64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + int64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); uint64x2_t __ret; __ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51); __ret = __builtin_shufflevector(__ret, __ret, 1, 0); @@ -66400,15 +66400,15 @@ __ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) { +__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, int16x8_t __p1) { uint16x8_t __ret; __ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49); return __ret; } #else -__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) { +__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, int16x8_t __p1) { uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - uint16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); uint16x8_t __ret; __ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -66417,15 +66417,15 @@ __ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) { +__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, int8x8_t __p1) { uint8x8_t __ret; __ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16); return __ret; } #else -__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) { +__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, int8x8_t __p1) { uint8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); uint8x8_t __ret; __ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -66434,15 +66434,15 @@ __ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) { +__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, int32x2_t __p1) { uint32x2_t __ret; __ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18); return __ret; } #else -__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) { +__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, int32x2_t __p1) { uint32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - uint32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + int32x2_t __rev1; 
__rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); uint32x2_t __ret; __ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18); __ret = __builtin_shufflevector(__ret, __ret, 1, 0); @@ -66451,13 +66451,13 @@ __ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) { +__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, int64x1_t __p1) { uint64x1_t __ret; __ret = (uint64x1_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19); return __ret; } #else -__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) { +__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, int64x1_t __p1) { uint64x1_t __ret; __ret = (uint64x1_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19); return __ret; @@ -66465,15 +66465,15 @@ __ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, uint16x4_t __p1) { +__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, int16x4_t __p1) { uint16x4_t __ret; __ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17); return __ret; } #else -__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, uint16x4_t __p1) { +__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, int16x4_t __p1) { uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + int16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); uint16x4_t __ret; __ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17); __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); @@ -68919,13 +68919,13 @@ __ai int64_t vtstd_s64(int64_t __p0, int64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) { +__ai int8_t vuqaddb_s8(int8_t __p0, uint8_t __p1) { int8_t __ret; __ret = (int8_t) __builtin_neon_vuqaddb_s8(__p0, __p1); return __ret; } #else -__ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) { +__ai int8_t vuqaddb_s8(int8_t __p0, uint8_t __p1) { int8_t __ret; __ret = (int8_t) __builtin_neon_vuqaddb_s8(__p0, __p1); return __ret; @@ -68933,13 +68933,13 @@ __ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) { +__ai int32_t vuqadds_s32(int32_t __p0, uint32_t __p1) { int32_t __ret; __ret = (int32_t) __builtin_neon_vuqadds_s32(__p0, __p1); return __ret; } #else -__ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) { +__ai int32_t vuqadds_s32(int32_t __p0, uint32_t __p1) { int32_t __ret; __ret = (int32_t) __builtin_neon_vuqadds_s32(__p0, __p1); return __ret; @@ -68947,13 +68947,13 @@ __ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) { +__ai int64_t vuqaddd_s64(int64_t __p0, uint64_t __p1) { int64_t __ret; __ret = (int64_t) __builtin_neon_vuqaddd_s64(__p0, __p1); return __ret; } #else -__ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) { +__ai int64_t vuqaddd_s64(int64_t __p0, uint64_t __p1) { int64_t __ret; __ret = (int64_t) __builtin_neon_vuqaddd_s64(__p0, __p1); return __ret; @@ -68961,13 +68961,13 @@ __ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) { +__ai int16_t vuqaddh_s16(int16_t __p0, uint16_t __p1) { int16_t __ret; __ret = (int16_t) __builtin_neon_vuqaddh_s16(__p0, __p1); return 
__ret; } #else -__ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) { +__ai int16_t vuqaddh_s16(int16_t __p0, uint16_t __p1) { int16_t __ret; __ret = (int16_t) __builtin_neon_vuqaddh_s16(__p0, __p1); return __ret; @@ -68975,15 +68975,15 @@ __ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) { +__ai int8x16_t vuqaddq_s8(int8x16_t __p0, uint8x16_t __p1) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32); return __ret; } #else -__ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) { +__ai int8x16_t vuqaddq_s8(int8x16_t __p0, uint8x16_t __p1) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -68992,15 +68992,15 @@ __ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) { +__ai int32x4_t vuqaddq_s32(int32x4_t __p0, uint32x4_t __p1) { int32x4_t __ret; __ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34); return __ret; } #else -__ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) { +__ai int32x4_t vuqaddq_s32(int32x4_t __p0, uint32x4_t __p1) { int32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - int32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); int32x4_t __ret; __ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34); __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); @@ -69009,15 +69009,15 @@ __ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) { +__ai int64x2_t vuqaddq_s64(int64x2_t __p0, uint64x2_t __p1) { int64x2_t __ret; __ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35); return __ret; } #else -__ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) { +__ai int64x2_t vuqaddq_s64(int64x2_t __p0, uint64x2_t __p1) { int64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - int64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + uint64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); int64x2_t __ret; __ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35); __ret = __builtin_shufflevector(__ret, __ret, 1, 0); @@ -69026,15 +69026,15 @@ __ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) { +__ai int16x8_t vuqaddq_s16(int16x8_t __p0, uint16x8_t __p1) { int16x8_t __ret; __ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33); return __ret; } #else -__ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) { +__ai int16x8_t vuqaddq_s16(int16x8_t __p0, uint16x8_t __p1) { int16x8_t __rev0; __rev0 = 
__builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - int16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int16x8_t __ret; __ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -69043,15 +69043,15 @@ __ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) { +__ai int8x8_t vuqadd_s8(int8x8_t __p0, uint8x8_t __p1) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0); return __ret; } #else -__ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) { +__ai int8x8_t vuqadd_s8(int8x8_t __p0, uint8x8_t __p1) { int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -69060,15 +69060,15 @@ __ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) { +__ai int32x2_t vuqadd_s32(int32x2_t __p0, uint32x2_t __p1) { int32x2_t __ret; __ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2); return __ret; } #else -__ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) { +__ai int32x2_t vuqadd_s32(int32x2_t __p0, uint32x2_t __p1) { int32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - int32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + uint32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); int32x2_t __ret; __ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2); __ret = __builtin_shufflevector(__ret, __ret, 1, 0); @@ -69077,13 +69077,13 @@ __ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) { +__ai int64x1_t vuqadd_s64(int64x1_t __p0, uint64x1_t __p1) { int64x1_t __ret; __ret = (int64x1_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3); return __ret; } #else -__ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) { +__ai int64x1_t vuqadd_s64(int64x1_t __p0, uint64x1_t __p1) { int64x1_t __ret; __ret = (int64x1_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3); return __ret; @@ -69091,15 +69091,15 @@ __ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int16x4_t vuqadd_s16(int16x4_t __p0, int16x4_t __p1) { +__ai int16x4_t vuqadd_s16(int16x4_t __p0, uint16x4_t __p1) { int16x4_t __ret; __ret = (int16x4_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1); return __ret; } #else -__ai int16x4_t vuqadd_s16(int16x4_t __p0, int16x4_t __p1) { +__ai int16x4_t vuqadd_s16(int16x4_t __p0, uint16x4_t __p1) { int16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - int16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); int16x4_t __ret; __ret = (int16x4_t) 
__builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1); __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); @@ -71912,16 +71912,16 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in #if defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_lane_high_u32(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \ +#define vfmlalq_lane_high_f16(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \ float32x4_t __s0_264 = __p0_264; \ float16x8_t __s1_264 = __p1_264; \ float16x4_t __s2_264 = __p2_264; \ float32x4_t __ret_264; \ - __ret_264 = vfmlalq_high_u32(__s0_264, __s1_264, (float16x8_t) {vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264)}); \ + __ret_264 = vfmlalq_high_f16(__s0_264, __s1_264, (float16x8_t) {vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264)}); \ __ret_264; \ }) #else -#define vfmlalq_lane_high_u32(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \ +#define vfmlalq_lane_high_f16(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \ float32x4_t __s0_265 = __p0_265; \ float16x8_t __s1_265 = __p1_265; \ float16x4_t __s2_265 = __p2_265; \ @@ -71929,23 +71929,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_265; __rev1_265 = __builtin_shufflevector(__s1_265, __s1_265, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_265; __rev2_265 = __builtin_shufflevector(__s2_265, __s2_265, 3, 2, 1, 0); \ float32x4_t __ret_265; \ - __ret_265 = __noswap_vfmlalq_high_u32(__rev0_265, __rev1_265, (float16x8_t) {__noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265)}); \ + __ret_265 = __noswap_vfmlalq_high_f16(__rev0_265, __rev1_265, (float16x8_t) {__noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265)}); \ __ret_265 = __builtin_shufflevector(__ret_265, __ret_265, 3, 2, 1, 0); \ __ret_265; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_lane_high_u32(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \ +#define vfmlal_lane_high_f16(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \ float32x2_t __s0_266 = __p0_266; \ float16x4_t __s1_266 = __p1_266; \ float16x4_t __s2_266 = __p2_266; \ float32x2_t __ret_266; \ - __ret_266 = vfmlal_high_u32(__s0_266, __s1_266, (float16x4_t) {vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266)}); \ + __ret_266 = vfmlal_high_f16(__s0_266, 
__s1_266, (float16x4_t) {vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266)}); \ __ret_266; \ }) #else -#define vfmlal_lane_high_u32(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \ +#define vfmlal_lane_high_f16(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \ float32x2_t __s0_267 = __p0_267; \ float16x4_t __s1_267 = __p1_267; \ float16x4_t __s2_267 = __p2_267; \ @@ -71953,23 +71953,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_267; __rev1_267 = __builtin_shufflevector(__s1_267, __s1_267, 3, 2, 1, 0); \ float16x4_t __rev2_267; __rev2_267 = __builtin_shufflevector(__s2_267, __s2_267, 3, 2, 1, 0); \ float32x2_t __ret_267; \ - __ret_267 = __noswap_vfmlal_high_u32(__rev0_267, __rev1_267, (float16x4_t) {__noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267)}); \ + __ret_267 = __noswap_vfmlal_high_f16(__rev0_267, __rev1_267, (float16x4_t) {__noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267)}); \ __ret_267 = __builtin_shufflevector(__ret_267, __ret_267, 1, 0); \ __ret_267; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_lane_low_u32(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \ +#define vfmlalq_lane_low_f16(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \ float32x4_t __s0_268 = __p0_268; \ float16x8_t __s1_268 = __p1_268; \ float16x4_t __s2_268 = __p2_268; \ float32x4_t __ret_268; \ - __ret_268 = vfmlalq_low_u32(__s0_268, __s1_268, (float16x8_t) {vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268)}); \ + __ret_268 = vfmlalq_low_f16(__s0_268, __s1_268, (float16x8_t) {vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268)}); \ __ret_268; \ }) #else -#define vfmlalq_lane_low_u32(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \ +#define vfmlalq_lane_low_f16(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \ float32x4_t __s0_269 = __p0_269; \ float16x8_t __s1_269 = __p1_269; \ float16x4_t __s2_269 = __p2_269; \ @@ -71977,23 +71977,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_269; __rev1_269 = __builtin_shufflevector(__s1_269, __s1_269, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_269; __rev2_269 = __builtin_shufflevector(__s2_269, __s2_269, 3, 2, 1, 0); \ float32x4_t __ret_269; \ - __ret_269 = __noswap_vfmlalq_low_u32(__rev0_269, __rev1_269, (float16x8_t) {__noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269)}); \ + __ret_269 
= __noswap_vfmlalq_low_f16(__rev0_269, __rev1_269, (float16x8_t) {__noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269)}); \ __ret_269 = __builtin_shufflevector(__ret_269, __ret_269, 3, 2, 1, 0); \ __ret_269; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_lane_low_u32(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \ +#define vfmlal_lane_low_f16(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \ float32x2_t __s0_270 = __p0_270; \ float16x4_t __s1_270 = __p1_270; \ float16x4_t __s2_270 = __p2_270; \ float32x2_t __ret_270; \ - __ret_270 = vfmlal_low_u32(__s0_270, __s1_270, (float16x4_t) {vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270)}); \ + __ret_270 = vfmlal_low_f16(__s0_270, __s1_270, (float16x4_t) {vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270)}); \ __ret_270; \ }) #else -#define vfmlal_lane_low_u32(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \ +#define vfmlal_lane_low_f16(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \ float32x2_t __s0_271 = __p0_271; \ float16x4_t __s1_271 = __p1_271; \ float16x4_t __s2_271 = __p2_271; \ @@ -72001,23 +72001,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_271; __rev1_271 = __builtin_shufflevector(__s1_271, __s1_271, 3, 2, 1, 0); \ float16x4_t __rev2_271; __rev2_271 = __builtin_shufflevector(__s2_271, __s2_271, 3, 2, 1, 0); \ float32x2_t __ret_271; \ - __ret_271 = __noswap_vfmlal_low_u32(__rev0_271, __rev1_271, (float16x4_t) {__noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271)}); \ + __ret_271 = __noswap_vfmlal_low_f16(__rev0_271, __rev1_271, (float16x4_t) {__noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271)}); \ __ret_271 = __builtin_shufflevector(__ret_271, __ret_271, 1, 0); \ __ret_271; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_laneq_high_u32(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \ +#define vfmlalq_laneq_high_f16(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \ float32x4_t __s0_272 = __p0_272; \ float16x8_t __s1_272 = __p1_272; \ float16x8_t __s2_272 = __p2_272; \ float32x4_t __ret_272; \ - __ret_272 = vfmlalq_high_u32(__s0_272, __s1_272, (float16x8_t) {vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272)}); \ + __ret_272 = vfmlalq_high_f16(__s0_272, __s1_272, (float16x8_t) {vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), 
vgetq_lane_f16(__s2_272, __p3_272)}); \ __ret_272; \ }) #else -#define vfmlalq_laneq_high_u32(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \ +#define vfmlalq_laneq_high_f16(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \ float32x4_t __s0_273 = __p0_273; \ float16x8_t __s1_273 = __p1_273; \ float16x8_t __s2_273 = __p2_273; \ @@ -72025,23 +72025,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_273; __rev1_273 = __builtin_shufflevector(__s1_273, __s1_273, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_273; __rev2_273 = __builtin_shufflevector(__s2_273, __s2_273, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x4_t __ret_273; \ - __ret_273 = __noswap_vfmlalq_high_u32(__rev0_273, __rev1_273, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273)}); \ + __ret_273 = __noswap_vfmlalq_high_f16(__rev0_273, __rev1_273, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273)}); \ __ret_273 = __builtin_shufflevector(__ret_273, __ret_273, 3, 2, 1, 0); \ __ret_273; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_laneq_high_u32(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \ +#define vfmlal_laneq_high_f16(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \ float32x2_t __s0_274 = __p0_274; \ float16x4_t __s1_274 = __p1_274; \ float16x8_t __s2_274 = __p2_274; \ float32x2_t __ret_274; \ - __ret_274 = vfmlal_high_u32(__s0_274, __s1_274, (float16x4_t) {vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274)}); \ + __ret_274 = vfmlal_high_f16(__s0_274, __s1_274, (float16x4_t) {vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274)}); \ __ret_274; \ }) #else -#define vfmlal_laneq_high_u32(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \ +#define vfmlal_laneq_high_f16(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \ float32x2_t __s0_275 = __p0_275; \ float16x4_t __s1_275 = __p1_275; \ float16x8_t __s2_275 = __p2_275; \ @@ -72049,23 +72049,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_275; __rev1_275 = __builtin_shufflevector(__s1_275, __s1_275, 3, 2, 1, 0); \ float16x8_t __rev2_275; __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x2_t __ret_275; \ - __ret_275 = __noswap_vfmlal_high_u32(__rev0_275, __rev1_275, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275)}); \ + __ret_275 = __noswap_vfmlal_high_f16(__rev0_275, __rev1_275, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, 
__p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275)}); \ __ret_275 = __builtin_shufflevector(__ret_275, __ret_275, 1, 0); \ __ret_275; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_laneq_low_u32(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \ +#define vfmlalq_laneq_low_f16(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \ float32x4_t __s0_276 = __p0_276; \ float16x8_t __s1_276 = __p1_276; \ float16x8_t __s2_276 = __p2_276; \ float32x4_t __ret_276; \ - __ret_276 = vfmlalq_low_u32(__s0_276, __s1_276, (float16x8_t) {vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276)}); \ + __ret_276 = vfmlalq_low_f16(__s0_276, __s1_276, (float16x8_t) {vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276)}); \ __ret_276; \ }) #else -#define vfmlalq_laneq_low_u32(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \ +#define vfmlalq_laneq_low_f16(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \ float32x4_t __s0_277 = __p0_277; \ float16x8_t __s1_277 = __p1_277; \ float16x8_t __s2_277 = __p2_277; \ @@ -72073,23 +72073,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_277; __rev1_277 = __builtin_shufflevector(__s1_277, __s1_277, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_277; __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x4_t __ret_277; \ - __ret_277 = __noswap_vfmlalq_low_u32(__rev0_277, __rev1_277, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277)}); \ + __ret_277 = __noswap_vfmlalq_low_f16(__rev0_277, __rev1_277, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277)}); \ __ret_277 = __builtin_shufflevector(__ret_277, __ret_277, 3, 2, 1, 0); \ __ret_277; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_laneq_low_u32(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \ +#define vfmlal_laneq_low_f16(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \ float32x2_t __s0_278 = __p0_278; \ float16x4_t __s1_278 = __p1_278; \ float16x8_t __s2_278 = __p2_278; \ float32x2_t __ret_278; \ - __ret_278 = vfmlal_low_u32(__s0_278, __s1_278, (float16x4_t) {vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278)}); \ + __ret_278 = vfmlal_low_f16(__s0_278, __s1_278, (float16x4_t) {vgetq_lane_f16(__s2_278, 
__p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278)}); \ __ret_278; \ }) #else -#define vfmlal_laneq_low_u32(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \ +#define vfmlal_laneq_low_f16(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \ float32x2_t __s0_279 = __p0_279; \ float16x4_t __s1_279 = __p1_279; \ float16x8_t __s2_279 = __p2_279; \ @@ -72097,23 +72097,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_279; __rev1_279 = __builtin_shufflevector(__s1_279, __s1_279, 3, 2, 1, 0); \ float16x8_t __rev2_279; __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x2_t __ret_279; \ - __ret_279 = __noswap_vfmlal_low_u32(__rev0_279, __rev1_279, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279)}); \ + __ret_279 = __noswap_vfmlal_low_f16(__rev0_279, __rev1_279, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279)}); \ __ret_279 = __builtin_shufflevector(__ret_279, __ret_279, 1, 0); \ __ret_279; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_lane_high_u32(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \ +#define vfmlslq_lane_high_f16(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \ float32x4_t __s0_280 = __p0_280; \ float16x8_t __s1_280 = __p1_280; \ float16x4_t __s2_280 = __p2_280; \ float32x4_t __ret_280; \ - __ret_280 = vfmlslq_high_u32(__s0_280, __s1_280, (float16x8_t) {vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280)}); \ + __ret_280 = vfmlslq_high_f16(__s0_280, __s1_280, (float16x8_t) {vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280)}); \ __ret_280; \ }) #else -#define vfmlslq_lane_high_u32(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \ +#define vfmlslq_lane_high_f16(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \ float32x4_t __s0_281 = __p0_281; \ float16x8_t __s1_281 = __p1_281; \ float16x4_t __s2_281 = __p2_281; \ @@ -72121,23 +72121,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_281; __rev1_281 = __builtin_shufflevector(__s1_281, __s1_281, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_281; __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 3, 2, 1, 0); \ float32x4_t __ret_281; \ - __ret_281 = __noswap_vfmlslq_high_u32(__rev0_281, __rev1_281, (float16x8_t) {__noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281)}); \ + __ret_281 = 
__noswap_vfmlslq_high_f16(__rev0_281, __rev1_281, (float16x8_t) {__noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281)}); \ __ret_281 = __builtin_shufflevector(__ret_281, __ret_281, 3, 2, 1, 0); \ __ret_281; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_lane_high_u32(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \ +#define vfmlsl_lane_high_f16(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \ float32x2_t __s0_282 = __p0_282; \ float16x4_t __s1_282 = __p1_282; \ float16x4_t __s2_282 = __p2_282; \ float32x2_t __ret_282; \ - __ret_282 = vfmlsl_high_u32(__s0_282, __s1_282, (float16x4_t) {vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282)}); \ + __ret_282 = vfmlsl_high_f16(__s0_282, __s1_282, (float16x4_t) {vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282)}); \ __ret_282; \ }) #else -#define vfmlsl_lane_high_u32(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \ +#define vfmlsl_lane_high_f16(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \ float32x2_t __s0_283 = __p0_283; \ float16x4_t __s1_283 = __p1_283; \ float16x4_t __s2_283 = __p2_283; \ @@ -72145,23 +72145,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_283; __rev1_283 = __builtin_shufflevector(__s1_283, __s1_283, 3, 2, 1, 0); \ float16x4_t __rev2_283; __rev2_283 = __builtin_shufflevector(__s2_283, __s2_283, 3, 2, 1, 0); \ float32x2_t __ret_283; \ - __ret_283 = __noswap_vfmlsl_high_u32(__rev0_283, __rev1_283, (float16x4_t) {__noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283)}); \ + __ret_283 = __noswap_vfmlsl_high_f16(__rev0_283, __rev1_283, (float16x4_t) {__noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283)}); \ __ret_283 = __builtin_shufflevector(__ret_283, __ret_283, 1, 0); \ __ret_283; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_lane_low_u32(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \ +#define vfmlslq_lane_low_f16(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \ float32x4_t __s0_284 = __p0_284; \ float16x8_t __s1_284 = __p1_284; \ float16x4_t __s2_284 = __p2_284; \ float32x4_t __ret_284; \ - __ret_284 = vfmlslq_low_u32(__s0_284, __s1_284, (float16x8_t) {vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284)}); \ + __ret_284 = vfmlslq_low_f16(__s0_284, __s1_284, (float16x8_t) {vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, 
__p3_284)}); \ __ret_284; \ }) #else -#define vfmlslq_lane_low_u32(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \ +#define vfmlslq_lane_low_f16(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \ float32x4_t __s0_285 = __p0_285; \ float16x8_t __s1_285 = __p1_285; \ float16x4_t __s2_285 = __p2_285; \ @@ -72169,23 +72169,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_285; __rev1_285 = __builtin_shufflevector(__s1_285, __s1_285, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_285; __rev2_285 = __builtin_shufflevector(__s2_285, __s2_285, 3, 2, 1, 0); \ float32x4_t __ret_285; \ - __ret_285 = __noswap_vfmlslq_low_u32(__rev0_285, __rev1_285, (float16x8_t) {__noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285)}); \ + __ret_285 = __noswap_vfmlslq_low_f16(__rev0_285, __rev1_285, (float16x8_t) {__noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285)}); \ __ret_285 = __builtin_shufflevector(__ret_285, __ret_285, 3, 2, 1, 0); \ __ret_285; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_lane_low_u32(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \ +#define vfmlsl_lane_low_f16(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \ float32x2_t __s0_286 = __p0_286; \ float16x4_t __s1_286 = __p1_286; \ float16x4_t __s2_286 = __p2_286; \ float32x2_t __ret_286; \ - __ret_286 = vfmlsl_low_u32(__s0_286, __s1_286, (float16x4_t) {vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286)}); \ + __ret_286 = vfmlsl_low_f16(__s0_286, __s1_286, (float16x4_t) {vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286)}); \ __ret_286; \ }) #else -#define vfmlsl_lane_low_u32(__p0_287, __p1_287, __p2_287, __p3_287) __extension__ ({ \ +#define vfmlsl_lane_low_f16(__p0_287, __p1_287, __p2_287, __p3_287) __extension__ ({ \ float32x2_t __s0_287 = __p0_287; \ float16x4_t __s1_287 = __p1_287; \ float16x4_t __s2_287 = __p2_287; \ @@ -72193,23 +72193,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_287; __rev1_287 = __builtin_shufflevector(__s1_287, __s1_287, 3, 2, 1, 0); \ float16x4_t __rev2_287; __rev2_287 = __builtin_shufflevector(__s2_287, __s2_287, 3, 2, 1, 0); \ float32x2_t __ret_287; \ - __ret_287 = __noswap_vfmlsl_low_u32(__rev0_287, __rev1_287, (float16x4_t) {__noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287)}); \ + __ret_287 = __noswap_vfmlsl_low_f16(__rev0_287, __rev1_287, (float16x4_t) {__noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287)}); 
\ __ret_287 = __builtin_shufflevector(__ret_287, __ret_287, 1, 0); \ __ret_287; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_laneq_high_u32(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \ +#define vfmlslq_laneq_high_f16(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \ float32x4_t __s0_288 = __p0_288; \ float16x8_t __s1_288 = __p1_288; \ float16x8_t __s2_288 = __p2_288; \ float32x4_t __ret_288; \ - __ret_288 = vfmlslq_high_u32(__s0_288, __s1_288, (float16x8_t) {vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288)}); \ + __ret_288 = vfmlslq_high_f16(__s0_288, __s1_288, (float16x8_t) {vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288)}); \ __ret_288; \ }) #else -#define vfmlslq_laneq_high_u32(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \ +#define vfmlslq_laneq_high_f16(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \ float32x4_t __s0_289 = __p0_289; \ float16x8_t __s1_289 = __p1_289; \ float16x8_t __s2_289 = __p2_289; \ @@ -72217,23 +72217,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_289; __rev1_289 = __builtin_shufflevector(__s1_289, __s1_289, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_289; __rev2_289 = __builtin_shufflevector(__s2_289, __s2_289, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x4_t __ret_289; \ - __ret_289 = __noswap_vfmlslq_high_u32(__rev0_289, __rev1_289, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289)}); \ + __ret_289 = __noswap_vfmlslq_high_f16(__rev0_289, __rev1_289, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289)}); \ __ret_289 = __builtin_shufflevector(__ret_289, __ret_289, 3, 2, 1, 0); \ __ret_289; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_laneq_high_u32(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \ +#define vfmlsl_laneq_high_f16(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \ float32x2_t __s0_290 = __p0_290; \ float16x4_t __s1_290 = __p1_290; \ float16x8_t __s2_290 = __p2_290; \ float32x2_t __ret_290; \ - __ret_290 = vfmlsl_high_u32(__s0_290, __s1_290, (float16x4_t) {vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290)}); \ + __ret_290 = vfmlsl_high_f16(__s0_290, __s1_290, (float16x4_t) {vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), 
vgetq_lane_f16(__s2_290, __p3_290)}); \ __ret_290; \ }) #else -#define vfmlsl_laneq_high_u32(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \ +#define vfmlsl_laneq_high_f16(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \ float32x2_t __s0_291 = __p0_291; \ float16x4_t __s1_291 = __p1_291; \ float16x8_t __s2_291 = __p2_291; \ @@ -72241,23 +72241,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_291; __rev1_291 = __builtin_shufflevector(__s1_291, __s1_291, 3, 2, 1, 0); \ float16x8_t __rev2_291; __rev2_291 = __builtin_shufflevector(__s2_291, __s2_291, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x2_t __ret_291; \ - __ret_291 = __noswap_vfmlsl_high_u32(__rev0_291, __rev1_291, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291)}); \ + __ret_291 = __noswap_vfmlsl_high_f16(__rev0_291, __rev1_291, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291)}); \ __ret_291 = __builtin_shufflevector(__ret_291, __ret_291, 1, 0); \ __ret_291; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_laneq_low_u32(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \ +#define vfmlslq_laneq_low_f16(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \ float32x4_t __s0_292 = __p0_292; \ float16x8_t __s1_292 = __p1_292; \ float16x8_t __s2_292 = __p2_292; \ float32x4_t __ret_292; \ - __ret_292 = vfmlslq_low_u32(__s0_292, __s1_292, (float16x8_t) {vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292)}); \ + __ret_292 = vfmlslq_low_f16(__s0_292, __s1_292, (float16x8_t) {vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292)}); \ __ret_292; \ }) #else -#define vfmlslq_laneq_low_u32(__p0_293, __p1_293, __p2_293, __p3_293) __extension__ ({ \ +#define vfmlslq_laneq_low_f16(__p0_293, __p1_293, __p2_293, __p3_293) __extension__ ({ \ float32x4_t __s0_293 = __p0_293; \ float16x8_t __s1_293 = __p1_293; \ float16x8_t __s2_293 = __p2_293; \ @@ -72265,23 +72265,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_293; __rev1_293 = __builtin_shufflevector(__s1_293, __s1_293, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_293; __rev2_293 = __builtin_shufflevector(__s2_293, __s2_293, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x4_t __ret_293; \ - __ret_293 = __noswap_vfmlslq_low_u32(__rev0_293, __rev1_293, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293)}); \ + __ret_293 = __noswap_vfmlslq_low_f16(__rev0_293, __rev1_293, (float16x8_t) 
{__noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293)}); \ __ret_293 = __builtin_shufflevector(__ret_293, __ret_293, 3, 2, 1, 0); \ __ret_293; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_laneq_low_u32(__p0_294, __p1_294, __p2_294, __p3_294) __extension__ ({ \ +#define vfmlsl_laneq_low_f16(__p0_294, __p1_294, __p2_294, __p3_294) __extension__ ({ \ float32x2_t __s0_294 = __p0_294; \ float16x4_t __s1_294 = __p1_294; \ float16x8_t __s2_294 = __p2_294; \ float32x2_t __ret_294; \ - __ret_294 = vfmlsl_low_u32(__s0_294, __s1_294, (float16x4_t) {vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294)}); \ + __ret_294 = vfmlsl_low_f16(__s0_294, __s1_294, (float16x4_t) {vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294)}); \ __ret_294; \ }) #else -#define vfmlsl_laneq_low_u32(__p0_295, __p1_295, __p2_295, __p3_295) __extension__ ({ \ +#define vfmlsl_laneq_low_f16(__p0_295, __p1_295, __p2_295, __p3_295) __extension__ ({ \ float32x2_t __s0_295 = __p0_295; \ float16x4_t __s1_295 = __p1_295; \ float16x8_t __s2_295 = __p2_295; \ @@ -72289,7 +72289,7 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_295; __rev1_295 = __builtin_shufflevector(__s1_295, __s1_295, 3, 2, 1, 0); \ float16x8_t __rev2_295; __rev2_295 = __builtin_shufflevector(__s2_295, __s2_295, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x2_t __ret_295; \ - __ret_295 = __noswap_vfmlsl_low_u32(__rev0_295, __rev1_295, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295)}); \ + __ret_295 = __noswap_vfmlsl_low_f16(__rev0_295, __rev1_295, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295)}); \ __ret_295 = __builtin_shufflevector(__ret_295, __ret_295, 1, 0); \ __ret_295; \ }) diff --git a/lib/include/armintr.h b/lib/include/armintr.h index 933afcbb9..300ed4ee4 100644 --- a/lib/include/armintr.h +++ b/lib/include/armintr.h @@ -1,22 +1,8 @@ /*===---- armintr.h - ARM Windows intrinsics -------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
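[The arm_neon.h hunks above rename the FP16 multiply-subtract-long lane macros from the misspelled `_u32` suffixes to `_f16`, so the suffix matches the float16 element type the intrinsics actually take; the big-endian variants reverse the input lanes, broadcast the selected half-precision lane into a full vector, call the `__noswap_` base intrinsic, and reverse the result back. A minimal usage sketch, assuming a compiler targeting armv8.2-a with the fp16fml extension; the wrapper function name is illustrative, not part of the header:

#include <arm_neon.h>

/* acc[i] -= (float)a[i] * (float)b[1], for i = 0,1: multiply-subtract-long
   from the low half of a, with the second operand broadcast from lane 1. */
float32x2_t fmlsl_low_by_lane1(float32x2_t acc, float16x4_t a, float16x4_t b) {
  return vfmlsl_lane_low_f16(acc, a, b, 1);
}
]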
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx2intrin.h b/lib/include/avx2intrin.h index 9688a96fd..162e83ea2 100644 --- a/lib/include/avx2intrin.h +++ b/lib/include/avx2intrin.h @@ -1,22 +1,8 @@ /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -146,21 +132,13 @@ _mm256_andnot_si256(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8(__m256i __a, __m256i __b) { - typedef unsigned short __v32hu __attribute__((__vector_size__(64))); - return (__m256i)__builtin_convertvector( - ((__builtin_convertvector((__v32qu)__a, __v32hu) + - __builtin_convertvector((__v32qu)__b, __v32hu)) + 1) - >> 1, __v32qu); + return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16(__m256i __a, __m256i __b) { - typedef unsigned int __v16su __attribute__((__vector_size__(64))); - return (__m256i)__builtin_convertvector( - ((__builtin_convertvector((__v16hu)__a, __v16su) + - __builtin_convertvector((__v16hu)__b, __v16su)) + 1) - >> 1, __v16hu); + return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 diff --git a/lib/include/avx512bf16intrin.h b/lib/include/avx512bf16intrin.h new file mode 100644 index 000000000..d1d87e72f --- /dev/null +++ b/lib/include/avx512bf16intrin.h @@ -0,0 +1,279 @@ +/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512BF16INTRIN_H +#define __AVX512BF16INTRIN_H + +typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64))); +typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32))); +typedef unsigned short __bfloat16; + +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \ + __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"))) + +/// Convert One BF16 Data to One Single Float Data. +/// +/// \headerfile +/// +/// This intrinsic does not correspond to a specific instruction. +/// +/// \param __A +/// A bfloat data. +/// \returns A float data whose sign field and exponent field keep unchanged, +/// and fraction field is extended to 23 bits. +static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) { + return __builtin_ia32_cvtsbf162ss_32(__A); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __B +/// A 512-bit vector of [16 x float]. +/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from +/// conversion of __B, and higher 256 bits come from conversion of __A. +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) { + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A, + (__v16sf) __B); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __B +/// A 512-bit vector of [16 x float]. 
+/// \param __W +/// A 512-bit vector of [32 x bfloat]. +/// \param __U +/// A 32-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element from __W. +/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from +/// conversion of __B, and higher 256 bits come from conversion of __A. +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) { + return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtne2ps_pbh(__A, __B), + (__v32hi)__W); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __B +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 32-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element is zero. +/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from +/// conversion of __B, and higher 256 bits come from conversion of __A. +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) { + return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtne2ps_pbh(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_cvtneps_pbh(__m512 __A) { + return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, + (__v16hi)_mm256_undefined_si256(), + (__mmask16)-1); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __W +/// A 256-bit vector of [16 x bfloat]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element from __W. +/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) { + return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, + (__v16hi)__W, + (__mmask16)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element is zero. +/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) { + return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. 
+/// +/// \param __A +/// A 512-bit vector of [32 x bfloat]. +/// \param __B +/// A 512-bit vector of [32 x bfloat]. +/// \param __D +/// A 512-bit vector of [16 x float]. +/// \returns A 512-bit vector of [16 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) { + return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D, + (__v16si) __A, + (__v16si) __B); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 512-bit vector of [32 x bfloat]. +/// \param __B +/// A 512-bit vector of [32 x bfloat]. +/// \param __D +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. +/// \returns A 512-bit vector of [16 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), + (__v16sf)__D); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 512-bit vector of [32 x bfloat]. +/// \param __B +/// A 512-bit vector of [32 x bfloat]. +/// \param __D +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. +/// \returns A 512-bit vector of [16 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), + (__v16sf)_mm512_setzero_si512()); +} + +/// Convert Packed BF16 Data to Packed float Data. +/// +/// \headerfile +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \returns A 512-bit vector of [16 x float] come from convertion of __A +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { + return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( + (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using zeroing mask. +/// +/// \headerfile +/// +/// \param __U +/// A 16-bit mask. Elements are zeroed out when the corresponding mask +/// bit is not set. +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \returns A 512-bit vector of [16 x float] come from convertion of __A +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { + return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( + (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using merging mask. +/// +/// \headerfile +/// +/// \param __S +/// A 512-bit vector of [16 x float]. Elements are copied from __S when +/// the corresponding mask bit is not set. +/// \param __U +/// A 16-bit mask. +/// \param __A +/// A 256-bit vector of [16 x bfloat]. 
+/// \returns A 512-bit vector of [16 x float] come from convertion of __A +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) { + return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32( + (__m512i)__S, (__mmask16)__U, + (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); +} + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS512 + +#endif diff --git a/lib/include/avx512bitalgintrin.h b/lib/include/avx512bitalgintrin.h index 56046f8c4..d4411d156 100644 --- a/lib/include/avx512bitalgintrin.h +++ b/lib/include/avx512bitalgintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512bwintrin.h b/lib/include/avx512bwintrin.h index a90a25537..cb2e07619 100644 --- a/lib/include/avx512bwintrin.h +++ b/lib/include/avx512bwintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
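[The new avx512bf16intrin.h, which ends above, leans on the fact that bfloat16 is simply the high 16 bits of an IEEE-754 binary32: `_mm512_cvtpbh_ps` widens each 16-bit element to 32 bits and shifts it left by 16, so the extension bits fall away, the low mantissa bits are zero, and the result is a valid float bit pattern. A scalar model of the same widening, as a hedged sketch; the helper name is illustrative:

#include <stdint.h>
#include <string.h>

/* Widen one bfloat16 bit pattern to float: append 16 zero bits.
   This widening is exact, since bf16 shares fp32's 8-bit exponent. */
static float bf16_to_float(uint16_t b) {
  uint32_t bits = (uint32_t)b << 16;
  float f;
  memcpy(&f, &bits, sizeof f); /* bit-cast without violating aliasing rules */
  return f;
}
]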
+ * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -719,11 +705,7 @@ _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_avg_epu8 (__m512i __A, __m512i __B) { - typedef unsigned short __v64hu __attribute__((__vector_size__(128))); - return (__m512i)__builtin_convertvector( - ((__builtin_convertvector((__v64qu) __A, __v64hu) + - __builtin_convertvector((__v64qu) __B, __v64hu)) + 1) - >> 1, __v64qu); + return (__m512i)__builtin_ia32_pavgb512((__v64qi)__A, (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -746,11 +728,7 @@ _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_avg_epu16 (__m512i __A, __m512i __B) { - typedef unsigned int __v32su __attribute__((__vector_size__(128))); - return (__m512i)__builtin_convertvector( - ((__builtin_convertvector((__v32hu) __A, __v32su) + - __builtin_convertvector((__v32hu) __B, __v32su)) + 1) - >> 1, __v32hu); + return (__m512i)__builtin_ia32_pavgw512((__v32hi)__A, (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1733,14 +1711,14 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A) (__v64qi) _mm512_setzero_si512()); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 +static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, (__mmask64) __B); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 +static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_kunpackw (__mmask32 __A, __mmask32 __B) { return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, @@ -1751,7 +1729,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_loadu_epi16 (void const *__P) { struct __loadu_epi16 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi16*)__P)->__v; } @@ -1777,7 +1755,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_loadu_epi8 (void const *__P) { struct __loadu_epi8 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi8*)__P)->__v; } @@ -1803,7 +1781,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_epi16 (void *__P, __m512i __A) { struct __storeu_epi16 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi16*)__P)->__v = __A; } @@ -1820,7 +1798,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_epi8 (void *__P, __m512i __A) { struct __storeu_epi8 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi8*)__P)->__v = __A; } diff --git a/lib/include/avx512cdintrin.h b/lib/include/avx512cdintrin.h index e63902743..bfdba84aa 100644 --- a/lib/include/avx512cdintrin.h +++ b/lib/include/avx512cdintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, 
subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -34,49 +20,45 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_conflict_epi64 (__m512i __A) { - return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_conflict_epi64(__A), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_conflict_epi64(__A), + (__v8di)_mm512_setzero_si512 ()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_conflict_epi32 (__m512i __A) { - return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_conflict_epi32(__A), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_conflict_epi32(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/lib/include/avx512dqintrin.h b/lib/include/avx512dqintrin.h index 6e6c293af..337256c50 100644 --- a/lib/include/avx512dqintrin.h +++ b/lib/include/avx512dqintrin.h @@ -1,22 +1,8 @@ /*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512erintrin.h b/lib/include/avx512erintrin.h index 6348275c8..857006169 100644 --- a/lib/include/avx512erintrin.h +++ b/lib/include/avx512erintrin.h @@ -1,22 +1,8 @@ /*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512fintrin.h b/lib/include/avx512fintrin.h index 1c19993ff..132761f9e 100644 --- a/lib/include/avx512fintrin.h +++ b/lib/include/avx512fintrin.h @@ -1,22 +1,8 @@ /*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -40,9 +26,13 @@ typedef unsigned short __v32hu __attribute__((__vector_size__(64))); typedef unsigned long long __v8du __attribute__((__vector_size__(64))); typedef unsigned int __v16su __attribute__((__vector_size__(64))); -typedef float __m512 __attribute__((__vector_size__(64))); -typedef double __m512d __attribute__((__vector_size__(64))); -typedef long long __m512i __attribute__((__vector_size__(64))); +typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64))); +typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64))); +typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64))); + +typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1))); +typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1))); +typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1))); typedef unsigned char __mmask8; typedef unsigned short __mmask16; @@ -1991,12 +1981,12 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_add_round_pd(W, U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_add_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)); + (__v8df)(__m512d)(W)) #define _mm512_maskz_add_round_pd(U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_add_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()); + (__v8df)_mm512_setzero_pd()) #define _mm512_add_round_ps(A, B, R) \ (__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ @@ -2005,12 +1995,12 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_add_round_ps(W, U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 
(__v16sf)_mm512_add_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)); + (__v16sf)(__m512)(W)) #define _mm512_maskz_add_round_ps(U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()); + (__v16sf)_mm512_setzero_ps()) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2106,12 +2096,12 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_sub_round_pd(W, U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)); + (__v8df)(__m512d)(W)) #define _mm512_maskz_sub_round_pd(U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()); + (__v8df)_mm512_setzero_pd()) #define _mm512_sub_round_ps(A, B, R) \ (__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ @@ -2120,12 +2110,12 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_sub_round_ps(W, U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)); + (__v16sf)(__m512)(W)) #define _mm512_maskz_sub_round_ps(U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()); + (__v16sf)_mm512_setzero_ps()) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2221,12 +2211,12 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_mul_round_pd(W, U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)); + (__v8df)(__m512d)(W)) #define _mm512_maskz_mul_round_pd(U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()); + (__v8df)_mm512_setzero_pd()) #define _mm512_mul_round_ps(A, B, R) \ (__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ @@ -2235,12 +2225,12 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_mul_round_ps(W, U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)); + (__v16sf)(__m512)(W)) #define _mm512_maskz_mul_round_ps(U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()); + (__v16sf)_mm512_setzero_ps()) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2349,12 +2339,12 @@ _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_div_round_pd(W, U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_div_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)); + (__v8df)(__m512d)(W)) #define _mm512_maskz_div_round_pd(U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_div_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()); + (__v8df)_mm512_setzero_pd()) #define _mm512_div_round_ps(A, B, R) \ (__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ @@ -2363,12 +2353,12 @@ _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_div_round_ps(W, U, A, B, R) \ 
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)); + (__v16sf)(__m512)(W)) #define _mm512_maskz_div_round_ps(U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()); + (__v16sf)_mm512_setzero_ps()) #define _mm512_roundscale_ps(A, B) \ (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ @@ -3789,20 +3779,9 @@ _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) (__v16hi)_mm256_setzero_si256(), \ (__mmask16)(W)) -#define _mm512_cvtps_ph(A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)-1) - -#define _mm512_mask_cvtps_ph(U, W, A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)(__m256i)(U), \ - (__mmask16)(W)) - -#define _mm512_maskz_cvtps_ph(W, A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(W)) +#define _mm512_cvtps_ph _mm512_cvt_roundps_ph +#define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph +#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph #define _mm512_cvt_roundph_ps(A, R) \ (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ @@ -4324,7 +4303,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_loadu_si512 (void const *__P) { struct __loadu_si512 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_si512*)__P)->__v; } @@ -4333,7 +4312,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_loadu_epi32 (void const *__P) { struct __loadu_epi32 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi32*)__P)->__v; } @@ -4360,7 +4339,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_loadu_epi64 (void const *__P) { struct __loadu_epi64 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi64*)__P)->__v; } @@ -4420,7 +4399,7 @@ static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_loadu_pd(void const *__p) { struct __loadu_pd { - __m512d __v; + __m512d_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_pd*)__p)->__v; } @@ -4429,7 +4408,7 @@ static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_loadu_ps(void const *__p) { struct __loadu_ps { - __m512 __v; + __m512_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_ps*)__p)->__v; } @@ -4504,7 +4483,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_epi64 (void *__P, __m512i __A) { struct __storeu_epi64 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi64*)__P)->__v = __A; } @@ -4520,7 +4499,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_si512 (void *__P, __m512i __A) { struct __storeu_si512 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_si512*)__P)->__v = __A; } @@ -4529,7 +4508,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_epi32 (void *__P, __m512i __A) { struct __storeu_epi32 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi32*)__P)->__v = __A; } @@ -4551,7 +4530,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_pd(void *__P, __m512d __A) { struct __storeu_pd { - __m512d __v; + __m512d_u 
__v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_pd*)__P)->__v = __A; } @@ -4567,7 +4546,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_ps(void *__P, __m512 __A) { struct __storeu_ps { - __m512 __v; + __m512_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_ps*)__P)->__v = __A; } @@ -9329,7 +9308,7 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) __v2du __t6 = __t4 op __t5; \ __v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ __v2du __t8 = __t6 op __t7; \ - return __t8[0]; + return __t8[0] static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) { _mm512_mask_reduce_operator(+); @@ -9381,7 +9360,7 @@ _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { __m128d __t6 = __t4 op __t5; \ __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ __m128d __t8 = __t6 op __t7; \ - return __t8[0]; + return __t8[0] static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) { _mm512_mask_reduce_operator(+); @@ -9415,7 +9394,7 @@ _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { __v4su __t8 = __t6 op __t7; \ __v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ __v4su __t10 = __t8 op __t9; \ - return __t10[0]; + return __t10[0] static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi32(__m512i __W) { @@ -9473,7 +9452,7 @@ _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { __m128 __t8 = __t6 op __t7; \ __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ __m128 __t10 = __t8 op __t9; \ - return __t10[0]; + return __t10[0] static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_add_ps(__m512 __W) { @@ -9505,7 +9484,7 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { __m512i __t4 = _mm512_##op(__t2, __t3); \ __m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \ __v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \ - return __t6[0]; + return __t6[0] static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epi64(__m512i __V) { @@ -9563,7 +9542,7 @@ _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { __m128i __t8 = _mm_##op(__t6, __t7); \ __m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \ __v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \ - return __t10[0]; + return __t10[0] static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epi32(__m512i __V) { @@ -9619,7 +9598,7 @@ _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { __m128d __t6 = _mm_##op(__t4, __t5); \ __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ __m128d __t8 = _mm_##op(__t6, __t7); \ - return __t8[0]; + return __t8[0] static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_max_pd(__m512d __V) { @@ -9655,7 +9634,7 @@ _mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { __m128 __t8 = _mm_##op(__t6, __t7); \ __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ __m128 __t10 = _mm_##op(__t8, __t9); \ - return __t10[0]; + return __t10[0] static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_max_ps(__m512 __V) { diff --git a/lib/include/avx512ifmaintrin.h b/lib/include/avx512ifmaintrin.h index 159713049..5f7da52f1 100644 --- a/lib/include/avx512ifmaintrin.h +++ b/lib/include/avx512ifmaintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation 
files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512ifmavlintrin.h b/lib/include/avx512ifmavlintrin.h index afdea888c..5889401d1 100644 --- a/lib/include/avx512ifmavlintrin.h +++ b/lib/include/avx512ifmavlintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512pfintrin.h b/lib/include/avx512pfintrin.h index 73b2234fb..b8bcf49c6 100644 --- a/lib/include/avx512pfintrin.h +++ b/lib/include/avx512pfintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512pfintrin.h - PF intrinsics ------------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vbmi2intrin.h b/lib/include/avx512vbmi2intrin.h index 532425242..a23144616 100644 --- a/lib/include/avx512vbmi2intrin.h +++ b/lib/include/avx512vbmi2intrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vbmiintrin.h b/lib/include/avx512vbmiintrin.h index 5463d9015..c0e0f94d4 100644 --- a/lib/include/avx512vbmiintrin.h +++ b/lib/include/avx512vbmiintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vbmivlintrin.h b/lib/include/avx512vbmivlintrin.h index b5d5aa9af..c5b96ae8a 100644 --- a/lib/include/avx512vbmivlintrin.h +++ b/lib/include/avx512vbmivlintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vlbf16intrin.h b/lib/include/avx512vlbf16intrin.h new file mode 100644 index 000000000..1b1a744bc --- /dev/null +++ b/lib/include/avx512vlbf16intrin.h @@ -0,0 +1,474 @@ +/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VLBF16INTRIN_H +#define __AVX512VLBF16INTRIN_H + +typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16))); + +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl, avx512bf16"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl, avx512bf16"), __min_vector_width__(256))) + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __B +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __B, and higher 64 bits come from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) { + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A, + (__v4sf) __B); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __B +/// A 128-bit vector of [4 x float]. +/// \param __W +/// A 128-bit vector of [8 x bfloat]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element from __W. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __B, and higher 64 bits come from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtne2ps_pbh(__A, __B), + (__v8hi)__W); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __B +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element is zero. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __B, and higher 64 bits come from conversion of __A. 
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtne2ps_pbh(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __B +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from +/// conversion of __B, and higher 128 bits come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) { + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A, + (__v8sf) __B); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __B +/// A 256-bit vector of [8 x float]. +/// \param __W +/// A 256-bit vector of [16 x bfloat]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element from __W. +/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from +/// conversion of __B, and higher 128 bits come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) { + return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), + (__v16hi)__W); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __B +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element is zero. +/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from +/// conversion of __B, and higher 128 bits come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) { + return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __A, and higher 64 bits are 0. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_cvtneps_pbh(__m128 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, + (__v8hi)_mm_undefined_si128(), + (__mmask8)-1); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __W +/// A 128-bit vector of [8 x bfloat]. +/// \param __U +/// A 4-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element from __W. 
+/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __A, and higher 64 bits are 0. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, + (__v8hi)__W, + (__mmask8)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 4-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element is zero. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __A, and higher 64 bits are 0. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, + (__v8hi)_mm_setzero_si128(), + (__mmask8)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_cvtneps_pbh(__m256 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, + (__v8hi)_mm_undefined_si128(), + (__mmask8)-1); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __W +/// A 256-bit vector of [8 x bfloat]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element from __W. +/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, + (__v8hi)__W, + (__mmask8)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element is zero. +/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, + (__v8hi)_mm_setzero_si128(), + (__mmask8)__U); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \param __B +/// A 128-bit vector of [8 x bfloat]. +/// \param __D +/// A 128-bit vector of [4 x float]. 
+/// \returns A 128-bit vector of [4 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) { + return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D, + (__v4si)__A, + (__v4si)__B); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \param __B +/// A 128-bit vector of [8 x bfloat]. +/// \param __D +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. +/// \returns A 128-bit vector of [4 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_dpbf16_ps(__D, __A, __B), + (__v4sf)__D); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \param __B +/// A 128-bit vector of [8 x bfloat]. +/// \param __D +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. +/// \returns A 128-bit vector of [4 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_dpbf16_ps(__D, __A, __B), + (__v4sf)_mm_setzero_si128()); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \param __B +/// A 256-bit vector of [16 x bfloat]. +/// \param __D +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) { + return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D, + (__v8si)__A, + (__v8si)__B); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \param __B +/// A 256-bit vector of [16 x bfloat]. +/// \param __D +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. +/// \returns A 256-bit vector of [8 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), + (__v8sf)__D); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \param __B +/// A 256-bit vector of [16 x bfloat]. +/// \param __D +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. +/// \returns A 256-bit vector of [8 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), + (__v8sf)_mm256_setzero_si256()); +} + +/// Convert One Single float Data to One BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A float data. +/// \returns A bf16 data whose sign field and exponent field keep unchanged, +/// and fraction field is truncated to 7 bits. +static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) { + __v4sf __V = {__A, 0, 0, 0}; + __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask( + (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); + return __R[0]; +} + +/// Convert Packed BF16 Data to Packed float Data. +/// +/// \headerfile +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \returns A 256-bit vector of [8 x float] come from convertion of __A +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { + return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( + (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using zeroing mask. +/// +/// \headerfile +/// +/// \param __U +/// A 8-bit mask. Elements are zeroed out when the corresponding mask +/// bit is not set. +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \returns A 256-bit vector of [8 x float] come from convertion of __A +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { + return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( + (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using merging mask. +/// +/// \headerfile +/// +/// \param __S +/// A 256-bit vector of [8 x float]. Elements are copied from __S when +/// the corresponding mask bit is not set. +/// \param __U +/// A 8-bit mask. Elements are zeroed out when the corresponding mask +/// bit is not set. +/// \param __A +/// A 128-bit vector of [8 x bfloat]. 
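/* Two portable reference points for the helpers above, with invented names.
   First: a bfloat16 is the upper half of an IEEE-754 single, so widening it
   back to float (what _mm256_cvtpbh_ps does with its 16-bit left shift) is
   exact and lossless. */

#include <stdint.h>
#include <string.h>

static float bf16_to_float(uint16_t h) {
  uint32_t bits = (uint32_t)h << 16; /* exact widening, no rounding */
  float f;
  memcpy(&f, &bits, sizeof f);
  return f;
}

/* Second: a scalar model of the documented VDPBF16PS behavior for one
   128-bit vector -- float lane i accumulates the products of the bf16 pair
   sitting in 32-bit lane i of each source. A sketch only; the hardware's
   intermediate precision and denormal handling may differ. */
static void dpbf16_ps_ref(float d[4], const uint16_t a[8],
                          const uint16_t b[8]) {
  for (int i = 0; i < 4; ++i)
    d[i] += bf16_to_float(a[2 * i])     * bf16_to_float(b[2 * i]) +
            bf16_to_float(a[2 * i + 1]) * bf16_to_float(b[2 * i + 1]);
}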
+/// \returns A 256-bit vector of [8 x float] come from convertion of __A +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) { + return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32( + (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), + 16)); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/lib/include/avx512vlbitalgintrin.h b/lib/include/avx512vlbitalgintrin.h index 64860b292..5154eae14 100644 --- a/lib/include/avx512vlbitalgintrin.h +++ b/lib/include/avx512vlbitalgintrin.h @@ -1,23 +1,9 @@ /*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vlbwintrin.h b/lib/include/avx512vlbwintrin.h index 87e0023e8..ead09466b 100644 --- a/lib/include/avx512vlbwintrin.h +++ b/lib/include/avx512vlbwintrin.h @@ -1,22 +1,8 @@ /*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -2301,7 +2287,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128 _mm_loadu_epi16 (void const *__P) { struct __loadu_epi16 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi16*)__P)->__v; } @@ -2327,7 +2313,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256 _mm256_loadu_epi16 (void const *__P) { struct __loadu_epi16 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi16*)__P)->__v; } @@ -2353,7 +2339,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128 _mm_loadu_epi8 (void const *__P) { struct __loadu_epi8 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi8*)__P)->__v; } @@ -2379,7 +2365,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256 _mm256_loadu_epi8 (void const *__P) { struct __loadu_epi8 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi8*)__P)->__v; } @@ -2405,7 +2391,7 @@ static __inline void __DEFAULT_FN_ATTRS128 _mm_storeu_epi16 (void *__P, __m128i __A) { struct __storeu_epi16 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi16*)__P)->__v = __A; } @@ -2422,7 +2408,7 @@ static __inline void __DEFAULT_FN_ATTRS256 _mm256_storeu_epi16 (void *__P, __m256i __A) { struct __storeu_epi16 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi16*)__P)->__v = __A; } @@ -2439,7 +2425,7 @@ static __inline void __DEFAULT_FN_ATTRS128 _mm_storeu_epi8 (void *__P, __m128i __A) { struct __storeu_epi8 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi8*)__P)->__v = __A; } @@ -2456,7 +2442,7 @@ static __inline void __DEFAULT_FN_ATTRS256 _mm256_storeu_epi8 (void *__P, __m256i __A) { struct __storeu_epi8 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi8*)__P)->__v = __A; } diff --git a/lib/include/avx512vlcdintrin.h b/lib/include/avx512vlcdintrin.h index 903a7c254..cc8b72528 100644 --- a/lib/include/avx512vlcdintrin.h +++ b/lib/include/avx512vlcdintrin.h @@ -1,22 +1,8 @@ /*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
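The __loadu_epi16/__loadu_epi8 hunks above switch the wrapper struct's member
from the 16-byte-aligned __m128i to its 1-byte-aligned counterpart __m128i_u,
so the member type itself carries the unaligned contract rather than relying
on the packed attribute alone. The idiom in isolation, as a sketch with
invented names (any compiler with GNU vector extensions):

#include <string.h>

typedef long long v2di   __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long v2di_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Same shape as the hunks above: a packed, may_alias struct holding the
   1-byte-aligned vector type makes the unaligned load well-defined. */
static v2di loadu_v2di(const void *p) {
  struct wrapper {
    v2di_u v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct wrapper *)p)->v;
}

/* Semantic equivalent: copy into a local and return it. */
static v2di loadu_v2di_memcpy(const void *p) {
  v2di r;
  memcpy(&r, p, sizeof r);
  return r;
}

Either form lets the compiler emit a plain unaligned vector load with no
alignment or aliasing undefined behavior from the pointer cast.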
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -60,99 +46,89 @@ _mm256_broadcastmw_epi32 (__mmask16 __A) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_conflict_epi64 (__m128i __A) { - return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, - (__v2di) _mm_undefined_si128 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_conflict_epi64(__A), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_conflict_epi64(__A), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_conflict_epi64 (__m256i __A) { - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, - (__v4di) _mm256_undefined_si256 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_conflict_epi64(__A), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, - (__v4di) _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_conflict_epi64(__A), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_conflict_epi32 (__m128i __A) { - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, - (__v4si) _mm_undefined_si128 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_conflict_epi32(__A), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, - (__v4si) _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_conflict_epi32(__A), + (__v4si)_mm_setzero_si128()); } static 
__inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_conflict_epi32 (__m256i __A) { - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, - (__v8si) _mm256_undefined_si256 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_conflict_epi32(__A), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_conflict_epi32(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 diff --git a/lib/include/avx512vldqintrin.h b/lib/include/avx512vldqintrin.h index 9d13846e8..95ba574ea 100644 --- a/lib/include/avx512vldqintrin.h +++ b/lib/include/avx512vldqintrin.h @@ -1,22 +1,8 @@ /*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
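The conflict rewrites above follow a pattern used throughout this update:
implement only the unmasked builtin, then derive the mask/maskz forms with a
generic per-element select. For background, a scalar model of what
VPCONFLICTD itself computes -- an assumption drawn from the instruction's
published semantics, not something spelled out in this patch:

#include <stdint.h>

/* Each element receives a bitmask of the lower-indexed elements that hold
   the same value. */
static void conflict_epi32_ref(uint32_t dst[4], const uint32_t a[4]) {
  for (int i = 0; i < 4; ++i) {
    uint32_t m = 0;
    for (int j = 0; j < i; ++j)
      if (a[j] == a[i])
        m |= 1u << j;
    dst[i] = m;
  }
}

/* The masked forms then reduce to:
     mask:  dst[i] = bit i of U ? result[i] : w[i]
     maskz: dst[i] = bit i of U ? result[i] : 0     */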
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -523,23 +509,21 @@ _mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) { static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_cvtepi64_ps (__m256i __A) { - return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) -1); + return (__m128)__builtin_convertvector((__v4di)__A, __v4sf); } static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) { - return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepi64_ps(__A), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) { - return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepi64_ps(__A), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -771,23 +755,21 @@ _mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) { static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_cvtepu64_ps (__m256i __A) { - return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) -1); + return (__m128)__builtin_convertvector((__v4du)__A, __v4sf); } static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) { - return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepu64_ps(__A), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) { - return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepu64_ps(__A), + (__v4sf)_mm_setzero_ps()); } #define _mm_range_pd(A, B, C) \ diff --git a/lib/include/avx512vlintrin.h b/lib/include/avx512vlintrin.h index a2cdc0a96..9494fc8a6 100644 --- a/lib/include/avx512vlintrin.h +++ b/lib/include/avx512vlintrin.h @@ -1,22 +1,8 @@ /*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
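_mm256_cvtepi64_ps and _mm256_cvtepu64_ps above drop their dedicated mask
builtins in favor of Clang's generic __builtin_convertvector (also available
in GCC 9+), which converts element-wise between two vector types with the
same number of elements. The builtin in isolation, with invented typedef
names:

typedef long long          v4di __attribute__((__vector_size__(32)));
typedef unsigned long long v4du __attribute__((__vector_size__(32)));
typedef float              v4sf __attribute__((__vector_size__(16)));

static v4sf cvt_i64_to_f32(v4di x) {
  /* Element-wise int64 -> float; the compiler picks the instructions. */
  return __builtin_convertvector(x, v4sf);
}

static v4sf cvt_u64_to_f32(v4du x) {
  /* Signedness comes from the source vector type, which is why the
     cvtepu64_ps hunk above casts through __v4du first. */
  return __builtin_convertvector(x, v4sf);
}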
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -5513,7 +5499,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128 _mm_loadu_epi64 (void const *__P) { struct __loadu_epi64 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi64*)__P)->__v; } @@ -5539,7 +5525,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256 _mm256_loadu_epi64 (void const *__P) { struct __loadu_epi64 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi64*)__P)->__v; } @@ -5565,7 +5551,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128 _mm_loadu_epi32 (void const *__P) { struct __loadu_epi32 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi32*)__P)->__v; } @@ -5591,7 +5577,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256 _mm256_loadu_epi32 (void const *__P) { struct __loadu_epi32 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi32*)__P)->__v; } @@ -5717,7 +5703,7 @@ static __inline void __DEFAULT_FN_ATTRS128 _mm_storeu_epi64 (void *__P, __m128i __A) { struct __storeu_epi64 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi64*)__P)->__v = __A; } @@ -5734,7 +5720,7 @@ static __inline void __DEFAULT_FN_ATTRS256 _mm256_storeu_epi64 (void *__P, __m256i __A) { struct __storeu_epi64 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi64*)__P)->__v = __A; } @@ -5751,7 +5737,7 @@ static __inline void __DEFAULT_FN_ATTRS128 _mm_storeu_epi32 (void *__P, __m128i __A) { struct __storeu_epi32 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi32*)__P)->__v = __A; } @@ -5768,7 +5754,7 @@ static __inline void __DEFAULT_FN_ATTRS256 _mm256_storeu_epi32 (void *__P, __m256i __A) { struct __storeu_epi32 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi32*)__P)->__v = __A; } @@ -7000,7 +6986,7 @@ _mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsepi32_epi8 (__m256i __A) { return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, @@ -7023,7 +7009,7 @@ _mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS128 +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); @@ -7581,7 +7567,7 @@ _mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS256 +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { 
__builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); @@ -8425,22 +8411,6 @@ _mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A) -{ - return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION, - (__v8hi) __W, - (__mmask8) __U); -} - -static __inline __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A) -{ - return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __U); -} - #define _mm_mask_cvt_roundps_ph(W, U, A, I) \ (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ (__v8hi)(__m128i)(W), \ @@ -8451,21 +8421,9 @@ _mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A) (__v8hi)_mm_setzero_si128(), \ (__mmask8)(U)) -static __inline __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A) -{ - return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION, - (__v8hi) __W, - (__mmask8) __U); -} +#define _mm_mask_cvtps_ph _mm_mask_cvt_roundps_ph +#define _mm_maskz_cvtps_ph _mm_maskz_cvt_roundps_ph -static __inline __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtps_ph ( __mmask8 __U, __m256 __A) -{ - return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION, - (__v8hi) _mm_setzero_si128(), - (__mmask8) __U); -} #define _mm256_mask_cvt_roundps_ph(W, U, A, I) \ (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ (__v8hi)(__m128i)(W), \ @@ -8476,6 +8434,9 @@ _mm256_maskz_cvtps_ph ( __mmask8 __U, __m256 __A) (__v8hi)_mm_setzero_si128(), \ (__mmask8)(U)) +#define _mm256_mask_cvtps_ph _mm256_mask_cvt_roundps_ph +#define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph + #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/lib/include/avx512vlvbmi2intrin.h b/lib/include/avx512vlvbmi2intrin.h index 632d14fb5..a40f926de 100644 --- a/lib/include/avx512vlvbmi2intrin.h +++ b/lib/include/avx512vlvbmi2intrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
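The deleted _mm_mask_cvtps_ph/_mm256_mask_cvtps_ph inline bodies hard-coded
_MM_FROUND_CUR_DIRECTION; aliasing them to the cvt_roundps_ph macros instead
means the masked forms now take the same rounding-control immediate as the
unmasked F16C _mm_cvtps_ph. A hedged usage sketch, assuming the
four-argument form shown by the macros above and an AVX512VL-enabled build:

#include <immintrin.h>

__attribute__((__target__("avx512vl")))
static __m128i f32x4_to_f16x4_masked(__m128i w, __mmask8 u, __m128 a) {
  /* The rounding immediate is now explicit; _MM_FROUND_CUR_DIRECTION
     reproduces the behavior of the removed inline functions. */
  return _mm_mask_cvtps_ph(w, u, a, _MM_FROUND_CUR_DIRECTION);
}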
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vlvnniintrin.h b/lib/include/avx512vlvnniintrin.h index 62382268e..b7c8fa08c 100644 --- a/lib/include/avx512vlvnniintrin.h +++ b/lib/include/avx512vlvnniintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vlvp2intersectintrin.h b/lib/include/avx512vlvp2intersectintrin.h new file mode 100644 index 000000000..3e0815e5d --- /dev/null +++ b/lib/include/avx512vlvp2intersectintrin.h @@ -0,0 +1,121 @@ +/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
+#endif + +#ifndef _AVX512VLVP2INTERSECT_H +#define _AVX512VLVP2INTERSECT_H + +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \ + __min_vector_width__(128))) + +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \ + __min_vector_width__(256))) +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between dwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \param __b +/// A 256-bit vector of [8 x i32] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between quadwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTQ instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x i64]. +/// \param __b +/// A 256-bit vector of [4 x i64] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between dwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. +/// \param __b +/// A 128-bit vector of [4 x i32] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between quadwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTQ instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x i64]. 
+/// \param __b +/// A 128-bit vector of [2 x i64] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/lib/include/avx512vnniintrin.h b/lib/include/avx512vnniintrin.h index 620ef5a78..9935a119a 100644 --- a/lib/include/avx512vnniintrin.h +++ b/lib/include/avx512vnniintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vp2intersectintrin.h b/lib/include/avx512vp2intersectintrin.h new file mode 100644 index 000000000..5d3cb48cf --- /dev/null +++ b/lib/include/avx512vp2intersectintrin.h @@ -0,0 +1,77 @@ +/*===------- avx512vpintersectintrin.h - VP2INTERSECT intrinsics ------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
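The VP2INTERSECT intrinsics above return their result through two mask
pointers. A scalar model of the documented "value matches" behavior, stated
as an assumption (it reflects the instruction set's published semantics, not
anything spelled out in this header): bit i of *m0 marks a[i] matching some
element of b, and bit j of *m1 marks b[j] matching some element of a.

#include <stdint.h>

static void two_intersect_epi32_ref(const uint32_t a[4], const uint32_t b[4],
                                    uint8_t *m0, uint8_t *m1) {
  *m0 = *m1 = 0;
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j)
      if (a[i] == b[j]) {
        *m0 |= (uint8_t)(1u << i);
        *m1 |= (uint8_t)(1u << j);
      }
}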
+ * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512VP2INTERSECT_H +#define _AVX512VP2INTERSECT_H + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vp2intersect"), \ + __min_vector_width__(512))) + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between dwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTD instruction. +/// +/// \param __a +/// A 512-bit vector of [16 x i32]. +/// \param __b +/// A 512-bit vector of [16 x i32] +/// \param __m0 +/// A pointer point to 16-bit mask +/// \param __m1 +/// A pointer point to 16-bit mask +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_2intersect_epi32(__m512i __a, __m512i __b, __mmask16 *__m0, __mmask16 *__m1) { + __builtin_ia32_vp2intersect_d_512((__v16si)__a, (__v16si)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between quadwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTQ instruction. +/// +/// \param __a +/// A 512-bit vector of [8 x i64]. +/// \param __b +/// A 512-bit vector of [8 x i64] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_2intersect_epi64(__m512i __a, __m512i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_q_512((__v8di)__a, (__v8di)__b, __m0, __m1); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/lib/include/avx512vpopcntdqintrin.h b/lib/include/avx512vpopcntdqintrin.h index c99f59456..bb435e623 100644 --- a/lib/include/avx512vpopcntdqintrin.h +++ b/lib/include/avx512vpopcntdqintrin.h @@ -1,23 +1,9 @@ /*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
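A hedged usage sketch for the 512-bit form above -- it counts how many
dwords of a occur anywhere in b, and assumes a toolchain and CPU with the
avx512vp2intersect feature; only intrinsics declared in these headers are
used:

#include <immintrin.h>
#include <stdint.h>

__attribute__((__target__("avx512vp2intersect,avx512f")))
static int count_matches(const int32_t a[16], const int32_t b[16]) {
  __mmask16 m0, m1;
  _mm512_2intersect_epi32(_mm512_loadu_si512(a), _mm512_loadu_si512(b),
                          &m0, &m1);
  /* m0 has one bit per element of a that matched somewhere in b. */
  return __builtin_popcount((unsigned)m0);
}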
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vpopcntdqvlintrin.h b/lib/include/avx512vpopcntdqvlintrin.h index 681a75fa0..a3cb9b6bc 100644 --- a/lib/include/avx512vpopcntdqvlintrin.h +++ b/lib/include/avx512vpopcntdqvlintrin.h @@ -1,23 +1,9 @@ /*===---- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics -------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avxintrin.h b/lib/include/avxintrin.h index cb15396b3..a01240b9d 100644 --- a/lib/include/avxintrin.h +++ b/lib/include/avxintrin.h @@ -1,22 +1,8 @@ /*===---- avxintrin.h - AVX intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -45,9 +31,13 @@ typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); * appear in the interface though. */ typedef signed char __v32qs __attribute__((__vector_size__(32))); -typedef float __m256 __attribute__ ((__vector_size__ (32))); -typedef double __m256d __attribute__((__vector_size__(32))); -typedef long long __m256i __attribute__((__vector_size__(32))); +typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32))); +typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32))); +typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32))); + +typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1))); +typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1))); +typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1))); /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256))) @@ -3113,7 +3103,7 @@ static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p) { struct __loadu_pd { - __m256d __v; + __m256d_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_pd*)__p)->__v; } @@ -3133,7 +3123,7 @@ static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p) { struct __loadu_ps { - __m256 __v; + __m256_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_ps*)__p)->__v; } @@ -3166,10 +3156,10 @@ _mm256_load_si256(__m256i const *__p) /// A pointer to a 256-bit integer vector containing integer values. /// \returns A 256-bit integer vector containing the moved values. static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_loadu_si256(__m256i const *__p) +_mm256_loadu_si256(__m256i_u const *__p) { struct __loadu_si256 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_si256*)__p)->__v; } @@ -3246,7 +3236,7 @@ static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a) { struct __storeu_pd { - __m256d __v; + __m256d_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_pd*)__p)->__v = __a; } @@ -3266,7 +3256,7 @@ static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a) { struct __storeu_ps { - __m256 __v; + __m256_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_ps*)__p)->__v = __a; } @@ -3301,10 +3291,10 @@ _mm256_store_si256(__m256i *__p, __m256i __a) /// \param __a /// A 256-bit integer vector containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS -_mm256_storeu_si256(__m256i *__p, __m256i __a) +_mm256_storeu_si256(__m256i_u *__p, __m256i __a) { struct __storeu_si256 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_si256*)__p)->__v = __a; } @@ -4834,7 +4824,7 @@ _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) /// address of the memory location does not have to be aligned. /// \returns A 256-bit integer vector containing the concatenated result. 
static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) +_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo) { __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); @@ -4918,7 +4908,7 @@ _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) /// \param __a /// A 256-bit integer vector. static __inline void __DEFAULT_FN_ATTRS -_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) +_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a) { __m128i __v128; diff --git a/lib/include/bmi2intrin.h b/lib/include/bmi2intrin.h index fdae82cf2..0b56aed5f 100644 --- a/lib/include/bmi2intrin.h +++ b/lib/include/bmi2intrin.h @@ -1,22 +1,8 @@ /*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/bmiintrin.h b/lib/include/bmiintrin.h index 56c20b78d..b7af62f60 100644 --- a/lib/include/bmiintrin.h +++ b/lib/include/bmiintrin.h @@ -1,22 +1,8 @@ /*===---- bmiintrin.h - BMI intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
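The avxintrin.h hunks above introduce the split that the rest of this update
builds on: each 256-bit vector type now has a 32-byte-aligned flavor and a
1-byte-aligned _u flavor, and the unaligned entry points
(_mm256_loadu_si256, _mm256_storeu_si256, _mm256_loadu2_m128i, ...) take the
_u pointer, so handing them an arbitrary byte address no longer manufactures
a misaligned __m256i *. The effect of the attribute, with stand-in typedef
names:

typedef long long m256i_like   __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long m256i_u_like __attribute__((__vector_size__(32), __aligned__(1)));

/* Mirrors the typedefs introduced above: the _u flavor may legally point
   anywhere, which is what lets the loadu/storeu functions accept an
   arbitrary byte address without undefined behavior. */
_Static_assert(_Alignof(m256i_like) == 32, "aligned flavor");
_Static_assert(_Alignof(m256i_u_like) == 1, "unaligned flavor");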
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/cetintrin.h b/lib/include/cetintrin.h index 120c95424..4290e9d73 100644 --- a/lib/include/cetintrin.h +++ b/lib/include/cetintrin.h @@ -1,22 +1,8 @@ /*===---- cetintrin.h - CET intrinsic --------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/cldemoteintrin.h b/lib/include/cldemoteintrin.h index fa78148eb..2413e7dea 100644 --- a/lib/include/cldemoteintrin.h +++ b/lib/include/cldemoteintrin.h @@ -1,22 +1,8 @@ /*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/clflushoptintrin.h b/lib/include/clflushoptintrin.h index 79bb4589f..060eb36f3 100644 --- a/lib/include/clflushoptintrin.h +++ b/lib/include/clflushoptintrin.h @@ -1,22 +1,8 @@ /*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/clwbintrin.h b/lib/include/clwbintrin.h index c09286ba6..3360d203f 100644 --- a/lib/include/clwbintrin.h +++ b/lib/include/clwbintrin.h @@ -1,22 +1,8 @@ /*===---- clwbintrin.h - CLWB intrinsic ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/clzerointrin.h b/lib/include/clzerointrin.h index f4e920839..a180984a3 100644 --- a/lib/include/clzerointrin.h +++ b/lib/include/clzerointrin.h @@ -1,22 +1,8 @@ /*===----------------------- clzerointrin.h - CLZERO ----------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/cpuid.h b/lib/include/cpuid.h index fce6af52d..02ffac26c 100644 --- a/lib/include/cpuid.h +++ b/lib/include/cpuid.h @@ -1,22 +1,8 @@ /*===---- cpuid.h - X86 cpu model detection --------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -191,6 +177,7 @@ #define bit_CLDEMOTE 0x02000000 #define bit_MOVDIRI 0x08000000 #define bit_MOVDIR64B 0x10000000 +#define bit_ENQCMD 0x20000000 /* Features in %edx for leaf 7 sub-leaf 0 */ #define bit_AVX5124VNNIW 0x00000004 @@ -198,6 +185,9 @@ #define bit_PCONFIG 0x00040000 #define bit_IBT 0x00100000 +/* Features in %eax for leaf 7 sub-leaf 1 */ +#define bit_AVX512BF16 0x00000020 + /* Features in %eax for leaf 13 sub-leaf 1 */ #define bit_XSAVEOPT 0x00000001 #define bit_XSAVEC 0x00000002 diff --git a/lib/include/emmintrin.h b/lib/include/emmintrin.h index 6d61f9719..3d55f5f27 100644 --- a/lib/include/emmintrin.h +++ b/lib/include/emmintrin.h @@ -1,22 +1,8 @@ /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -26,8 +12,11 @@ #include <xmmintrin.h> -typedef double __m128d __attribute__((__vector_size__(16))); -typedef long long __m128i __attribute__((__vector_size__(16))); +typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); + +typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1))); +typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1))); /* Type defines.
*/ typedef double __v2df __attribute__ ((__vector_size__ (16))); @@ -1652,7 +1641,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) { struct __loadu_pd { - __m128d __v; + __m128d_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_pd*)__dp)->__v; } @@ -2042,7 +2031,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a) { struct __storeu_pd { - __m128d __v; + __m128d_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_pd*)__dp)->__v = __a; } @@ -2316,11 +2305,7 @@ _mm_adds_epu16(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b) { - typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); - return (__m128i)__builtin_convertvector( - ((__builtin_convertvector((__v16qu)__a, __v16hu) + - __builtin_convertvector((__v16qu)__b, __v16hu)) + 1) - >> 1, __v16qu); + return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); } /// Computes the rounded avarages of corresponding elements of two @@ -2340,11 +2325,7 @@ _mm_avg_epu8(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b) { - typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); - return (__m128i)__builtin_convertvector( - ((__builtin_convertvector((__v8hu)__a, __v8su) + - __builtin_convertvector((__v8hu)__b, __v8su)) + 1) - >> 1, __v8hu); + return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); } /// Multiplies the corresponding elements of two 128-bit signed [8 x i16] @@ -3564,10 +3545,10 @@ _mm_load_si128(__m128i const *__p) /// A pointer to a memory location containing integer values. /// \returns A 128-bit integer vector containing the moved values. static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_loadu_si128(__m128i const *__p) +_mm_loadu_si128(__m128i_u const *__p) { struct __loadu_si128 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_si128*)__p)->__v; } @@ -3585,7 +3566,7 @@ _mm_loadu_si128(__m128i const *__p) /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the /// moved value. The higher order bits are cleared. static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_loadl_epi64(__m128i const *__p) +_mm_loadl_epi64(__m128i_u const *__p) { struct __mm_loadl_epi64_struct { long long __u; @@ -4027,10 +4008,10 @@ _mm_store_si128(__m128i *__p, __m128i __b) /// \param __b /// A 128-bit integer vector containing the values to be moved. static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeu_si128(__m128i *__p, __m128i __b) +_mm_storeu_si128(__m128i_u *__p, __m128i __b) { struct __storeu_si128 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_si128*)__p)->__v = __b; } @@ -4139,7 +4120,7 @@ _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the /// value to be stored. 
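The _mm_avg_epu8/_mm_avg_epu16 hunks above drop the widen-add-shift emulation in favor of the pavgb/pavgw builtins; both forms compute the rounded average (a + b + 1) >> 1 in widened arithmetic, so the intermediate sum cannot wrap. A small self-contained check of that semantics (assumes SSE2, which is on by default for x86-64):

    #include <emmintrin.h>
    #include <string.h>
    #include <assert.h>

    int main(void)
    {
        /* avg(254, 255) = (254 + 255 + 1) >> 1 = 255; a plain 8-bit
         * addition would have wrapped before the shift. */
        __m128i r = _mm_avg_epu8(_mm_set1_epi8((char)254),
                                 _mm_set1_epi8((char)255));
        unsigned char out[16];
        memcpy(out, &r, sizeof out);
        assert(out[0] == 255);
        return 0;
    }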
static __inline__ void __DEFAULT_FN_ATTRS -_mm_storel_epi64(__m128i *__p, __m128i __a) +_mm_storel_epi64(__m128i_u *__p, __m128i __a) { struct __mm_storel_epi64_struct { long long __u; diff --git a/lib/include/enqcmdintrin.h b/lib/include/enqcmdintrin.h new file mode 100644 index 000000000..30af67f6b --- /dev/null +++ b/lib/include/enqcmdintrin.h @@ -0,0 +1,63 @@ +/*===------------------ enqcmdintrin.h - enqcmd intrinsics -----------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __ENQCMDINTRIN_H +#define __ENQCMDINTRIN_H + +/* Define the default attributes for the functions in this file */ +#define _DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("enqcmd"))) + +/// Reads 64-byte command pointed by \a __src, formats 64-byte enqueue store +/// data, and performs 64-byte enqueue store to memory pointed by \a __dst. +/// This intrinsics may only be used in User mode. +/// +/// \headerfile +/// +/// This intrinsics corresponds to the ENQCMD instruction. +/// +/// \param __dst +/// Pointer to the destination of the enqueue store. +/// \param __src +/// Pointer to 64-byte command data. +/// \returns If the command data is successfully written to \a __dst then 0 is +/// returned. Otherwise 1 is returned. +static __inline__ int _DEFAULT_FN_ATTRS +_enqcmd (void *__dst, const void *__src) +{ + return __builtin_ia32_enqcmd(__dst, __src); +} + +/// Reads 64-byte command pointed by \a __src, formats 64-byte enqueue store +/// data, and performs 64-byte enqueue store to memory pointed by \a __dst +/// This intrinsic may only be used in Privileged mode. +/// +/// \headerfile +/// +/// This intrinsics corresponds to the ENQCMDS instruction. +/// +/// \param __dst +/// Pointer to the destination of the enqueue store. +/// \param __src +/// Pointer to 64-byte command data. +/// \returns If the command data is successfully written to \a __dst then 0 is +/// returned. Otherwise 1 is returned. +static __inline__ int _DEFAULT_FN_ATTRS +_enqcmds (void *__dst, const void *__src) +{ + return __builtin_ia32_enqcmds(__dst, __src); +} + +#undef _DEFAULT_FN_ATTRS + +#endif /* __ENQCMDINTRIN_H */ diff --git a/lib/include/f16cintrin.h b/lib/include/f16cintrin.h index 3d35f28eb..109b604ad 100644 --- a/lib/include/f16cintrin.h +++ b/lib/include/f16cintrin.h @@ -1,22 +1,8 @@ /*===---- f16cintrin.h - F16C intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
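Together with the new bit_ENQCMD CPUID bit added to cpuid.h above, the new enqcmdintrin.h supports the usual detect-then-use pattern. A hypothetical sketch (the helper names and the portal/cmd64 parameters are ours; calling _enqcmd requires building with the enqcmd feature, e.g. -menqcmd):

    #include <cpuid.h>
    #include <immintrin.h>

    /* Nonzero if CPUID.(EAX=7,ECX=0):ECX advertises ENQCMD (bit 29). */
    static int have_enqcmd(void)
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
            return 0;
        return (ecx & bit_ENQCMD) != 0;
    }

    /* Submits a 64-byte command to a device portal; 0 means the device
     * accepted the command, 1 means it did not (see _enqcmd above). */
    static int submit(void *portal, const void *cmd64)
    {
        return _enqcmd(portal, cmd64);
    }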
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -52,9 +38,9 @@ static __inline float __DEFAULT_FN_ATTRS128 _cvtsh_ss(unsigned short __a) { - __v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0}; - __v4sf r = __builtin_ia32_vcvtph2ps(v); - return r[0]; + __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0}; + __v4sf __r = __builtin_ia32_vcvtph2ps(__v); + return __r[0]; } /// Converts a 32-bit single-precision float value to a 16-bit diff --git a/lib/include/float.h b/lib/include/float.h index 56215cd62..ed610b24a 100644 --- a/lib/include/float.h +++ b/lib/include/float.h @@ -1,22 +1,8 @@ /*===---- float.h - Characteristics of floating point types ----------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
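The _cvtsh_ss rewrite above only moves the locals into the reserved __-prefixed namespace (so a user macro named v or r cannot break the header); behavior is unchanged. For reference, a one-line use, assuming an F16C-enabled build (-mf16c):

    #include <immintrin.h>

    float one(void)
    {
        return _cvtsh_ss(0x3C00);   /* binary16 0x3C00 widens to 1.0f */
    }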
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -51,7 +37,7 @@ # undef FLT_MANT_DIG # undef DBL_MANT_DIG # undef LDBL_MANT_DIG -# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) +# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L # undef DECIMAL_DIG # endif # undef FLT_DIG @@ -78,7 +64,7 @@ # undef FLT_MIN # undef DBL_MIN # undef LDBL_MIN -# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) +# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L # undef FLT_TRUE_MIN # undef DBL_TRUE_MIN # undef LDBL_TRUE_MIN @@ -101,7 +87,7 @@ #define DBL_MANT_DIG __DBL_MANT_DIG__ #define LDBL_MANT_DIG __LDBL_MANT_DIG__ -#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) +#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L # define DECIMAL_DIG __DECIMAL_DIG__ #endif @@ -137,7 +123,7 @@ #define DBL_MIN __DBL_MIN__ #define LDBL_MIN __LDBL_MIN__ -#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) +#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L # define FLT_TRUE_MIN __FLT_DENORM_MIN__ # define DBL_TRUE_MIN __DBL_DENORM_MIN__ # define LDBL_TRUE_MIN __LDBL_DENORM_MIN__ diff --git a/lib/include/fma4intrin.h b/lib/include/fma4intrin.h index 7bae2f4a3..694801b3e 100644 --- a/lib/include/fma4intrin.h +++ b/lib/include/fma4intrin.h @@ -1,22 +1,8 @@ /*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
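The float.h hunks extend the existing C99/C11 gates so that C++11 also exposes DECIMAL_DIG and C++17 also exposes FLT_TRUE_MIN and friends, matching what those C++ standards inherit from the C library headers. A trivial probe, assuming a C11 (or C++17) translation unit:

    #include <float.h>

    /* Both now expand under -std=c++11 / -std=c++17 as well, not just
     * under C99/C11 or non-strict-ANSI modes. */
    double decimal_digits = DECIMAL_DIG;
    float smallest_subnormal = FLT_TRUE_MIN;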
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/fmaintrin.h b/lib/include/fmaintrin.h index 094d13afe..d889b7c5e 100644 --- a/lib/include/fmaintrin.h +++ b/lib/include/fmaintrin.h @@ -1,22 +1,8 @@ /*===---- fmaintrin.h - FMA intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/fxsrintrin.h b/lib/include/fxsrintrin.h index 704b5ad60..afee6aa97 100644 --- a/lib/include/fxsrintrin.h +++ b/lib/include/fxsrintrin.h @@ -1,22 +1,8 @@ /*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/gfniintrin.h b/lib/include/gfniintrin.h index 804d4f3d0..9bff0fcb6 100644 --- a/lib/include/gfniintrin.h +++ b/lib/include/gfniintrin.h @@ -1,23 +1,9 @@ /*===----------------- gfniintrin.h - GFNI intrinsics ----------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/htmintrin.h b/lib/include/htmintrin.h index 69c8d7bb5..49c2b9860 100644 --- a/lib/include/htmintrin.h +++ b/lib/include/htmintrin.h @@ -1,22 +1,8 @@ /*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\ * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ diff --git a/lib/include/htmxlintrin.h b/lib/include/htmxlintrin.h index 049dbd61d..6ef6f4b34 100644 --- a/lib/include/htmxlintrin.h +++ b/lib/include/htmxlintrin.h @@ -1,22 +1,8 @@ /*===---- htmxlintrin.h - XL compiler HTM execution intrinsics-------------===*\ * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ diff --git a/lib/include/ia32intrin.h b/lib/include/ia32intrin.h index f8972e305..8e38df731 100644 --- a/lib/include/ia32intrin.h +++ b/lib/include/ia32intrin.h @@ -1,22 +1,8 @@ /* ===-------- ia32intrin.h ---------------------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -28,6 +14,160 @@ #ifndef __IA32INTRIN_H #define __IA32INTRIN_H +/** Find the first set bit starting from the lsb. 
Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSF instruction or the + * TZCNT instruction. + * + * \param __A + * A 32-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__bsfd(int __A) { + return __builtin_ctz(__A); +} + +/** Find the first set bit starting from the msb. Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSR instruction or the + * LZCNT instruction and an XOR . + * + * \param __A + * A 32-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__bsrd(int __A) { + return 31 - __builtin_clz(__A); +} + +/** Swaps the bytes in the input. Converting little endian to big endian or + * vice versa. + * + * \headerfile + * + * This intrinsic corresponds to the BSWAP instruction. + * + * \param __A + * A 32-bit integer operand. + * \returns A 32-bit integer containing the swapped bytes. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__bswapd(int __A) { + return __builtin_bswap32(__A); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_bswap(int __A) { + return __builtin_bswap32(__A); +} + +#define _bit_scan_forward(A) __bsfd((A)) +#define _bit_scan_reverse(A) __bsrd((A)) + +#ifdef __x86_64__ +/** Find the first set bit starting from the lsb. Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSF instruction or the + * TZCNT instruction. + * + * \param __A + * A 64-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__bsfq(long long __A) { + return __builtin_ctzll(__A); +} + +/** Find the first set bit starting from the msb. Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSR instruction or the + * LZCNT instruction and an XOR . + * + * \param __A + * A 64-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__bsrq(long long __A) { + return 63 - __builtin_clzll(__A); +} + +/** Swaps the bytes in the input. Converting little endian to big endian or + * vice versa. + * + * \headerfile + * + * This intrinsic corresponds to the BSWAP instruction. + * + * \param __A + * A 64-bit integer operand. + * \returns A 64-bit integer containing the swapped bytes. + */ +static __inline__ long long __attribute__((__always_inline__, __nodebug__)) +__bswapq(long long __A) { + return __builtin_bswap64(__A); +} + +#define _bswap64(A) __bswapq((A)) +#endif + +/** Counts the number of bits in the source operand having a value of 1. + * + * \headerfile + * + * This intrinsic corresponds to the POPCNT instruction or a + * a sequence of arithmetic and logic ops to calculate it. + * + * \param __A + * An unsigned 32-bit integer operand. + * \returns A 32-bit integer containing the number of bits with value 1 in the + * source operand. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__popcntd(unsigned int __A) +{ + return __builtin_popcount(__A); +} + +#define _popcnt32(A) __popcntd((A)) + +#ifdef __x86_64__ +/** Counts the number of bits in the source operand having a value of 1. 
+ * + * \headerfile + * + * This intrinsic corresponds to the POPCNT instruction or a + * a sequence of arithmetic and logic ops to calculate it. + * + * \param __A + * An unsigned 64-bit integer operand. + * \returns A 64-bit integer containing the number of bits with value 1 in the + * source operand. + */ +static __inline__ long long __attribute__((__always_inline__, __nodebug__)) +__popcntq(unsigned long long __A) +{ + return __builtin_popcountll(__A); +} + +#define _popcnt64(A) __popcntq((A)) +#endif /* __x86_64__ */ + #ifdef __x86_64__ static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) __readeflags(void) @@ -55,6 +195,92 @@ __writeeflags(unsigned int __f) } #endif /* !__x86_64__ */ +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * unsigned char operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32B instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 8-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. + */ +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +__crc32b(unsigned int __C, unsigned char __D) +{ + return __builtin_ia32_crc32qi(__C, __D); +} + +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * unsigned short operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32W instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 16-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. + */ +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +__crc32w(unsigned int __C, unsigned short __D) +{ + return __builtin_ia32_crc32hi(__C, __D); +} + +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * second unsigned integer operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32D instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 32-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. + */ +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +__crc32d(unsigned int __C, unsigned int __D) +{ + return __builtin_ia32_crc32si(__C, __D); +} + +#ifdef __x86_64__ +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * unsigned 64-bit integer operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32Q instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 64-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. 
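The popcount helpers above lower to POPCNT or to plain arithmetic, while the CRC-32C helpers carry an sse4.2 target attribute, so callers must build with SSE4.2 enabled. A short illustrative use (the crc32c wrapper name is ours, not the header's; needs -msse4.2):

    #include <x86intrin.h>

    /* Accumulates a CRC-32C over a byte buffer with the new __crc32b. */
    unsigned int crc32c(unsigned int crc, const unsigned char *p,
                        unsigned long n)
    {
        while (n--)
            crc = __crc32b(crc, *p++);
        return crc;
    }

    int popcount_demo(void)
    {
        return __popcntd(0xF0F0u);   /* 8; _popcnt32 is the macro alias */
    }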
+ */ +static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +__crc32q(unsigned long long __C, unsigned long long __D) +{ + return __builtin_ia32_crc32di(__C, __D); +} +#endif /* __x86_64__ */ + static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) __rdpmc(int __A) { return __builtin_ia32_rdpmc(__A); @@ -75,4 +301,64 @@ _wbinvd(void) { __builtin_ia32_wbinvd(); } +static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__)) +__rolb(unsigned char __X, int __C) { + return __builtin_rotateleft8(__X, __C); +} + +static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__)) +__rorb(unsigned char __X, int __C) { + return __builtin_rotateright8(__X, __C); +} + +static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__)) +__rolw(unsigned short __X, int __C) { + return __builtin_rotateleft16(__X, __C); +} + +static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__)) +__rorw(unsigned short __X, int __C) { + return __builtin_rotateright16(__X, __C); +} + +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) +__rold(unsigned int __X, int __C) { + return __builtin_rotateleft32(__X, __C); +} + +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) +__rord(unsigned int __X, int __C) { + return __builtin_rotateright32(__X, __C); +} + +#ifdef __x86_64__ +static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) +__rolq(unsigned long long __X, int __C) { + return __builtin_rotateleft64(__X, __C); +} + +static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) +__rorq(unsigned long long __X, int __C) { + return __builtin_rotateright64(__X, __C); +} +#endif /* __x86_64__ */ + +#ifndef _MSC_VER +/* These are already provided as builtins for MSVC. */ +/* Select the correct function based on the size of long. */ +#ifdef __LP64__ +#define _lrotl(a,b) __rolq((a), (b)) +#define _lrotr(a,b) __rorq((a), (b)) +#else +#define _lrotl(a,b) __rold((a), (b)) +#define _lrotr(a,b) __rord((a), (b)) +#endif +#define _rotl(a,b) __rold((a), (b)) +#define _rotr(a,b) __rord((a), (b)) +#endif // _MSC_VER + +/* These are not builtins so need to be provided in all modes. */ +#define _rotwl(a,b) __rolw((a), (b)) +#define _rotwr(a,b) __rorw((a), (b)) + #endif /* __IA32INTRIN_H */ diff --git a/lib/include/immintrin.h b/lib/include/immintrin.h index 7d0722ec7..7555ad82f 100644 --- a/lib/include/immintrin.h +++ b/lib/include/immintrin.h @@ -1,22 +1,8 @@ /*===---- immintrin.h - Intel intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
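The rotate helpers map one-to-one onto Clang's __builtin_rotate* builtins, and the _lrotl/_lrotr/_rotl/_rotr macros at the end select a width from sizeof(long) so the MSVC spellings keep working outside MSVC mode. A brief check of the semantics:

    #include <x86intrin.h>
    #include <assert.h>

    void rotate_demo(void)
    {
        assert(__rolb(0x81, 1) == 0x03);               /* 8-bit rotate left   */
        assert(__rord(0x00000018u, 4) == 0x80000001u); /* 32-bit rotate right */
        assert(_lrotl(1UL, 1) == 2UL);   /* __rolq on LP64, __rold otherwise */
    }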
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -195,6 +181,15 @@ #include #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BF16__) +#include +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512BF16__)) +#include +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__) #include #endif @@ -241,18 +236,6 @@ _rdrand64_step(unsigned long long *__p) #endif #endif /* __RDRND__ */ -/* __bit_scan_forward */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_bit_scan_forward(int __A) { - return __builtin_ctz(__A); -} - -/* __bit_scan_reverse */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_bit_scan_reverse(int __A) { - return 31 - __builtin_clz(__A); -} - #if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__) #ifdef __x86_64__ static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) @@ -378,9 +361,8 @@ _storebe_i64(void * __P, long long __D) { #include #endif -#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVE__) +/* No feature check desired due to internal MSC_VER checks */ #include -#endif #if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__) #include @@ -439,7 +421,21 @@ _storebe_i64(void * __P, long long __D) { #include #endif -#ifdef _MSC_VER +#if !defined(_MSC_VER) || __has_feature(modules) || \ + defined(__AVX512VP2INTERSECT__) +#include +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__)) +#include +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__ENQCMD__) +#include +#endif + +#if defined(_MSC_VER) && __has_extension(gnu_asm) /* Define the default attributes for these intrinsics */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) #ifdef __cplusplus @@ -521,6 +517,6 @@ _InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination, #undef __DEFAULT_FN_ATTRS -#endif /* _MSC_VER */ +#endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */ #endif /* __IMMINTRIN_H */ diff --git a/lib/include/intrin.h b/lib/include/intrin.h index 966258bab..9786ba147 100644 --- a/lib/include/intrin.h +++ b/lib/include/intrin.h @@ -1,22 +1,8 @@ /* ===-------- intrin.h ---------------------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
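The immintrin.h hunks keep the established pattern: each sub-header is pulled in when its feature macro is defined, or unconditionally for MSVC/modules builds where everything must be visible. User code can key off the same macros; a minimal, illustrative probe:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
    #ifdef __AVX512BF16__
        puts("avx512bf16intrin.h available");   /* e.g. -mavx512bf16 */
    #endif
    #ifdef __ENQCMD__
        puts("enqcmdintrin.h available");       /* e.g. -menqcmd */
    #endif
        return 0;
    }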
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -200,10 +186,6 @@ __attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void); unsigned __int32 xbegin(void); void _xend(void); -static __inline__ -#define _XCR_XFEATURE_ENABLED_MASK 0 -unsigned __int64 __cdecl _xgetbv(unsigned int); -void __cdecl _xsetbv(unsigned int, unsigned __int64); /* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */ #ifdef __x86_64__ @@ -539,12 +521,6 @@ __cpuidex(int __info[4], int __level, int __ecx) { __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3]) : "a"(__level), "c"(__ecx)); } -static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS -_xgetbv(unsigned int __xcr_no) { - unsigned int __eax, __edx; - __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no)); - return ((unsigned __int64)__edx << 32) | __eax; -} static __inline__ void __DEFAULT_FN_ATTRS __halt(void) { __asm__ volatile ("hlt"); @@ -567,15 +543,9 @@ long _InterlockedAdd(long volatile *Addend, long Value); __int64 _ReadStatusReg(int); void _WriteStatusReg(int, __int64); -static inline unsigned short _byteswap_ushort (unsigned short val) { - return __builtin_bswap16(val); -} -static inline unsigned long _byteswap_ulong (unsigned long val) { - return __builtin_bswap32(val); -} -static inline unsigned __int64 _byteswap_uint64 (unsigned __int64 val) { - return __builtin_bswap64(val); -} +unsigned short __cdecl _byteswap_ushort(unsigned short val); +unsigned long __cdecl _byteswap_ulong (unsigned long val); +unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64 val); #endif /*----------------------------------------------------------------------------*\ diff --git a/lib/include/inttypes.h b/lib/include/inttypes.h index 1d8eabab0..1c894c4ac 100644 --- a/lib/include/inttypes.h +++ b/lib/include/inttypes.h @@ -1,27 +1,18 @@ /*===---- inttypes.h - Standard header for integer printf macros ----------===*\ * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
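With the duplicate inline definition removed from intrin.h, _xgetbv is now provided in one place (xsaveintrin.h, which immintrin.h includes without a feature check, as noted above). A sketch of the common XCR0 query; the helper name is ours, and a non-MSVC build needs -mxsave:

    #include <immintrin.h>

    /* Reads XCR0 (register 0, the old _XCR_XFEATURE_ENABLED_MASK) and
     * checks that the OS enabled XMM (bit 1) and YMM (bit 2) state. */
    int os_supports_avx_state(void)
    {
        unsigned long long xcr0 = _xgetbv(0);
        return (xcr0 & 0x6) == 0x6;
    }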
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ #ifndef __CLANG_INTTYPES_H +// AIX system headers need inttypes.h to be re-enterable while _STD_TYPES_T +// is defined until an inclusion of it without _STD_TYPES_T occurs, in which +// case the header guard macro is defined. +#if !defined(_AIX) || !defined(_STD_TYPES_T) #define __CLANG_INTTYPES_H +#endif #if defined(_MSC_VER) && _MSC_VER < 1800 #error MSVC does not have inttypes.h prior to Visual Studio 2013 diff --git a/lib/include/invpcidintrin.h b/lib/include/invpcidintrin.h index c30a19fa3..48dae0a86 100644 --- a/lib/include/invpcidintrin.h +++ b/lib/include/invpcidintrin.h @@ -1,22 +1,8 @@ /*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/iso646.h b/lib/include/iso646.h index dca13c5ba..e0a20c6f1 100644 --- a/lib/include/iso646.h +++ b/lib/include/iso646.h @@ -1,24 +1,8 @@ /*===---- iso646.h - Standard header for alternate spellings of operators---=== * - * Copyright (c) 2008 Eli Friedman - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/limits.h b/lib/include/limits.h index f04187ced..c653580ba 100644 --- a/lib/include/limits.h +++ b/lib/include/limits.h @@ -1,24 +1,8 @@ /*===---- limits.h - Standard header for integer sizes --------------------===*\ * - * Copyright (c) 2009 Chris Lattner - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ diff --git a/lib/include/lwpintrin.h b/lib/include/lwpintrin.h index 0b28d7358..d8ab0db03 100644 --- a/lib/include/lwpintrin.h +++ b/lib/include/lwpintrin.h @@ -1,22 +1,8 @@ /*===---- lwpintrin.h - LWP intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/lzcntintrin.h b/lib/include/lzcntintrin.h index 35c1651cc..f4ddce9d0 100644 --- a/lib/include/lzcntintrin.h +++ b/lib/include/lzcntintrin.h @@ -1,22 +1,8 @@ /*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/mm3dnow.h b/lib/include/mm3dnow.h index b0288757a..22ab13aa3 100644 --- a/lib/include/mm3dnow.h +++ b/lib/include/mm3dnow.h @@ -1,22 +1,8 @@ /*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
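// For orientation, the opencl_c module after the module.modulemap hunk
// a few files below would read roughly as follows (reconstructed from
// the diff context, not authoritative): the new opencl-c-base.h becomes
// visible through the same module as opencl-c.h.
module opencl_c {
  requires opencl
  header "opencl-c.h"
  header "opencl-c-base.h"
}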
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/mm_malloc.h b/lib/include/mm_malloc.h index 305afd31a..0ea32517a 100644 --- a/lib/include/mm_malloc.h +++ b/lib/include/mm_malloc.h @@ -1,22 +1,8 @@ /*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/mmintrin.h b/lib/include/mmintrin.h index a73539942..79a8b5501 100644 --- a/lib/include/mmintrin.h +++ b/lib/include/mmintrin.h @@ -1,22 +1,8 @@ /*===---- mmintrin.h - MMX intrinsics --------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -24,7 +10,7 @@ #ifndef __MMINTRIN_H #define __MMINTRIN_H -typedef long long __m64 __attribute__((__vector_size__(8))); +typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8))); typedef long long __v1di __attribute__((__vector_size__(8))); typedef int __v2si __attribute__((__vector_size__(8))); diff --git a/lib/include/module.modulemap b/lib/include/module.modulemap index 1d1af57fd..7954a77a4 100644 --- a/lib/include/module.modulemap +++ b/lib/include/module.modulemap @@ -1,22 +1,8 @@ /*===---- module.modulemap - intrinsics module map -------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -168,4 +154,5 @@ module _Builtin_stddef_max_align_t [system] [extern_c] { module opencl_c { requires opencl header "opencl-c.h" + header "opencl-c-base.h" } diff --git a/lib/include/movdirintrin.h b/lib/include/movdirintrin.h index ec20c5370..30c4d02c8 100644 --- a/lib/include/movdirintrin.h +++ b/lib/include/movdirintrin.h @@ -1,22 +1,8 @@ /*===------------------------- movdirintrin.h ------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/msa.h b/lib/include/msa.h index da680f5ca..19ea6071a 100644 --- a/lib/include/msa.h +++ b/lib/include/msa.h @@ -1,22 +1,8 @@ /*===---- msa.h - MIPS MSA intrinsics --------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/mwaitxintrin.h b/lib/include/mwaitxintrin.h index 2921eadfa..bca395b0e 100644 --- a/lib/include/mwaitxintrin.h +++ b/lib/include/mwaitxintrin.h @@ -1,22 +1,8 @@ /*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
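/* A quick compile-time check (illustrative, not part of the patch) of
   the mmintrin.h change above: __m64 now carries an explicit
   __aligned__(8), pinning its alignment to 8 bytes rather than leaving
   it implied by __vector_size__ alone. The _sketch name is hypothetical. */
typedef long long __m64_sketch __attribute__((__vector_size__(8), __aligned__(8)));
_Static_assert(sizeof(__m64_sketch) == 8, "__m64 stays 8 bytes wide");
_Static_assert(_Alignof(__m64_sketch) == 8, "__m64 is explicitly 8-byte aligned");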
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/nmmintrin.h b/lib/include/nmmintrin.h index 348fb8c7c..672aea496 100644 --- a/lib/include/nmmintrin.h +++ b/lib/include/nmmintrin.h @@ -1,22 +1,8 @@ /*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/opencl-c-base.h b/lib/include/opencl-c-base.h new file mode 100644 index 000000000..a82954ddd --- /dev/null +++ b/lib/include/opencl-c-base.h @@ -0,0 +1,578 @@ +//===----- opencl-c-base.h - OpenCL C language base definitions -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _OPENCL_BASE_H_ +#define _OPENCL_BASE_H_ + +// built-in scalar data types: + +/** + * An unsigned 8-bit integer. + */ +typedef unsigned char uchar; + +/** + * An unsigned 16-bit integer. + */ +typedef unsigned short ushort; + +/** + * An unsigned 32-bit integer. + */ +typedef unsigned int uint; + +/** + * An unsigned 64-bit integer. + */ +typedef unsigned long ulong; + +/** + * The unsigned integer type of the result of the sizeof operator. This + * is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS + * defined in table 4.3 is 32-bits and is a 64-bit unsigned integer if + * CL_DEVICE_ADDRESS_BITS is 64-bits. + */ +typedef __SIZE_TYPE__ size_t; + +/** + * A signed integer type that is the result of subtracting two pointers. + * This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS + * defined in table 4.3 is 32-bits and is a 64-bit signed integer if + * CL_DEVICE_ADDRESS_BITS is 64-bits. 
+ */ +typedef __PTRDIFF_TYPE__ ptrdiff_t; + +/** + * A signed integer type with the property that any valid pointer to + * void can be converted to this type, then converted back to pointer + * to void, and the result will compare equal to the original pointer. + */ +typedef __INTPTR_TYPE__ intptr_t; + +/** + * An unsigned integer type with the property that any valid pointer to + * void can be converted to this type, then converted back to pointer + * to void, and the result will compare equal to the original pointer. + */ +typedef __UINTPTR_TYPE__ uintptr_t; + +// built-in vector data types: +typedef char char2 __attribute__((ext_vector_type(2))); +typedef char char3 __attribute__((ext_vector_type(3))); +typedef char char4 __attribute__((ext_vector_type(4))); +typedef char char8 __attribute__((ext_vector_type(8))); +typedef char char16 __attribute__((ext_vector_type(16))); +typedef uchar uchar2 __attribute__((ext_vector_type(2))); +typedef uchar uchar3 __attribute__((ext_vector_type(3))); +typedef uchar uchar4 __attribute__((ext_vector_type(4))); +typedef uchar uchar8 __attribute__((ext_vector_type(8))); +typedef uchar uchar16 __attribute__((ext_vector_type(16))); +typedef short short2 __attribute__((ext_vector_type(2))); +typedef short short3 __attribute__((ext_vector_type(3))); +typedef short short4 __attribute__((ext_vector_type(4))); +typedef short short8 __attribute__((ext_vector_type(8))); +typedef short short16 __attribute__((ext_vector_type(16))); +typedef ushort ushort2 __attribute__((ext_vector_type(2))); +typedef ushort ushort3 __attribute__((ext_vector_type(3))); +typedef ushort ushort4 __attribute__((ext_vector_type(4))); +typedef ushort ushort8 __attribute__((ext_vector_type(8))); +typedef ushort ushort16 __attribute__((ext_vector_type(16))); +typedef int int2 __attribute__((ext_vector_type(2))); +typedef int int3 __attribute__((ext_vector_type(3))); +typedef int int4 __attribute__((ext_vector_type(4))); +typedef int int8 __attribute__((ext_vector_type(8))); +typedef int int16 __attribute__((ext_vector_type(16))); +typedef uint uint2 __attribute__((ext_vector_type(2))); +typedef uint uint3 __attribute__((ext_vector_type(3))); +typedef uint uint4 __attribute__((ext_vector_type(4))); +typedef uint uint8 __attribute__((ext_vector_type(8))); +typedef uint uint16 __attribute__((ext_vector_type(16))); +typedef long long2 __attribute__((ext_vector_type(2))); +typedef long long3 __attribute__((ext_vector_type(3))); +typedef long long4 __attribute__((ext_vector_type(4))); +typedef long long8 __attribute__((ext_vector_type(8))); +typedef long long16 __attribute__((ext_vector_type(16))); +typedef ulong ulong2 __attribute__((ext_vector_type(2))); +typedef ulong ulong3 __attribute__((ext_vector_type(3))); +typedef ulong ulong4 __attribute__((ext_vector_type(4))); +typedef ulong ulong8 __attribute__((ext_vector_type(8))); +typedef ulong ulong16 __attribute__((ext_vector_type(16))); +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); +typedef float float8 __attribute__((ext_vector_type(8))); +typedef float float16 __attribute__((ext_vector_type(16))); +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +typedef half half2 __attribute__((ext_vector_type(2))); +typedef half half3 __attribute__((ext_vector_type(3))); +typedef half half4 __attribute__((ext_vector_type(4))); +typedef half half8 __attribute__((ext_vector_type(8))); +typedef half half16 
__attribute__((ext_vector_type(16))); +#endif +#ifdef cl_khr_fp64 +#if __OPENCL_C_VERSION__ < CL_VERSION_1_2 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif +typedef double double2 __attribute__((ext_vector_type(2))); +typedef double double3 __attribute__((ext_vector_type(3))); +typedef double double4 __attribute__((ext_vector_type(4))); +typedef double double8 __attribute__((ext_vector_type(8))); +typedef double double16 __attribute__((ext_vector_type(16))); +#endif + +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#define NULL ((void*)0) +#endif + +/** + * Value of maximum non-infinite single-precision floating-point + * number. + */ +#define MAXFLOAT 0x1.fffffep127f + +/** + * A positive float constant expression. HUGE_VALF evaluates + * to +infinity. Used as an error value returned by the built-in + * math functions. + */ +#define HUGE_VALF (__builtin_huge_valf()) + +/** + * A positive double constant expression. HUGE_VAL evaluates + * to +infinity. Used as an error value returned by the built-in + * math functions. + */ +#define HUGE_VAL (__builtin_huge_val()) + +/** + * A constant expression of type float representing positive or + * unsigned infinity. + */ +#define INFINITY (__builtin_inff()) + +/** + * A constant expression of type float representing a quiet NaN. + */ +#define NAN as_float(INT_MAX) + +#define FP_ILOGB0 INT_MIN +#define FP_ILOGBNAN INT_MAX + +#define FLT_DIG 6 +#define FLT_MANT_DIG 24 +#define FLT_MAX_10_EXP +38 +#define FLT_MAX_EXP +128 +#define FLT_MIN_10_EXP -37 +#define FLT_MIN_EXP -125 +#define FLT_RADIX 2 +#define FLT_MAX 0x1.fffffep127f +#define FLT_MIN 0x1.0p-126f +#define FLT_EPSILON 0x1.0p-23f + +#define M_E_F 2.71828182845904523536028747135266250f +#define M_LOG2E_F 1.44269504088896340735992468100189214f +#define M_LOG10E_F 0.434294481903251827651128918916605082f +#define M_LN2_F 0.693147180559945309417232121458176568f +#define M_LN10_F 2.30258509299404568401799145468436421f +#define M_PI_F 3.14159265358979323846264338327950288f +#define M_PI_2_F 1.57079632679489661923132169163975144f +#define M_PI_4_F 0.785398163397448309615660845819875721f +#define M_1_PI_F 0.318309886183790671537767526745028724f +#define M_2_PI_F 0.636619772367581343075535053490057448f +#define M_2_SQRTPI_F 1.12837916709551257389615890312154517f +#define M_SQRT2_F 1.41421356237309504880168872420969808f +#define M_SQRT1_2_F 0.707106781186547524400844362104849039f + +#define DBL_DIG 15 +#define DBL_MANT_DIG 53 +#define DBL_MAX_10_EXP +308 +#define DBL_MAX_EXP +1024 +#define DBL_MIN_10_EXP -307 +#define DBL_MIN_EXP -1021 +#define DBL_RADIX 2 +#define DBL_MAX 0x1.fffffffffffffp1023 +#define DBL_MIN 0x1.0p-1022 +#define DBL_EPSILON 0x1.0p-52 + +#define M_E 0x1.5bf0a8b145769p+1 +#define M_LOG2E 0x1.71547652b82fep+0 +#define M_LOG10E 0x1.bcb7b1526e50ep-2 +#define M_LN2 0x1.62e42fefa39efp-1 +#define M_LN10 0x1.26bb1bbb55516p+1 +#define M_PI 0x1.921fb54442d18p+1 +#define M_PI_2 0x1.921fb54442d18p+0 +#define M_PI_4 0x1.921fb54442d18p-1 +#define M_1_PI 0x1.45f306dc9c883p-2 +#define M_2_PI 0x1.45f306dc9c883p-1 +#define M_2_SQRTPI 0x1.20dd750429b6dp+0 +#define M_SQRT2 0x1.6a09e667f3bcdp+0 +#define M_SQRT1_2 0x1.6a09e667f3bcdp-1 + +#ifdef cl_khr_fp16 + +#define HALF_DIG 3 +#define HALF_MANT_DIG 11 +#define HALF_MAX_10_EXP +4 +#define HALF_MAX_EXP +16 +#define HALF_MIN_10_EXP -4 +#define HALF_MIN_EXP -13 +#define HALF_RADIX 2 +#define HALF_MAX ((0x1.ffcp15h)) +#define HALF_MIN ((0x1.0p-14h)) +#define HALF_EPSILON ((0x1.0p-10h)) + +#define M_E_H 2.71828182845904523536028747135266250h +#define 
M_LOG2E_H 1.44269504088896340735992468100189214h +#define M_LOG10E_H 0.434294481903251827651128918916605082h +#define M_LN2_H 0.693147180559945309417232121458176568h +#define M_LN10_H 2.30258509299404568401799145468436421h +#define M_PI_H 3.14159265358979323846264338327950288h +#define M_PI_2_H 1.57079632679489661923132169163975144h +#define M_PI_4_H 0.785398163397448309615660845819875721h +#define M_1_PI_H 0.318309886183790671537767526745028724h +#define M_2_PI_H 0.636619772367581343075535053490057448h +#define M_2_SQRTPI_H 1.12837916709551257389615890312154517h +#define M_SQRT2_H 1.41421356237309504880168872420969808h +#define M_SQRT1_2_H 0.707106781186547524400844362104849039h + +#endif //cl_khr_fp16 + +#define CHAR_BIT 8 +#define SCHAR_MAX 127 +#define SCHAR_MIN (-128) +#define UCHAR_MAX 255 +#define CHAR_MAX SCHAR_MAX +#define CHAR_MIN SCHAR_MIN +#define USHRT_MAX 65535 +#define SHRT_MAX 32767 +#define SHRT_MIN (-32768) +#define UINT_MAX 0xffffffff +#define INT_MAX 2147483647 +#define INT_MIN (-2147483647-1) +#define ULONG_MAX 0xffffffffffffffffUL +#define LONG_MAX 0x7fffffffffffffffL +#define LONG_MIN (-0x7fffffffffffffffL-1) + +// OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions + +// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence +typedef uint cl_mem_fence_flags; + +/** + * Queue a memory fence to ensure correct + * ordering of memory operations to local memory + */ +#define CLK_LOCAL_MEM_FENCE 0x01 + +/** + * Queue a memory fence to ensure correct + * ordering of memory operations to global memory + */ +#define CLK_GLOBAL_MEM_FENCE 0x02 + +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +typedef enum memory_scope { + memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, + memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, + memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, + memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, +#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) + memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP +#endif +} memory_scope; + +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +/** + * Queue a memory fence to ensure correct ordering of memory + * operations between work-items of a work-group to + * image memory. + */ +#define CLK_IMAGE_MEM_FENCE 0x04 + +#ifndef ATOMIC_VAR_INIT +#define ATOMIC_VAR_INIT(x) (x) +#endif //ATOMIC_VAR_INIT +#define ATOMIC_FLAG_INIT 0 + +// enum values aligned with what clang uses in EmitAtomicExpr() +typedef enum memory_order +{ + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +} memory_order; + +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions + +// These values need to match the runtime equivalent +// +// Addressing Mode. +// +#define CLK_ADDRESS_NONE 0 +#define CLK_ADDRESS_CLAMP_TO_EDGE 2 +#define CLK_ADDRESS_CLAMP 4 +#define CLK_ADDRESS_REPEAT 6 +#define CLK_ADDRESS_MIRRORED_REPEAT 8 + +// +// Coordination Normalization +// +#define CLK_NORMALIZED_COORDS_FALSE 0 +#define CLK_NORMALIZED_COORDS_TRUE 1 + +// +// Filtering Mode. 
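/* A short usage sketch (illustrative, not part of the header) tying
   together the synchronization pieces defined above: CLK_LOCAL_MEM_FENCE
   with barrier(), and the memory_order/memory_scope enums with the
   explicit atomics declared in opencl-c.h. Kernel and variable names
   are hypothetical; requires OpenCL C 2.0. */
kernel void count_nonzero(global const int *in,
                          volatile global atomic_uint *total) {
  local atomic_uint tile_count;
  if (get_local_id(0) == 0)
    atomic_store_explicit(&tile_count, 0u,
                          memory_order_relaxed, memory_scope_work_group);
  /* Every work-item must see the zeroed counter before updating it. */
  barrier(CLK_LOCAL_MEM_FENCE);
  if (in[get_global_id(0)] != 0)
    atomic_fetch_add_explicit(&tile_count, 1u,
                              memory_order_relaxed, memory_scope_work_group);
  barrier(CLK_LOCAL_MEM_FENCE);
  /* One representative per work-group publishes the tile total. */
  if (get_local_id(0) == 0)
    atomic_fetch_add_explicit(total,
                              atomic_load_explicit(&tile_count,
                                                   memory_order_relaxed,
                                                   memory_scope_work_group),
                              memory_order_relaxed, memory_scope_device);
}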
+// +#define CLK_FILTER_NEAREST 0x10 +#define CLK_FILTER_LINEAR 0x20 + +#ifdef cl_khr_gl_msaa_sharing +#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable +#endif //cl_khr_gl_msaa_sharing + +// +// Channel Datatype. +// +#define CLK_SNORM_INT8 0x10D0 +#define CLK_SNORM_INT16 0x10D1 +#define CLK_UNORM_INT8 0x10D2 +#define CLK_UNORM_INT16 0x10D3 +#define CLK_UNORM_SHORT_565 0x10D4 +#define CLK_UNORM_SHORT_555 0x10D5 +#define CLK_UNORM_INT_101010 0x10D6 +#define CLK_SIGNED_INT8 0x10D7 +#define CLK_SIGNED_INT16 0x10D8 +#define CLK_SIGNED_INT32 0x10D9 +#define CLK_UNSIGNED_INT8 0x10DA +#define CLK_UNSIGNED_INT16 0x10DB +#define CLK_UNSIGNED_INT32 0x10DC +#define CLK_HALF_FLOAT 0x10DD +#define CLK_FLOAT 0x10DE +#define CLK_UNORM_INT24 0x10DF + +// Channel order, numbering must be aligned with cl_channel_order in cl.h +// +#define CLK_R 0x10B0 +#define CLK_A 0x10B1 +#define CLK_RG 0x10B2 +#define CLK_RA 0x10B3 +#define CLK_RGB 0x10B4 +#define CLK_RGBA 0x10B5 +#define CLK_BGRA 0x10B6 +#define CLK_ARGB 0x10B7 +#define CLK_INTENSITY 0x10B8 +#define CLK_LUMINANCE 0x10B9 +#define CLK_Rx 0x10BA +#define CLK_RGx 0x10BB +#define CLK_RGBx 0x10BC +#define CLK_DEPTH 0x10BD +#define CLK_DEPTH_STENCIL 0x10BE +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#define CLK_sRGB 0x10BF +#define CLK_sRGBx 0x10C0 +#define CLK_sRGBA 0x10C1 +#define CLK_sBGRA 0x10C2 +#define CLK_ABGR 0x10C3 +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +// OpenCL v2.0 s6.13.16 - Pipe Functions +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t)) +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 + + +// OpenCL v2.0 s6.13.17 - Enqueue Kernels +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +#define CLK_SUCCESS 0 +#define CLK_ENQUEUE_FAILURE -101 +#define CLK_INVALID_QUEUE -102 +#define CLK_INVALID_NDRANGE -160 +#define CLK_INVALID_EVENT_WAIT_LIST -57 +#define CLK_DEVICE_QUEUE_FULL -161 +#define CLK_INVALID_ARG_SIZE -51 +#define CLK_EVENT_ALLOCATION_FAILURE -100 +#define CLK_OUT_OF_RESOURCES -5 + +#define CLK_NULL_QUEUE 0 +#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t)) + +// execution model related definitions +#define CLK_ENQUEUE_FLAGS_NO_WAIT 0x0 +#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0x1 +#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 0x2 + +typedef int kernel_enqueue_flags_t; +typedef int clk_profiling_info; + +// Profiling info name (see capture_event_profiling_info) +#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1 + +#define MAX_WORK_DIM 3 + +typedef struct { + unsigned int workDimension; + size_t globalWorkOffset[MAX_WORK_DIM]; + size_t globalWorkSize[MAX_WORK_DIM]; + size_t localWorkSize[MAX_WORK_DIM]; +} ndrange_t; + +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +#ifdef cl_intel_device_side_avc_motion_estimation +#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin + +#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0 +#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1 +#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2 +#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3 + +#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0 +#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1 +#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2 +#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3 + +#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0 +#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 +#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 + +#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 +#define 
CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E +#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D +#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B +#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 +#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F +#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F +#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F + +#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 +#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 +#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 + +#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 +#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 +#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 +#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 +#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 +#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 +#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 +#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 +#define CLK_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 + +#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 + +#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 + +#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 +#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 +#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 +#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 + +#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 +#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 +#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 +#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B +#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 + +#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 +#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 +#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 +#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 + +#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0 +#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1 +#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2 + +#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 +#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 + +#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1 << 26) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2 << 26) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1 << 28) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2 << 28) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1 << 30) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2 << 30) + +#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 +#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 + +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0 +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 + +#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 +#define 
CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 +#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 +#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 + +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1 +#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2 +#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3 + +#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 +#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 + +#define CLK_AVC_ME_INITIALIZE_INTEL 0x0 + +#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL 0x0 +#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL 0x0 +#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL 0x0 + +#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL 0x0 + +#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 +#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 + +#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end +#endif // cl_intel_device_side_avc_motion_estimation + +#endif //_OPENCL_BASE_H_ diff --git a/lib/include/opencl-c.h b/lib/include/opencl-c.h index 3d3dfb749..4207c53cc 100644 --- a/lib/include/opencl-c.h +++ b/lib/include/opencl-c.h @@ -1,15 +1,16 @@ //===--- opencl-c.h - OpenCL C language builtin function header -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef _OPENCL_H_ #define _OPENCL_H_ +#include "opencl-c-base.h" + #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 #ifndef cl_khr_depth_images #define cl_khr_depth_images @@ -23,9 +24,6 @@ #endif //__OPENCL_C_VERSION__ < CL_VERSION_2_0 #if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 -#ifndef cl_intel_planar_yuv -#define cl_intel_planar_yuv -#endif // cl_intel_planar_yuv #pragma OPENCL EXTENSION cl_intel_planar_yuv : begin #pragma OPENCL EXTENSION cl_intel_planar_yuv : end #endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 @@ -37,255 +35,6 @@ #define __purefn __attribute__((pure)) #define __cnfn __attribute__((const)) -// built-in scalar data types: - -/** - * An unsigned 8-bit integer. 
- */ -typedef unsigned char uchar; - -/** - * An unsigned 16-bit integer. - */ -typedef unsigned short ushort; - -/** - * An unsigned 32-bit integer. - */ -typedef unsigned int uint; - -/** - * An unsigned 64-bit integer. - */ -typedef unsigned long ulong; - -/** - * The unsigned integer type of the result of the sizeof operator. This - * is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS - * defined in table 4.3 is 32-bits and is a 64-bit unsigned integer if - * CL_DEVICE_ADDRESS_BITS is 64-bits. - */ -typedef __SIZE_TYPE__ size_t; - -/** - * A signed integer type that is the result of subtracting two pointers. - * This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS - * defined in table 4.3 is 32-bits and is a 64-bit signed integer if - * CL_DEVICE_ADDRESS_BITS is 64-bits. - */ -typedef __PTRDIFF_TYPE__ ptrdiff_t; - -/** -* A signed integer type with the property that any valid pointer to -* void can be converted to this type, then converted back to pointer -* to void, and the result will compare equal to the original pointer. -*/ -typedef __INTPTR_TYPE__ intptr_t; - -/** -* An unsigned integer type with the property that any valid pointer to -* void can be converted to this type, then converted back to pointer -* to void, and the result will compare equal to the original pointer. -*/ -typedef __UINTPTR_TYPE__ uintptr_t; - -// built-in vector data types: -typedef char char2 __attribute__((ext_vector_type(2))); -typedef char char3 __attribute__((ext_vector_type(3))); -typedef char char4 __attribute__((ext_vector_type(4))); -typedef char char8 __attribute__((ext_vector_type(8))); -typedef char char16 __attribute__((ext_vector_type(16))); -typedef uchar uchar2 __attribute__((ext_vector_type(2))); -typedef uchar uchar3 __attribute__((ext_vector_type(3))); -typedef uchar uchar4 __attribute__((ext_vector_type(4))); -typedef uchar uchar8 __attribute__((ext_vector_type(8))); -typedef uchar uchar16 __attribute__((ext_vector_type(16))); -typedef short short2 __attribute__((ext_vector_type(2))); -typedef short short3 __attribute__((ext_vector_type(3))); -typedef short short4 __attribute__((ext_vector_type(4))); -typedef short short8 __attribute__((ext_vector_type(8))); -typedef short short16 __attribute__((ext_vector_type(16))); -typedef ushort ushort2 __attribute__((ext_vector_type(2))); -typedef ushort ushort3 __attribute__((ext_vector_type(3))); -typedef ushort ushort4 __attribute__((ext_vector_type(4))); -typedef ushort ushort8 __attribute__((ext_vector_type(8))); -typedef ushort ushort16 __attribute__((ext_vector_type(16))); -typedef int int2 __attribute__((ext_vector_type(2))); -typedef int int3 __attribute__((ext_vector_type(3))); -typedef int int4 __attribute__((ext_vector_type(4))); -typedef int int8 __attribute__((ext_vector_type(8))); -typedef int int16 __attribute__((ext_vector_type(16))); -typedef uint uint2 __attribute__((ext_vector_type(2))); -typedef uint uint3 __attribute__((ext_vector_type(3))); -typedef uint uint4 __attribute__((ext_vector_type(4))); -typedef uint uint8 __attribute__((ext_vector_type(8))); -typedef uint uint16 __attribute__((ext_vector_type(16))); -typedef long long2 __attribute__((ext_vector_type(2))); -typedef long long3 __attribute__((ext_vector_type(3))); -typedef long long4 __attribute__((ext_vector_type(4))); -typedef long long8 __attribute__((ext_vector_type(8))); -typedef long long16 __attribute__((ext_vector_type(16))); -typedef ulong ulong2 __attribute__((ext_vector_type(2))); -typedef ulong ulong3 __attribute__((ext_vector_type(3))); -typedef 
ulong ulong4 __attribute__((ext_vector_type(4))); -typedef ulong ulong8 __attribute__((ext_vector_type(8))); -typedef ulong ulong16 __attribute__((ext_vector_type(16))); -typedef float float2 __attribute__((ext_vector_type(2))); -typedef float float3 __attribute__((ext_vector_type(3))); -typedef float float4 __attribute__((ext_vector_type(4))); -typedef float float8 __attribute__((ext_vector_type(8))); -typedef float float16 __attribute__((ext_vector_type(16))); -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -typedef half half2 __attribute__((ext_vector_type(2))); -typedef half half3 __attribute__((ext_vector_type(3))); -typedef half half4 __attribute__((ext_vector_type(4))); -typedef half half8 __attribute__((ext_vector_type(8))); -typedef half half16 __attribute__((ext_vector_type(16))); -#endif -#ifdef cl_khr_fp64 -#if __OPENCL_C_VERSION__ < CL_VERSION_1_2 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -#endif -typedef double double2 __attribute__((ext_vector_type(2))); -typedef double double3 __attribute__((ext_vector_type(3))); -typedef double double4 __attribute__((ext_vector_type(4))); -typedef double double8 __attribute__((ext_vector_type(8))); -typedef double double16 __attribute__((ext_vector_type(16))); -#endif - -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -#define NULL ((void*)0) -#endif - -/** - * Value of maximum non-infinite single-precision floating-point - * number. - */ -#define MAXFLOAT 0x1.fffffep127f - -/** - * A positive float constant expression. HUGE_VALF evaluates - * to +infinity. Used as an error value returned by the built-in - * math functions. - */ -#define HUGE_VALF (__builtin_huge_valf()) - -/** - * A positive double constant expression. HUGE_VAL evaluates - * to +infinity. Used as an error value returned by the built-in - * math functions. - */ -#define HUGE_VAL (__builtin_huge_val()) - -/** - * A constant expression of type float representing positive or - * unsigned infinity. - */ -#define INFINITY (__builtin_inff()) - -/** - * A constant expression of type float representing a quiet NaN. 
- */ -#define NAN as_float(INT_MAX) - -#define FP_ILOGB0 INT_MIN -#define FP_ILOGBNAN INT_MAX - -#define FLT_DIG 6 -#define FLT_MANT_DIG 24 -#define FLT_MAX_10_EXP +38 -#define FLT_MAX_EXP +128 -#define FLT_MIN_10_EXP -37 -#define FLT_MIN_EXP -125 -#define FLT_RADIX 2 -#define FLT_MAX 0x1.fffffep127f -#define FLT_MIN 0x1.0p-126f -#define FLT_EPSILON 0x1.0p-23f - -#define M_E_F 2.71828182845904523536028747135266250f -#define M_LOG2E_F 1.44269504088896340735992468100189214f -#define M_LOG10E_F 0.434294481903251827651128918916605082f -#define M_LN2_F 0.693147180559945309417232121458176568f -#define M_LN10_F 2.30258509299404568401799145468436421f -#define M_PI_F 3.14159265358979323846264338327950288f -#define M_PI_2_F 1.57079632679489661923132169163975144f -#define M_PI_4_F 0.785398163397448309615660845819875721f -#define M_1_PI_F 0.318309886183790671537767526745028724f -#define M_2_PI_F 0.636619772367581343075535053490057448f -#define M_2_SQRTPI_F 1.12837916709551257389615890312154517f -#define M_SQRT2_F 1.41421356237309504880168872420969808f -#define M_SQRT1_2_F 0.707106781186547524400844362104849039f - -#define DBL_DIG 15 -#define DBL_MANT_DIG 53 -#define DBL_MAX_10_EXP +308 -#define DBL_MAX_EXP +1024 -#define DBL_MIN_10_EXP -307 -#define DBL_MIN_EXP -1021 -#define DBL_RADIX 2 -#define DBL_MAX 0x1.fffffffffffffp1023 -#define DBL_MIN 0x1.0p-1022 -#define DBL_EPSILON 0x1.0p-52 - -#define M_E 0x1.5bf0a8b145769p+1 -#define M_LOG2E 0x1.71547652b82fep+0 -#define M_LOG10E 0x1.bcb7b1526e50ep-2 -#define M_LN2 0x1.62e42fefa39efp-1 -#define M_LN10 0x1.26bb1bbb55516p+1 -#define M_PI 0x1.921fb54442d18p+1 -#define M_PI_2 0x1.921fb54442d18p+0 -#define M_PI_4 0x1.921fb54442d18p-1 -#define M_1_PI 0x1.45f306dc9c883p-2 -#define M_2_PI 0x1.45f306dc9c883p-1 -#define M_2_SQRTPI 0x1.20dd750429b6dp+0 -#define M_SQRT2 0x1.6a09e667f3bcdp+0 -#define M_SQRT1_2 0x1.6a09e667f3bcdp-1 - -#ifdef cl_khr_fp16 - -#define HALF_DIG 3 -#define HALF_MANT_DIG 11 -#define HALF_MAX_10_EXP +4 -#define HALF_MAX_EXP +16 -#define HALF_MIN_10_EXP -4 -#define HALF_MIN_EXP -13 -#define HALF_RADIX 2 -#define HALF_MAX ((0x1.ffcp15h)) -#define HALF_MIN ((0x1.0p-14h)) -#define HALF_EPSILON ((0x1.0p-10h)) - -#define M_E_H 2.71828182845904523536028747135266250h -#define M_LOG2E_H 1.44269504088896340735992468100189214h -#define M_LOG10E_H 0.434294481903251827651128918916605082h -#define M_LN2_H 0.693147180559945309417232121458176568h -#define M_LN10_H 2.30258509299404568401799145468436421h -#define M_PI_H 3.14159265358979323846264338327950288h -#define M_PI_2_H 1.57079632679489661923132169163975144h -#define M_PI_4_H 0.785398163397448309615660845819875721h -#define M_1_PI_H 0.318309886183790671537767526745028724h -#define M_2_PI_H 0.636619772367581343075535053490057448h -#define M_2_SQRTPI_H 1.12837916709551257389615890312154517h -#define M_SQRT2_H 1.41421356237309504880168872420969808h -#define M_SQRT1_2_H 0.707106781186547524400844362104849039h - -#endif //cl_khr_fp16 - -#define CHAR_BIT 8 -#define SCHAR_MAX 127 -#define SCHAR_MIN (-128) -#define UCHAR_MAX 255 -#define CHAR_MAX SCHAR_MAX -#define CHAR_MIN SCHAR_MIN -#define USHRT_MAX 65535 -#define SHRT_MAX 32767 -#define SHRT_MIN (-32768) -#define UINT_MAX 0xffffffff -#define INT_MAX 2147483647 -#define INT_MIN (-2147483647-1) -#define ULONG_MAX 0xffffffffffffffffUL -#define LONG_MAX 0x7fffffffffffffffL -#define LONG_MIN (-0x7fffffffffffffffL-1) // OpenCL v1.1/1.2/2.0 s6.2.3 - Explicit conversions @@ -9598,8 +9347,6 @@ long8 __ovld __cnfn clamp(long8 x, long8 minval, long8 maxval); ulong8 
__ovld __cnfn clamp(ulong8 x, ulong8 minval, ulong8 maxval); long16 __ovld __cnfn clamp(long16 x, long16 minval, long16 maxval); ulong16 __ovld __cnfn clamp(ulong16 x, ulong16 minval, ulong16 maxval); -char __ovld __cnfn clamp(char x, char minval, char maxval); -uchar __ovld __cnfn clamp(uchar x, uchar minval, uchar maxval); char2 __ovld __cnfn clamp(char2 x, char minval, char maxval); uchar2 __ovld __cnfn clamp(uchar2 x, uchar minval, uchar maxval); char3 __ovld __cnfn clamp(char3 x, char minval, char maxval); @@ -9610,8 +9357,6 @@ char8 __ovld __cnfn clamp(char8 x, char minval, char maxval); uchar8 __ovld __cnfn clamp(uchar8 x, uchar minval, uchar maxval); char16 __ovld __cnfn clamp(char16 x, char minval, char maxval); uchar16 __ovld __cnfn clamp(uchar16 x, uchar minval, uchar maxval); -short __ovld __cnfn clamp(short x, short minval, short maxval); -ushort __ovld __cnfn clamp(ushort x, ushort minval, ushort maxval); short2 __ovld __cnfn clamp(short2 x, short minval, short maxval); ushort2 __ovld __cnfn clamp(ushort2 x, ushort minval, ushort maxval); short3 __ovld __cnfn clamp(short3 x, short minval, short maxval); @@ -9622,8 +9367,6 @@ short8 __ovld __cnfn clamp(short8 x, short minval, short maxval); ushort8 __ovld __cnfn clamp(ushort8 x, ushort minval, ushort maxval); short16 __ovld __cnfn clamp(short16 x, short minval, short maxval); ushort16 __ovld __cnfn clamp(ushort16 x, ushort minval, ushort maxval); -int __ovld __cnfn clamp(int x, int minval, int maxval); -uint __ovld __cnfn clamp(uint x, uint minval, uint maxval); int2 __ovld __cnfn clamp(int2 x, int minval, int maxval); uint2 __ovld __cnfn clamp(uint2 x, uint minval, uint maxval); int3 __ovld __cnfn clamp(int3 x, int minval, int maxval); @@ -9634,8 +9377,6 @@ int8 __ovld __cnfn clamp(int8 x, int minval, int maxval); uint8 __ovld __cnfn clamp(uint8 x, uint minval, uint maxval); int16 __ovld __cnfn clamp(int16 x, int minval, int maxval); uint16 __ovld __cnfn clamp(uint16 x, uint minval, uint maxval); -long __ovld __cnfn clamp(long x, long minval, long maxval); -ulong __ovld __cnfn clamp(ulong x, ulong minval, ulong maxval); long2 __ovld __cnfn clamp(long2 x, long minval, long maxval); ulong2 __ovld __cnfn clamp(ulong2 x, ulong minval, ulong maxval); long3 __ovld __cnfn clamp(long3 x, long minval, long maxval); @@ -9911,8 +9652,6 @@ long8 __ovld __cnfn max(long8 x, long8 y); ulong8 __ovld __cnfn max(ulong8 x, ulong8 y); long16 __ovld __cnfn max(long16 x, long16 y); ulong16 __ovld __cnfn max(ulong16 x, ulong16 y); -char __ovld __cnfn max(char x, char y); -uchar __ovld __cnfn max(uchar x, uchar y); char2 __ovld __cnfn max(char2 x, char y); uchar2 __ovld __cnfn max(uchar2 x, uchar y); char3 __ovld __cnfn max(char3 x, char y); @@ -9923,8 +9662,6 @@ char8 __ovld __cnfn max(char8 x, char y); uchar8 __ovld __cnfn max(uchar8 x, uchar y); char16 __ovld __cnfn max(char16 x, char y); uchar16 __ovld __cnfn max(uchar16 x, uchar y); -short __ovld __cnfn max(short x, short y); -ushort __ovld __cnfn max(ushort x, ushort y); short2 __ovld __cnfn max(short2 x, short y); ushort2 __ovld __cnfn max(ushort2 x, ushort y); short3 __ovld __cnfn max(short3 x, short y); @@ -9935,8 +9672,6 @@ short8 __ovld __cnfn max(short8 x, short y); ushort8 __ovld __cnfn max(ushort8 x, ushort y); short16 __ovld __cnfn max(short16 x, short y); ushort16 __ovld __cnfn max(ushort16 x, ushort y); -int __ovld __cnfn max(int x, int y); -uint __ovld __cnfn max(uint x, uint y); int2 __ovld __cnfn max(int2 x, int y); uint2 __ovld __cnfn max(uint2 x, uint y); int3 __ovld __cnfn 
max(int3 x, int y); @@ -9947,8 +9682,6 @@ int8 __ovld __cnfn max(int8 x, int y); uint8 __ovld __cnfn max(uint8 x, uint y); int16 __ovld __cnfn max(int16 x, int y); uint16 __ovld __cnfn max(uint16 x, uint y); -long __ovld __cnfn max(long x, long y); -ulong __ovld __cnfn max(ulong x, ulong y); long2 __ovld __cnfn max(long2 x, long y); ulong2 __ovld __cnfn max(ulong2 x, ulong y); long3 __ovld __cnfn max(long3 x, long y); @@ -10011,8 +9744,6 @@ long8 __ovld __cnfn min(long8 x, long8 y); ulong8 __ovld __cnfn min(ulong8 x, ulong8 y); long16 __ovld __cnfn min(long16 x, long16 y); ulong16 __ovld __cnfn min(ulong16 x, ulong16 y); -char __ovld __cnfn min(char x, char y); -uchar __ovld __cnfn min(uchar x, uchar y); char2 __ovld __cnfn min(char2 x, char y); uchar2 __ovld __cnfn min(uchar2 x, uchar y); char3 __ovld __cnfn min(char3 x, char y); @@ -10023,8 +9754,6 @@ char8 __ovld __cnfn min(char8 x, char y); uchar8 __ovld __cnfn min(uchar8 x, uchar y); char16 __ovld __cnfn min(char16 x, char y); uchar16 __ovld __cnfn min(uchar16 x, uchar y); -short __ovld __cnfn min(short x, short y); -ushort __ovld __cnfn min(ushort x, ushort y); short2 __ovld __cnfn min(short2 x, short y); ushort2 __ovld __cnfn min(ushort2 x, ushort y); short3 __ovld __cnfn min(short3 x, short y); @@ -10035,8 +9764,6 @@ short8 __ovld __cnfn min(short8 x, short y); ushort8 __ovld __cnfn min(ushort8 x, ushort y); short16 __ovld __cnfn min(short16 x, short y); ushort16 __ovld __cnfn min(ushort16 x, ushort y); -int __ovld __cnfn min(int x, int y); -uint __ovld __cnfn min(uint x, uint y); int2 __ovld __cnfn min(int2 x, int y); uint2 __ovld __cnfn min(uint2 x, uint y); int3 __ovld __cnfn min(int3 x, int y); @@ -10047,8 +9774,6 @@ int8 __ovld __cnfn min(int8 x, int y); uint8 __ovld __cnfn min(uint8 x, uint y); int16 __ovld __cnfn min(int16 x, int y); uint16 __ovld __cnfn min(uint16 x, uint y); -long __ovld __cnfn min(long x, long y); -ulong __ovld __cnfn min(ulong x, ulong y); long2 __ovld __cnfn min(long2 x, long y); ulong2 __ovld __cnfn min(ulong2 x, ulong y); long3 __ovld __cnfn min(long3 x, long y); @@ -10627,7 +10352,6 @@ half3 __ovld __cnfn step(half3 edge, half3 x); half4 __ovld __cnfn step(half4 edge, half4 x); half8 __ovld __cnfn step(half8 edge, half8 x); half16 __ovld __cnfn step(half16 edge, half16 x); -half __ovld __cnfn step(half edge, half x); half2 __ovld __cnfn step(half edge, half2 x); half3 __ovld __cnfn step(half edge, half3 x); half4 __ovld __cnfn step(half edge, half4 x); @@ -10679,7 +10403,6 @@ half3 __ovld __cnfn smoothstep(half3 edge0, half3 edge1, half3 x); half4 __ovld __cnfn smoothstep(half4 edge0, half4 edge1, half4 x); half8 __ovld __cnfn smoothstep(half8 edge0, half8 edge1, half8 x); half16 __ovld __cnfn smoothstep(half16 edge0, half16 edge1, half16 x); -half __ovld __cnfn smoothstep(half edge0, half edge1, half x); half2 __ovld __cnfn smoothstep(half edge0, half edge1, half2 x); half3 __ovld __cnfn smoothstep(half edge0, half edge1, half3 x); half4 __ovld __cnfn smoothstep(half edge0, half edge1, half4 x); @@ -12777,30 +12500,6 @@ void __ovld vstorea_half16_rtn(double16 data,size_t offset, __private half *p); // OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions -// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence -typedef uint cl_mem_fence_flags; - -/** - * Queue a memory fence to ensure correct - * ordering of memory operations to local memory - */ -#define CLK_LOCAL_MEM_FENCE 0x01 - -/** - * Queue a memory fence to ensure correct - * ordering of memory 
operations to global memory - */ -#define CLK_GLOBAL_MEM_FENCE 0x02 - -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -/** - * Queue a memory fence to ensure correct ordering of memory - * operations between work-items of a work-group to - * image memory. - */ -#define CLK_IMAGE_MEM_FENCE 0x04 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 - /** * All work-items in a work-group executing the kernel * on a processor must execute this function before any @@ -12834,17 +12533,6 @@ typedef uint cl_mem_fence_flags; void __ovld __conv barrier(cl_mem_fence_flags flags); #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 - -typedef enum memory_scope { - memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, - memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, - memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, - memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, -#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) - memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP -#endif -} memory_scope; - void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope); void __ovld __conv work_group_barrier(cl_mem_fence_flags flags); #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -13341,6 +13029,10 @@ int __ovld atomic_add(volatile __global int *p, int val); unsigned int __ovld atomic_add(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_add(volatile __local int *p, int val); unsigned int __ovld atomic_add(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_add(volatile int *p, int val); +unsigned int __ovld atomic_add(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_add(volatile __global int *p, int val); @@ -13367,6 +13059,10 @@ int __ovld atomic_sub(volatile __global int *p, int val); unsigned int __ovld atomic_sub(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_sub(volatile __local int *p, int val); unsigned int __ovld atomic_sub(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_sub(volatile int *p, int val); +unsigned int __ovld atomic_sub(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_sub(volatile __global int *p, int val); @@ -13395,6 +13091,11 @@ int __ovld atomic_xchg(volatile __local int *p, int val); unsigned int __ovld atomic_xchg(volatile __local unsigned int *p, unsigned int val); float __ovld atomic_xchg(volatile __global float *p, float val); float __ovld atomic_xchg(volatile __local float *p, float val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_xchg(volatile int *p, int val); +unsigned int __ovld atomic_xchg(volatile unsigned int *p, unsigned int val); +float __ovld atomic_xchg(volatile float *p, float val); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_xchg(volatile __global int *p, int val); @@ -13422,6 +13123,10 @@ int __ovld atomic_inc(volatile __global int *p); unsigned int __ovld atomic_inc(volatile __global unsigned int *p); int __ovld atomic_inc(volatile __local int *p); unsigned int __ovld atomic_inc(volatile __local unsigned int *p); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_inc(volatile int *p); +unsigned int __ovld atomic_inc(volatile unsigned int *p); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_inc(volatile __global int *p); @@ -13449,6 +13154,10 @@ int __ovld 
atomic_dec(volatile __global int *p); unsigned int __ovld atomic_dec(volatile __global unsigned int *p); int __ovld atomic_dec(volatile __local int *p); unsigned int __ovld atomic_dec(volatile __local unsigned int *p); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_dec(volatile int *p); +unsigned int __ovld atomic_dec(volatile unsigned int *p); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_dec(volatile __global int *p); @@ -13477,6 +13186,10 @@ int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val); unsigned int __ovld atomic_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val); int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val); unsigned int __ovld atomic_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_cmpxchg(volatile int *p, int cmp, int val); +unsigned int __ovld atomic_cmpxchg(volatile unsigned int *p, unsigned int cmp, unsigned int val); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val); @@ -13505,6 +13218,10 @@ int __ovld atomic_min(volatile __global int *p, int val); unsigned int __ovld atomic_min(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_min(volatile __local int *p, int val); unsigned int __ovld atomic_min(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_min(volatile int *p, int val); +unsigned int __ovld atomic_min(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_min(volatile __global int *p, int val); @@ -13533,6 +13250,10 @@ int __ovld atomic_max(volatile __global int *p, int val); unsigned int __ovld atomic_max(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_max(volatile __local int *p, int val); unsigned int __ovld atomic_max(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_max(volatile int *p, int val); +unsigned int __ovld atomic_max(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_max(volatile __global int *p, int val); @@ -13560,6 +13281,10 @@ int __ovld atomic_and(volatile __global int *p, int val); unsigned int __ovld atomic_and(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_and(volatile __local int *p, int val); unsigned int __ovld atomic_and(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_and(volatile int *p, int val); +unsigned int __ovld atomic_and(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_and(volatile __global int *p, int val); @@ -13587,6 +13312,10 @@ int __ovld atomic_or(volatile __global int *p, int val); unsigned int __ovld atomic_or(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_or(volatile __local int *p, int val); unsigned int __ovld atomic_or(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_or(volatile int *p, int val); +unsigned int __ovld atomic_or(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_or(volatile __global int *p, int val); @@ -13614,6 +13343,10 @@ int __ovld atomic_xor(volatile __global int *p, int val); unsigned 
int __ovld atomic_xor(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_xor(volatile __local int *p, int val); unsigned int __ovld atomic_xor(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_xor(volatile int *p, int val); +unsigned int __ovld atomic_xor(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_xor(volatile __global int *p, int val); @@ -13639,20 +13372,6 @@ unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long v // OpenCL v2.0 s6.13.11 - Atomics Functions #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -#ifndef ATOMIC_VAR_INIT -#define ATOMIC_VAR_INIT(x) (x) -#endif //ATOMIC_VAR_INIT -#define ATOMIC_FLAG_INIT 0 - -// enum values aligned with what clang uses in EmitAtomicExpr() -typedef enum memory_order -{ - memory_order_relaxed = __ATOMIC_RELAXED, - memory_order_acquire = __ATOMIC_ACQUIRE, - memory_order_release = __ATOMIC_RELEASE, - memory_order_acq_rel = __ATOMIC_ACQ_REL, - memory_order_seq_cst = __ATOMIC_SEQ_CST -} memory_order; // double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) @@ -14470,33 +14189,11 @@ half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask); #if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 // OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf -int printf(__constant const char* st, ...); +int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); #endif // OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions -// These values need to match the runtime equivalent -// -// Addressing Mode. -// -#define CLK_ADDRESS_NONE 0 -#define CLK_ADDRESS_CLAMP_TO_EDGE 2 -#define CLK_ADDRESS_CLAMP 4 -#define CLK_ADDRESS_REPEAT 6 -#define CLK_ADDRESS_MIRRORED_REPEAT 8 - -// -// Coordination Normalization -// -#define CLK_NORMALIZED_COORDS_FALSE 0 -#define CLK_NORMALIZED_COORDS_TRUE 1 - -// -// Filtering Mode. 
-// -#define CLK_FILTER_NEAREST 0x10 -#define CLK_FILTER_LINEAR 0x20 - #ifdef cl_khr_gl_msaa_sharing #pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable #endif //cl_khr_gl_msaa_sharing @@ -14712,30 +14409,6 @@ float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY); uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY); -float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod); -int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float lod); -uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float lod); - -float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); -int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); -uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); - -float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float lod); -int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float lod); -uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float lod); - -float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float lod); - -float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); -int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); -uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); - -float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod); - -float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float lod); -int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float lod); -uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod); - #endif //cl_khr_mipmap_image #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -14895,29 +14568,6 @@ float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY); uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY); -float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod); -int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod); -uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float lod); - -float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); -int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); -uint4 __purefn __ovld read_imageui(read_write 
image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); - -float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float lod); -int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float lod); -uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float lod); - -float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float lod); - -float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); -int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); -uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); - -float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod); - -float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float lod); -int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod); -uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod); #endif //cl_khr_mipmap_image #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -15332,26 +14982,6 @@ int __ovld get_image_num_mip_levels(read_write image2d_depth_t image); * CLK_FLOAT */ -// -// Channel Datatype. -// -#define CLK_SNORM_INT8 0x10D0 -#define CLK_SNORM_INT16 0x10D1 -#define CLK_UNORM_INT8 0x10D2 -#define CLK_UNORM_INT16 0x10D3 -#define CLK_UNORM_SHORT_565 0x10D4 -#define CLK_UNORM_SHORT_555 0x10D5 -#define CLK_UNORM_INT_101010 0x10D6 -#define CLK_SIGNED_INT8 0x10D7 -#define CLK_SIGNED_INT16 0x10D8 -#define CLK_SIGNED_INT32 0x10D9 -#define CLK_UNSIGNED_INT8 0x10DA -#define CLK_UNSIGNED_INT16 0x10DB -#define CLK_UNSIGNED_INT32 0x10DC -#define CLK_HALF_FLOAT 0x10DD -#define CLK_FLOAT 0x10DE -#define CLK_UNORM_INT24 0x10DF - int __ovld __cnfn get_image_channel_data_type(read_only image1d_t image); int __ovld __cnfn get_image_channel_data_type(read_only image1d_buffer_t image); int __ovld __cnfn get_image_channel_data_type(read_only image2d_t image); @@ -15423,30 +15053,6 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_dept * CLK_INTENSITY * CLK_LUMINANCE */ -// Channel order, numbering must be aligned with cl_channel_order in cl.h -// -#define CLK_R 0x10B0 -#define CLK_A 0x10B1 -#define CLK_RG 0x10B2 -#define CLK_RA 0x10B3 -#define CLK_RGB 0x10B4 -#define CLK_RGBA 0x10B5 -#define CLK_BGRA 0x10B6 -#define CLK_ARGB 0x10B7 -#define CLK_INTENSITY 0x10B8 -#define CLK_LUMINANCE 0x10B9 -#define CLK_Rx 0x10BA -#define CLK_RGx 0x10BB -#define CLK_RGBx 0x10BC -#define CLK_DEPTH 0x10BD -#define CLK_DEPTH_STENCIL 0x10BE -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -#define CLK_sRGB 0x10BF -#define CLK_sRGBx 0x10C0 -#define CLK_sRGBA 0x10C1 -#define CLK_sBGRA 0x10C2 -#define CLK_ABGR 0x10C3 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 int __ovld __cnfn get_image_channel_order(read_only image1d_t image); int __ovld __cnfn get_image_channel_order(read_only image1d_buffer_t image); @@ -15605,20 +15211,17 @@ size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_depth_t #if defined(cl_khr_gl_msaa_sharing) int __ovld get_image_num_samples(read_only image2d_msaa_t image); int __ovld get_image_num_samples(read_only image2d_msaa_depth_t image); 
-int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image); int __ovld get_image_num_samples(read_only image2d_array_msaa_t image); int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image); int __ovld get_image_num_samples(write_only image2d_msaa_t image); int __ovld get_image_num_samples(write_only image2d_msaa_depth_t image); -int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image); int __ovld get_image_num_samples(write_only image2d_array_msaa_t image); int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image); #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 int __ovld get_image_num_samples(read_write image2d_msaa_t image); int __ovld get_image_num_samples(read_write image2d_msaa_depth_t image); -int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image); int __ovld get_image_num_samples(read_write image2d_array_msaa_t image); int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image); #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -15728,7 +15331,6 @@ double __ovld __conv work_group_scan_inclusive_max(double x); // OpenCL v2.0 s6.13.16 - Pipe Functions #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t)) bool __ovld is_valid_reserve_id(reserve_id_t reserve_id); #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -15736,44 +15338,6 @@ bool __ovld is_valid_reserve_id(reserve_id_t reserve_id); // OpenCL v2.0 s6.13.17 - Enqueue Kernels #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -#define CL_COMPLETE 0x0 -#define CL_RUNNING 0x1 -#define CL_SUBMITTED 0x2 -#define CL_QUEUED 0x3 - -#define CLK_SUCCESS 0 -#define CLK_ENQUEUE_FAILURE -101 -#define CLK_INVALID_QUEUE -102 -#define CLK_INVALID_NDRANGE -160 -#define CLK_INVALID_EVENT_WAIT_LIST -57 -#define CLK_DEVICE_QUEUE_FULL -161 -#define CLK_INVALID_ARG_SIZE -51 -#define CLK_EVENT_ALLOCATION_FAILURE -100 -#define CLK_OUT_OF_RESOURCES -5 - -#define CLK_NULL_QUEUE 0 -#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t)) - -// execution model related definitions -#define CLK_ENQUEUE_FLAGS_NO_WAIT 0x0 -#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0x1 -#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 0x2 - -typedef int kernel_enqueue_flags_t; -typedef int clk_profiling_info; - -// Profiling info name (see capture_event_profiling_info) -#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1 - -#define MAX_WORK_DIM 3 - -typedef struct { - unsigned int workDimension; - size_t globalWorkOffset[MAX_WORK_DIM]; - size_t globalWorkSize[MAX_WORK_DIM]; - size_t localWorkSize[MAX_WORK_DIM]; -} ndrange_t; - ndrange_t __ovld ndrange_1D(size_t); ndrange_t __ovld ndrange_1D(size_t, size_t); ndrange_t __ovld ndrange_1D(size_t, size_t, size_t); @@ -16216,138 +15780,6 @@ void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, u #ifdef cl_intel_device_side_avc_motion_estimation #pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin -#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0 -#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1 -#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2 -#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3 - -#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0 -#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1 -#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2 -#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3 - -#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0 -#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 -#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 - -#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 
-#define CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E -#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D -#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B -#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 -#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F -#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F -#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F - -#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 -#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 -#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 - -#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 -#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 -#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 -#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 -#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 -#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 -#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 -#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 -#define CLK_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 - -#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 -#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 - -#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 -#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 -#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 - -#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 -#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 -#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 -#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 - -#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 -#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 -#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 -#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B -#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 - -#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 -#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 -#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 -#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 - -#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0 -#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1 -#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2 - -#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 -#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 - -#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1 << 26) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2 << 26) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1 << 28) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2 << 28) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1 << 30) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2 << 30) - -#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 -#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 - -#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0 -#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 -#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 -#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 - -#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 -#define 
CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 -#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 -#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 - -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 -#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 -#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 -#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 - -#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1 -#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2 -#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3 - -#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 -#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 - -#define CLK_AVC_ME_INITIALIZE_INTEL 0x0 - -#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL 0x0 -#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL 0x0 -#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL 0x0 - -#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL 0x0 -#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL 0x0 -#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL 0x0 - -#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 -#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 -#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 -#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 - // MCE built-in functions uchar __ovld intel_sub_group_avc_mce_get_default_inter_base_multi_reference_penalty( @@ -17034,6 +16466,34 @@ uint8 __ovld amd_sadw(uint8 src0, uint8 src1, uint8 src2); uint16 __ovld amd_sadw(uint16 src0, uint16 src1, uint16 src2); #endif // cl_amd_media_ops2 +#if defined(cl_arm_integer_dot_product_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : begin +uint __ovld arm_dot(uchar4 a, uchar4 b); +int __ovld arm_dot(char4 a, char4 b); +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : end +#endif // defined(cl_arm_integer_dot_product_int8) + +#if defined(cl_arm_integer_dot_product_accumulate_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : begin +uint __ovld arm_dot_acc(uchar4 a, uchar4 b, uint c); +int __ovld arm_dot_acc(char4 a, char4 b, int c); +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : end +#endif // defined(cl_arm_integer_dot_product_accumulate_int8) + +#if defined(cl_arm_integer_dot_product_accumulate_int16) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int16 : begin +uint __ovld arm_dot_acc(ushort2 a, ushort2 b, uint c); +int __ovld arm_dot_acc(short2 a, short2 b, int c); +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int16 : end +#endif // defined(cl_arm_integer_dot_product_accumulate_int16) + +#if defined(cl_arm_integer_dot_product_accumulate_saturate_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_saturate_int8 : begin +uint __ovld arm_dot_acc_sat(uchar4 a, uchar4 b, uint c); +int __ovld 
arm_dot_acc_sat(char4 a, char4 b, int c); +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_saturate_int8 : end +#endif // defined(cl_arm_integer_dot_product_accumulate_saturate_int8) + // Disable any extensions we may have enabled previously. #pragma OPENCL EXTENSION all : disable diff --git a/lib/include/openmp_wrappers/__clang_openmp_math.h b/lib/include/openmp_wrappers/__clang_openmp_math.h new file mode 100644 index 000000000..5d7ce9a96 --- /dev/null +++ b/lib/include/openmp_wrappers/__clang_openmp_math.h @@ -0,0 +1,35 @@ +/*===---- __clang_openmp_math.h - OpenMP target math support ---------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if defined(__NVPTX__) && defined(_OPENMP) +/// TODO: +/// We are currently reusing the functionality of the Clang-CUDA code path +/// as an alternative to the host declarations provided by math.h and cmath. +/// This is suboptimal. +/// +/// We should instead declare the device functions in a similar way, e.g., +/// through OpenMP 5.0 variants, and afterwards populate the module with the +/// host declarations by unconditionally including the host math.h or cmath, +/// respectively. This is actually what the Clang-CUDA code path does, using +/// __device__ instead of variants to avoid redeclarations and get the desired +/// overload resolution. + +#define __CUDA__ + +#if defined(__cplusplus) + #include <__clang_cuda_cmath.h> +#endif + +#undef __CUDA__ + +/// Magic macro for stopping the math.h/cmath host header from being included. +#define __CLANG_NO_HOST_MATH__ + +#endif + diff --git a/lib/include/openmp_wrappers/__clang_openmp_math_declares.h b/lib/include/openmp_wrappers/__clang_openmp_math_declares.h new file mode 100644 index 000000000..a422c98bf --- /dev/null +++ b/lib/include/openmp_wrappers/__clang_openmp_math_declares.h @@ -0,0 +1,33 @@ +/*===---- __clang_openmp_math_declares.h - OpenMP math declares ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_OPENMP_MATH_DECLARES_H__ +#define __CLANG_OPENMP_MATH_DECLARES_H__ + +#ifndef _OPENMP +#error "This file is for OpenMP compilation only." +#endif + +#if defined(__NVPTX__) && defined(_OPENMP) + +#define __CUDA__ + +#if defined(__cplusplus) + #include <__clang_cuda_math_forward_declares.h> +#endif + +/// Include declarations for libdevice functions. +#include <__clang_cuda_libdevice_declares.h> +/// Provide definitions for these functions. +#include <__clang_cuda_device_functions.h> + +#undef __CUDA__ + +#endif +#endif diff --git a/lib/include/openmp_wrappers/cmath b/lib/include/openmp_wrappers/cmath new file mode 100644 index 000000000..a5183a1d8 --- /dev/null +++ b/lib/include/openmp_wrappers/cmath @@ -0,0 +1,16 @@ +/*===-------------- cmath - Alternative cmath header -----------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include <__clang_openmp_math.h> + +#ifndef __CLANG_NO_HOST_MATH__ +#include_next <cmath> +#else +#undef __CLANG_NO_HOST_MATH__ +#endif diff --git a/lib/include/openmp_wrappers/math.h b/lib/include/openmp_wrappers/math.h new file mode 100644 index 000000000..d2786ecb2 --- /dev/null +++ b/lib/include/openmp_wrappers/math.h @@ -0,0 +1,17 @@ +/*===------------- math.h - Alternative math.h header ----------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include <__clang_openmp_math.h> + +#ifndef __CLANG_NO_HOST_MATH__ +#include_next <math.h> +#else +#undef __CLANG_NO_HOST_MATH__ +#endif + diff --git a/lib/include/pconfigintrin.h b/lib/include/pconfigintrin.h index fee3cad38..d2014b026 100644 --- a/lib/include/pconfigintrin.h +++ b/lib/include/pconfigintrin.h @@ -1,22 +1,8 @@ /*===---- pconfigintrin.h - X86 platform configuration ---------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -30,6 +16,8 @@ #define __PCONFIG_KEY_PROGRAM 0x00000001 +#if __has_extension(gnu_asm) + /* Define the default attributes for the functions in this file.
*/ #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("pconfig"))) @@ -47,4 +35,6 @@ _pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) #undef __DEFAULT_FN_ATTRS +#endif /* __has_extension(gnu_asm) */ + #endif diff --git a/lib/include/pkuintrin.h b/lib/include/pkuintrin.h index 6976924d8..c62080bec 100644 --- a/lib/include/pkuintrin.h +++ b/lib/include/pkuintrin.h @@ -1,23 +1,9 @@ /*===---- pkuintrin.h - PKU intrinsics -------------------------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/pmmintrin.h b/lib/include/pmmintrin.h index 7e1a9eae5..c376f298c 100644 --- a/lib/include/pmmintrin.h +++ b/lib/include/pmmintrin.h @@ -1,22 +1,8 @@ /*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/popcntintrin.h b/lib/include/popcntintrin.h index 75ceab9e1..312901014 100644 --- a/lib/include/popcntintrin.h +++ b/lib/include/popcntintrin.h @@ -1,22 +1,8 @@ /*===---- popcntintrin.h - POPCNT intrinsics -------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -43,22 +29,6 @@ _mm_popcnt_u32(unsigned int __A) return __builtin_popcount(__A); } -/// Counts the number of bits in the source operand having a value of 1. -/// -/// \headerfile <x86intrin.h> -/// -/// This intrinsic corresponds to the POPCNT instruction. -/// -/// \param __A -/// A signed 32-bit integer operand. -/// \returns A 32-bit integer containing the number of bits with value 1 in the -/// source operand. -static __inline__ int __DEFAULT_FN_ATTRS -_popcnt32(int __A) -{ - return __builtin_popcount(__A); -} - #ifdef __x86_64__ /// Counts the number of bits in the source operand having a value of 1. /// @@ -75,22 +45,6 @@ _mm_popcnt_u64(unsigned long long __A) { return __builtin_popcountll(__A); } - -/// Counts the number of bits in the source operand having a value of 1. -/// -/// \headerfile <x86intrin.h> -/// -/// This intrinsic corresponds to the POPCNT instruction. -/// -/// \param __A -/// A signed 64-bit integer operand. -/// \returns A 64-bit integer containing the number of bits with value 1 in the -/// source operand. -static __inline__ long long __DEFAULT_FN_ATTRS -_popcnt64(long long __A) -{ - return __builtin_popcountll(__A); -} #endif /* __x86_64__ */ #undef __DEFAULT_FN_ATTRS diff --git a/lib/include/ppc_wrappers/emmintrin.h b/lib/include/ppc_wrappers/emmintrin.h new file mode 100644 index 000000000..617ce24ac --- /dev/null +++ b/lib/include/ppc_wrappers/emmintrin.h @@ -0,0 +1,2318 @@ +/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header file is to help porting code using Intel intrinsics + explicitly from x86_64 to powerpc64/powerpc64le. + + Since X86 SSE2 intrinsics mainly handles __m128i and __m128d type, + PowerPC VMX/VSX ISA is a good match for vector float SIMD operations. + However scalar float operations in vector (XMM) registers require + the POWER8 VSX ISA (2.07) level. There are differences for data + format and placement of float scalars in the vector register, which + require extra steps to match SSE2 scalar float semantics on POWER. + + It should be noted that there's much difference between X86_64's + MXSCR and PowerISA's FPSCR/VSCR registers. It's recommended to use + portable <fenv.h> instead of access MXSCR directly. + + Most SSE2 scalar float intrinsic operations can be performed more + efficiently as C language float scalar operations or optimized to + use vector SIMD operations. We recommend this for new applications. +*/ +#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." +#endif + +#ifndef EMMINTRIN_H_ +#define EMMINTRIN_H_ + +#include <altivec.h> + +/* We need definitions from the SSE header files. */ +#include <xmmintrin.h> + +/* SSE2 */ +typedef __vector double __v2df; +typedef __vector long long __v2di; +typedef __vector unsigned long long __v2du; +typedef __vector int __v4si; +typedef __vector unsigned int __v4su; +typedef __vector short __v8hi; +typedef __vector unsigned short __v8hu; +typedef __vector signed char __v16qi; +typedef __vector unsigned char __v16qu; + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); + +/* Unaligned version of the same types. */ +typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); +typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); + +/* Define two value permute mask. */ +#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y)) + +/* Create a vector with element 0 as F and the rest zero. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_sd (double __F) +{ + return __extension__ (__m128d){ __F, 0.0 }; +} + +/* Create a vector with both elements equal to F. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pd (double __F) +{ + return __extension__ (__m128d){ __F, __F }; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pd1 (double __F) +{ + return _mm_set1_pd (__F); +} + +/* Create a vector with the lower value X and upper value W. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pd (double __W, double __X) +{ + return __extension__ (__m128d){ __X, __W }; +} + +/* Create a vector with the lower value W and upper value X.
*/ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pd (double __W, double __X) +{ + return __extension__ (__m128d){ __W, __X }; +} + +/* Create an undefined vector. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_pd (void) +{ + __m128d __Y = __Y; + return __Y; +} + +/* Create a vector of zeros. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_pd (void) +{ + return (__m128d) vec_splats (0); +} + +/* Sets the low DPFP value of A from the low value of B. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_sd (__m128d __A, __m128d __B) +{ + __v2df result = (__v2df) __A; + result [0] = ((__v2df) __B)[0]; + return (__m128d) result; +} + +/* Load two DPFP values from P. The address must be 16-byte aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_pd (double const *__P) +{ + return ((__m128d)vec_ld(0, (__v16qu*)__P)); +} + +/* Load two DPFP values from P. The address need not be 16-byte aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_pd (double const *__P) +{ + return (vec_vsx_ld(0, __P)); +} + +/* Create a vector with all two elements equal to *P. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load1_pd (double const *__P) +{ + return (vec_splats (*__P)); +} + +/* Create a vector with element 0 as *P and the rest zero. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_sd (double const *__P) +{ + return _mm_set_sd (*__P); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_pd1 (double const *__P) +{ + return _mm_load1_pd (__P); +} + +/* Load two DPFP values in reverse order. The address must be aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadr_pd (double const *__P) +{ + __v2df __tmp = _mm_load_pd (__P); + return (__m128d)vec_xxpermdi (__tmp, __tmp, 2); +} + +/* Store two DPFP values. The address must be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_pd (double *__P, __m128d __A) +{ + vec_st((__v16qu)__A, 0, (__v16qu*)__P); +} + +/* Store two DPFP values. The address need not be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_pd (double *__P, __m128d __A) +{ + *(__m128d_u *)__P = __A; +} + +/* Stores the lower DPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_sd (double *__P, __m128d __A) +{ + *__P = ((__v2df)__A)[0]; +} + +extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_f64 (__m128d __A) +{ + return ((__v2df)__A)[0]; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_pd (double *__P, __m128d __A) +{ + _mm_store_sd (__P, __A); +} + +/* Stores the upper DPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeh_pd (double *__P, __m128d __A) +{ + *__P = ((__v2df)__A)[1]; +} +/* Store the lower DPFP value across two words. 
+ The address must be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store1_pd (double *__P, __m128d __A) +{ + _mm_store_pd (__P, vec_splat (__A, 0)); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_pd1 (double *__P, __m128d __A) +{ + _mm_store1_pd (__P, __A); +} + +/* Store two DPFP values in reverse order. The address must be aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storer_pd (double *__P, __m128d __A) +{ + _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2)); +} + +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si64 (__m128i __A) +{ + return ((__v2di)__A)[0]; +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si64x (__m128i __A) +{ + return ((__v2di)__A)[0]; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A + (__v2df)__B); +} + +/* Add the lower double-precision (64-bit) floating-point element in + a and b, store the result in the lower element of dst, and copy + the upper element from a to the upper element of dst. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_sd (__m128d __A, __m128d __B) +{ + __A[0] = __A[0] + __B[0]; + return (__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A - (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_sd (__m128d __A, __m128d __B) +{ + __A[0] = __A[0] - __B[0]; + return (__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A * (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_sd (__m128d __A, __m128d __B) +{ + __A[0] = __A[0] * __B[0]; + return (__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A / (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_sd (__m128d __A, __m128d __B) +{ + __A[0] = __A[0] / __B[0]; + return (__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_pd (__m128d __A) +{ + return (vec_sqrt (__A)); +} + +/* Return pair {sqrt (B[0]), A[1]}. 
*/ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_sd (__m128d __A, __m128d __B) +{ + __v2df c; + c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0])); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pd (__m128d __A, __m128d __B) +{ + return (vec_min (__A, __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = vec_min (a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_pd (__m128d __A, __m128d __B) +{ + return (vec_max (__A, __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = vec_max (a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_pd (__m128d __A, __m128d __B) +{ + __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); + return ((__m128d)vec_nor (temp, temp)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_pd (__m128d __A, __m128d __B) +{ +#if _ARCH_PWR8 + __v2du c, d; + /* Compare against self will return false (0's) if NAN. 
*/ + c = (__v2du)vec_cmpeq (__A, __A); + d = (__v2du)vec_cmpeq (__B, __B); +#else + __v2du a, b; + __v2du c, d; + const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000}; + a = (__v2du)vec_abs ((__v2df)__A); + b = (__v2du)vec_abs ((__v2df)__B); + c = (__v2du)vec_cmpgt (double_exp_mask, a); + d = (__v2du)vec_cmpgt (double_exp_mask, b); +#endif + /* A != NAN and B != NAN. */ + return ((__m128d)vec_and(c, d)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_pd (__m128d __A, __m128d __B) +{ +#if _ARCH_PWR8 + __v2du c, d; + /* Compare against self will return false (0's) if NAN. */ + c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); + d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); + /* A == NAN OR B == NAN converts too: + NOT(A != NAN) OR NOT(B != NAN). */ + c = vec_nor (c, c); + return ((__m128d)vec_orc(c, d)); +#else + __v2du c, d; + /* Compare against self will return false (0's) if NAN. */ + c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); + d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); + /* Convert the true ('1's) is NAN. */ + c = vec_nor (c, c); + d = vec_nor (d, d); + return ((__m128d)vec_or(c, d)); +#endif +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_sd(__m128d __A, __m128d __B) +{ + __v2df a, b, c; + /* PowerISA VSX does not allow partial (for just lower double) + results. So to insure we don't generate spurious exceptions + (from the upper double values) we splat the lower double + before we do the operation. */ + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmpeq(a, b); + /* Then we merge the lower double result with the original upper + double from __A. */ + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmplt(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmple(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmpgt(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmpge(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmpeq(a, b); + c = vec_nor (c, c); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + /* Not less than is just greater than or 
equal. */ + c = (__v2df) vec_cmpge(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + /* Not less than or equal is just greater than. */ + c = (__v2df) vec_cmpge(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + /* Not greater than is just less than or equal. */ + c = (__v2df) vec_cmple(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + /* Not greater than or equal is just less than. */ + c = (__v2df) vec_cmplt(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_sd (__m128d __A, __m128d __B) +{ + __v2df r; + r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0])); + return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_sd (__m128d __A, __m128d __B) +{ + __v2df r; + r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0])); + return (__m128d) _mm_setr_pd (r[0], __A[1]); +} + +/* FIXME + The __mm_comi??_sd and __mm_ucomi??_sd implementations below are + exactly the same because GCC for PowerPC only generates unordered + compares (scalar and vector). + Technically __mm_comieq_sp et all should be using the ordered + compare and signal for QNaNs. The __mm_ucomieq_sd et all should + be OK. 
*/ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comieq_sd (__m128d __A, __m128d __B) +{ + return (__A[0] == __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comilt_sd (__m128d __A, __m128d __B) +{ + return (__A[0] < __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comile_sd (__m128d __A, __m128d __B) +{ + return (__A[0] <= __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comigt_sd (__m128d __A, __m128d __B) +{ + return (__A[0] > __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comige_sd (__m128d __A, __m128d __B) +{ + return (__A[0] >= __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comineq_sd (__m128d __A, __m128d __B) +{ + return (__A[0] != __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomieq_sd (__m128d __A, __m128d __B) +{ + return (__A[0] == __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomilt_sd (__m128d __A, __m128d __B) +{ + return (__A[0] < __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomile_sd (__m128d __A, __m128d __B) +{ + return (__A[0] <= __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomigt_sd (__m128d __A, __m128d __B) +{ + return (__A[0] > __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomige_sd (__m128d __A, __m128d __B) +{ + return (__A[0] >= __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomineq_sd (__m128d __A, __m128d __B) +{ + return (__A[0] != __B[0]); +} + +/* Create a vector of Qi, where i is the element number. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi64x (long long __q1, long long __q0) +{ + return __extension__ (__m128i)(__v2di){ __q0, __q1 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi64 (__m64 __q1, __m64 __q0) +{ + return _mm_set_epi64x ((long long)__q1, (long long)__q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) +{ + return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, + short __q3, short __q2, short __q1, short __q0) +{ + return __extension__ (__m128i)(__v8hi){ + __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return __extension__ (__m128i)(__v16qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 + }; +} + +/* Set all of the elements of the vector to A. 
*/ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi64x (long long __A) +{ + return _mm_set_epi64x (__A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi64 (__m64 __A) +{ + return _mm_set_epi64 (__A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi32 (int __A) +{ + return _mm_set_epi32 (__A, __A, __A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi16 (short __A) +{ + return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi8 (char __A) +{ + return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +/* Create a vector of Qi, where i is the element number. + The parameter order is reversed from the _mm_set_epi* functions. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi64 (__m64 __q0, __m64 __q1) +{ + return _mm_set_epi64 (__q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3) +{ + return _mm_set_epi32 (__q3, __q2, __q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, + short __q4, short __q5, short __q6, short __q7) +{ + return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, + char __q04, char __q05, char __q06, char __q07, + char __q08, char __q09, char __q10, char __q11, + char __q12, char __q13, char __q14, char __q15) +{ + return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, + __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); +} + +/* Create a vector with element 0 as *P and the rest zero. 
*/ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_si128 (__m128i const *__P) +{ + return *__P; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si128 (__m128i_u const *__P) +{ + return (__m128i) (vec_vsx_ld(0, (signed int const *)__P)); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_epi64 (__m128i_u const *__P) +{ + return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_si128 (__m128i *__P, __m128i __B) +{ + vec_st ((__v16qu) __B, 0, (__v16qu*)__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si128 (__m128i_u *__P, __m128i __B) +{ + *__P = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_epi64 (__m128i_u *__P, __m128i __B) +{ + *(long long *)__P = ((__v2di)__B)[0]; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movepi64_pi64 (__m128i_u __B) +{ + return (__m64) ((__v2di)__B)[0]; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movpi64_epi64 (__m64 __A) +{ + return _mm_set_epi64 ((__m64)0LL, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_epi64 (__m128i __A) +{ + return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]); +} + +/* Create an undefined vector. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_si128 (void) +{ + __m128i __Y = __Y; + return __Y; +} + +/* Create a vector of zeros. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_si128 (void) +{ + return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_pd (__m128i __A) +{ + __v2di val; + /* For LE need to generate Vector Unpack Low Signed Word. + Which is generated from unpackh. */ + val = (__v2di)vec_unpackh ((__v4si)__A); + + return (__m128d)vec_ctf (val, 0); +} +#endif + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_ps (__m128i __A) +{ + return ((__m128)vec_ctf((__v4si)__A, 0)); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_epi32 (__m128d __A) +{ + __v2df rounded = vec_rint (__A); + __v4si result, temp; + const __v4si vzero = + { 0, 0, 0, 0 }; + + /* VSX Vector truncate Double-Precision to integer and Convert to + Signed Integer Word format with Saturate. 
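+     Note: any rounding is done by the vec_rint above, which rounds to
+     integral according to the current rounding mode (mirroring the x86
+     cvtpd2dq behavior); by the time xvcvdpsxws truncates, the values
+     are already integral, so the truncation is exact.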
*/
+  __asm__(
+      "xvcvdpsxws %x0,%x1"
+      : "=wa" (temp)
+      : "wa" (rounded)
+      : );
+
+#ifdef _ARCH_PWR8
+  temp = vec_mergeo (temp, temp);
+  result = (__v4si) vec_vpkudum ((__vector long long) temp,
+				 (__vector long long) vzero);
+#else
+  {
+    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+	  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
+    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
+  }
+#endif
+  return (__m128i) result;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_pi32 (__m128d __A)
+{
+  __m128i result = _mm_cvtpd_epi32(__A);
+
+  return (__m64) result[0];
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_ps (__m128d __A)
+{
+  __v4sf result;
+  __v4si temp;
+  const __v4si vzero = { 0, 0, 0, 0 };
+
+  __asm__(
+      "xvcvdpsp %x0,%x1"
+      : "=wa" (temp)
+      : "wa" (__A)
+      : );
+
+#ifdef _ARCH_PWR8
+  temp = vec_mergeo (temp, temp);
+  result = (__v4sf) vec_vpkudum ((__vector long long) temp,
+				 (__vector long long) vzero);
+#else
+  {
+    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+	  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
+    result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
+  }
+#endif
+  return ((__m128)result);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epi32 (__m128d __A)
+{
+  __v4si result;
+  __v4si temp;
+  const __v4si vzero = { 0, 0, 0, 0 };
+
+  /* VSX Vector truncate Double-Precision to integer and Convert to
+     Signed Integer Word format with Saturate. */
+  __asm__(
+      "xvcvdpsxws %x0,%x1"
+      : "=wa" (temp)
+      : "wa" (__A)
+      : );
+
+#ifdef _ARCH_PWR8
+  temp = vec_mergeo (temp, temp);
+  result = (__v4si) vec_vpkudum ((__vector long long) temp,
+				 (__vector long long) vzero);
+#else
+  {
+    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+	  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
+    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
+  }
+#endif
+
+  return ((__m128i) result);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_pi32 (__m128d __A)
+{
+  __m128i result = _mm_cvttpd_epi32 (__A);
+
+  return (__m64) result[0];
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si32 (__m128i __A)
+{
+  return ((__v4si)__A)[0];
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32_pd (__m64 __A)
+{
+  __v4si temp;
+  __v2di tmp2;
+  __v2df result;
+
+  temp = (__v4si)vec_splats (__A);
+  tmp2 = (__v2di)vec_unpackl (temp);
+  result = vec_ctf ((__vector signed long long) tmp2, 0);
+  return (__m128d)result;
+}
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epi32 (__m128 __A)
+{
+  __v4sf rounded;
+  __v4si result;
+
+  rounded = vec_rint((__v4sf) __A);
+  result = vec_cts (rounded, 0);
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epi32 (__m128 __A)
+{
+  __v4si result;
+
+  result = vec_cts ((__v4sf) __A, 0);
+  return (__m128i) result;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pd (__m128 __A)
+{
+  /* Check if vec_doubleh is defined by <altivec.h>. If so use that.
*/ +#ifdef vec_doubleh + return (__m128d) vec_doubleh ((__v4sf)__A); +#else + /* Otherwise the compiler is not current and so need to generate the + equivalent code. */ + __v4sf a = (__v4sf)__A; + __v4sf temp; + __v2df result; +#ifdef __LITTLE_ENDIAN__ + /* The input float values are in elements {[0], [1]} but the convert + instruction needs them in elements {[1], [3]}, So we use two + shift left double vector word immediates to get the elements + lined up. */ + temp = __builtin_vsx_xxsldwi (a, a, 3); + temp = __builtin_vsx_xxsldwi (a, temp, 2); +#else + /* The input float values are in elements {[0], [1]} but the convert + instruction needs them in elements {[0], [2]}, So we use two + shift left double vector word immediates to get the elements + lined up. */ + temp = vec_vmrghw (a, a); +#endif + __asm__( + " xvcvspdp %x0,%x1" + : "=wa" (result) + : "wa" (temp) + : ); + return (__m128d) result; +#endif +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si32 (__m128d __A) +{ + __v2df rounded = vec_rint((__v2df) __A); + int result = ((__v2df)rounded)[0]; + + return result; +} +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si64 (__m128d __A) +{ + __v2df rounded = vec_rint ((__v2df) __A ); + long long result = ((__v2df) rounded)[0]; + + return result; +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si64x (__m128d __A) +{ + return _mm_cvtsd_si64 ((__v2df)__A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si32 (__m128d __A) +{ + int result = ((__v2df)__A)[0]; + + return result; +} + +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si64 (__m128d __A) +{ + long long result = ((__v2df)__A)[0]; + + return result; +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si64x (__m128d __A) +{ + return _mm_cvttsd_si64 (__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_ss (__m128 __A, __m128d __B) +{ + __v4sf result = (__v4sf)__A; + +#ifdef __LITTLE_ENDIAN__ + __v4sf temp_s; + /* Copy double element[0] to element [1] for conversion. */ + __v2df temp_b = vec_splat((__v2df)__B, 0); + + /* Pre-rotate __A left 3 (logically right 1) elements. */ + result = __builtin_vsx_xxsldwi (result, result, 3); + /* Convert double to single float scalar in a vector. */ + __asm__( + "xscvdpsp %x0,%x1" + : "=wa" (temp_s) + : "wa" (temp_b) + : ); + /* Shift the resulting scalar into vector element [0]. */ + result = __builtin_vsx_xxsldwi (result, temp_s, 1); +#else + result [0] = ((__v2df)__B)[0]; +#endif + return (__m128) result; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_sd (__m128d __A, int __B) +{ + __v2df result = (__v2df)__A; + double db = __B; + result [0] = db; + return (__m128d)result; +} + +/* Intel intrinsic. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_sd (__m128d __A, long long __B) +{ + __v2df result = (__v2df)__A; + double db = __B; + result [0] = db; + return (__m128d)result; +} + +/* Microsoft intrinsic. 
*/ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_sd (__m128d __A, long long __B) +{ + return _mm_cvtsi64_sd (__A, __B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_sd (__m128d __A, __m128 __B) +{ +#ifdef __LITTLE_ENDIAN__ + /* Use splat to move element [0] into position for the convert. */ + __v4sf temp = vec_splat ((__v4sf)__B, 0); + __v2df res; + /* Convert single float scalar to double in a vector. */ + __asm__( + "xscvspdp %x0,%x1" + : "=wa" (res) + : "wa" (temp) + : ); + return (__m128d) vec_mergel (res, (__v2df)__A); +#else + __v2df res = (__v2df)__A; + res [0] = ((__v4sf)__B) [0]; + return (__m128d) res; +#endif +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) +{ + __vector double result; + const int litmsk = __mask & 0x3; + + if (litmsk == 0) + result = vec_mergeh (__A, __B); +#if __GNUC__ < 6 + else if (litmsk == 1) + result = vec_xxpermdi (__B, __A, 2); + else if (litmsk == 2) + result = vec_xxpermdi (__B, __A, 1); +#else + else if (litmsk == 1) + result = vec_xxpermdi (__A, __B, 2); + else if (litmsk == 2) + result = vec_xxpermdi (__A, __B, 1); +#endif + else + result = vec_mergel (__A, __B); + + return result; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pd (__m128d __A, __m128d __B) +{ + return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pd (__m128d __A, __m128d __B) +{ + return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadh_pd (__m128d __A, double const *__B) +{ + __v2df result = (__v2df)__A; + result [1] = *__B; + return (__m128d)result; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_pd (__m128d __A, double const *__B) +{ + __v2df result = (__v2df)__A; + result [0] = *__B; + return (__m128d)result; +} + +#ifdef _ARCH_PWR8 +/* Intrinsic functions that require PowerISA 2.07 minimum. */ + +/* Creates a 2-bit mask from the most significant bits of the DPFP values. 
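+   For illustration (hypothetical values): with
+   __A = _mm_setr_pd (-1.0, 2.0), bit 0 of the result comes from the
+   sign of element 0 and bit 1 from element 1, so the return value
+   is 0x1.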
*/ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_pd (__m128d __A) +{ + __vector unsigned long long result; + static const __vector unsigned int perm_mask = + { +#ifdef __LITTLE_ENDIAN__ + 0x80800040, 0x80808080, 0x80808080, 0x80808080 +#else + 0x80808080, 0x80808080, 0x80808080, 0x80804000 +#endif + }; + + result = ((__vector unsigned long long) + vec_vbpermq ((__vector unsigned char) __A, + (__vector unsigned char) perm_mask)); + +#ifdef __LITTLE_ENDIAN__ + return result[1]; +#else + return result[0]; +#endif +} +#endif /* _ARCH_PWR8 */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packus_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergel ((__vector long long) __A, + (__vector long long) __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergeh ((__vector long long) __A, + (__vector long long) __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qu)__A + (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hu)__A + (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4su)__A + (__v4su)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_add_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A + (__v2du)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qu)__A - (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hu)__A - (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4su)__A - (__v4su)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A - (__v2du)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd_epi16 (__m128i __A, __m128i __B) +{ + __vector signed int zero = {0, 0, 0, 0}; + + return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_epi16 (__m128i __A, __m128i __B) +{ + __vector signed int w0, w1; + + __vector unsigned char xform1 = { +#ifdef __LITTLE_ENDIAN__ + 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, + 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F +#else + 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, + 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D +#endif + }; + + w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B); + w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B); + return (__m128i) vec_perm (w0, w1, xform1); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hi)__A * (__v8hi)__B); +} + +extern __inline __m64 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_su32 (__m64 __A, __m64 __B) +{ + unsigned int a = __A; + unsigned int b = __B; + + return ((__m64)a * (__m64)b); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epu32 (__m128i __A, __m128i __B) +{ +#if __GNUC__ < 8 + __v2du result; + +#ifdef __LITTLE_ENDIAN__ + /* VMX Vector Multiply Odd Unsigned Word. */ + __asm__( + "vmulouw %0,%1,%2" + : "=v" (result) + : "v" (__A), "v" (__B) + : ); +#else + /* VMX Vector Multiply Even Unsigned Word. */ + __asm__( + "vmuleuw %0,%1,%2" + : "=v" (result) + : "v" (__A), "v" (__B) + : ); +#endif + return (__m128i) result; +#else + return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); +#endif +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi16 (__m128i __A, int __B) +{ + __v8hu lshift; + __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (__B >= 0 && __B < 16) + { + if (__builtin_constant_p(__B)) + lshift = (__v8hu) vec_splat_s16(__B); + else + lshift = vec_splats ((unsigned short) __B); + + result = vec_sl ((__v8hi) __A, lshift); + } + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi32 (__m128i __A, int __B) +{ + __v4su lshift; + __v4si result = { 0, 0, 0, 0 }; + + if (__B >= 0 && __B < 32) + { + if (__builtin_constant_p(__B) && __B < 16) + lshift = (__v4su) vec_splat_s32(__B); + else + lshift = vec_splats ((unsigned int) __B); + + result = vec_sl ((__v4si) __A, lshift); + } + + return (__m128i) result; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi64 (__m128i __A, int __B) +{ + __v2du lshift; + __v2di result = { 0, 0 }; + + if (__B >= 0 && __B < 64) + { + if (__builtin_constant_p(__B) && __B < 16) + lshift = (__v2du) vec_splat_s32(__B); + else + lshift = (__v2du) vec_splats ((unsigned int) __B); + + result = vec_sl ((__v2di) __A, lshift); + } + + return (__m128i) result; +} +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi16 (__m128i __A, int __B) +{ + __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hi result; + + if (__B < 16) + { + if (__builtin_constant_p(__B)) + rshift = (__v8hu) vec_splat_s16(__B); + else + rshift = vec_splats ((unsigned short) __B); + } + result = vec_sra ((__v8hi) __A, rshift); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi32 (__m128i __A, int __B) +{ + __v4su rshift = { 31, 31, 31, 31 }; + __v4si result; + + if (__B < 32) + { + if (__builtin_constant_p(__B)) + { + if (__B < 16) + rshift = (__v4su) vec_splat_s32(__B); + else + rshift = (__v4su) vec_splats((unsigned int)__B); + } + else + rshift = vec_splats ((unsigned int) __B); + } + result = vec_sra ((__v4si) __A, rshift); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bslli_si128 (__m128i __A, const int __N) +{ + __v16qu result; + const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (__N < 16) + result = vec_sld ((__v16qu) __A, zeros, __N); + else + result = zeros; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bsrli_si128 (__m128i __A, const int __N) +{ + 
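+  /* Shift the whole quadword right by __N bytes, shifting in zeros;
+     for __N >= 16 the result is all zeros.  On little endian a
+     compile-time-constant __N can use the immediate-form vec_sld,
+     otherwise the shift count __N * 8 is splatted into a vector and
+     applied with vec_sro (vec_slo on big endian). */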
__v16qu result; + const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (__N < 16) +#ifdef __LITTLE_ENDIAN__ + if (__builtin_constant_p(__N)) + /* Would like to use Vector Shift Left Double by Octet + Immediate here to use the immediate form and avoid + load of __N * 8 value into a separate VR. */ + result = vec_sld (zeros, (__v16qu) __A, (16 - __N)); + else +#endif + { + __v16qu shift = vec_splats((unsigned char)(__N*8)); +#ifdef __LITTLE_ENDIAN__ + result = vec_sro ((__v16qu)__A, shift); +#else + result = vec_slo ((__v16qu)__A, shift); +#endif + } + else + result = zeros; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_si128 (__m128i __A, const int __N) +{ + return _mm_bsrli_si128 (__A, __N); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_si128 (__m128i __A, const int _imm5) +{ + __v16qu result; + const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (_imm5 < 16) +#ifdef __LITTLE_ENDIAN__ + result = vec_sld ((__v16qu) __A, zeros, _imm5); +#else + result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5)); +#endif + else + result = zeros; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + +_mm_srli_epi16 (__m128i __A, int __B) +{ + __v8hu rshift; + __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (__B < 16) + { + if (__builtin_constant_p(__B)) + rshift = (__v8hu) vec_splat_s16(__B); + else + rshift = vec_splats ((unsigned short) __B); + + result = vec_sr ((__v8hi) __A, rshift); + } + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi32 (__m128i __A, int __B) +{ + __v4su rshift; + __v4si result = { 0, 0, 0, 0 }; + + if (__B < 32) + { + if (__builtin_constant_p(__B)) + { + if (__B < 16) + rshift = (__v4su) vec_splat_s32(__B); + else + rshift = (__v4su) vec_splats((unsigned int)__B); + } + else + rshift = vec_splats ((unsigned int) __B); + + result = vec_sr ((__v4si) __A, rshift); + } + + return (__m128i) result; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi64 (__m128i __A, int __B) +{ + __v2du rshift; + __v2di result = { 0, 0 }; + + if (__B < 64) + { + if (__builtin_constant_p(__B)) + { + if (__B < 16) + rshift = (__v2du) vec_splat_s32(__B); + else + rshift = (__v2du) vec_splats((unsigned long long)__B); + } + else + rshift = (__v2du) vec_splats ((unsigned int) __B); + + result = vec_sr ((__v2di) __A, rshift); + } + + return (__m128i) result; +} +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi16 (__m128i __A, __m128i __B) +{ + __v8hu lshift; + __vector __bool short shmask; + const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu result; + +#ifdef __LITTLE_ENDIAN__ + lshift = vec_splat ((__v8hu) __B, 0); +#else + lshift = vec_splat ((__v8hu) __B, 3); +#endif + shmask = vec_cmple (lshift, shmax); + result = vec_sl ((__v8hu) __A, lshift); + result = vec_sel ((__v8hu) shmask, result, shmask); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi32 (__m128i __A, __m128i __B) +{ + __v4su lshift; + __vector __bool int shmask; + const __v4su shmax = { 32, 32, 32, 32 }; + __v4su result; +#ifdef 
__LITTLE_ENDIAN__ + lshift = vec_splat ((__v4su) __B, 0); +#else + lshift = vec_splat ((__v4su) __B, 1); +#endif + shmask = vec_cmplt (lshift, shmax); + result = vec_sl ((__v4su) __A, lshift); + result = vec_sel ((__v4su) shmask, result, shmask); + + return (__m128i) result; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi64 (__m128i __A, __m128i __B) +{ + __v2du lshift; + __vector __bool long long shmask; + const __v2du shmax = { 64, 64 }; + __v2du result; + + lshift = vec_splat ((__v2du) __B, 0); + shmask = vec_cmplt (lshift, shmax); + result = vec_sl ((__v2du) __A, lshift); + result = vec_sel ((__v2du) shmask, result, shmask); + + return (__m128i) result; +} +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi16 (__m128i __A, __m128i __B) +{ + const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu rshift; + __v8hi result; + +#ifdef __LITTLE_ENDIAN__ + rshift = vec_splat ((__v8hu)__B, 0); +#else + rshift = vec_splat ((__v8hu)__B, 3); +#endif + rshift = vec_min (rshift, rshmax); + result = vec_sra ((__v8hi) __A, rshift); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi32 (__m128i __A, __m128i __B) +{ + const __v4su rshmax = { 31, 31, 31, 31 }; + __v4su rshift; + __v4si result; + +#ifdef __LITTLE_ENDIAN__ + rshift = vec_splat ((__v4su)__B, 0); +#else + rshift = vec_splat ((__v4su)__B, 1); +#endif + rshift = vec_min (rshift, rshmax); + result = vec_sra ((__v4si) __A, rshift); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi16 (__m128i __A, __m128i __B) +{ + __v8hu rshift; + __vector __bool short shmask; + const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu result; + +#ifdef __LITTLE_ENDIAN__ + rshift = vec_splat ((__v8hu) __B, 0); +#else + rshift = vec_splat ((__v8hu) __B, 3); +#endif + shmask = vec_cmple (rshift, shmax); + result = vec_sr ((__v8hu) __A, rshift); + result = vec_sel ((__v8hu) shmask, result, shmask); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi32 (__m128i __A, __m128i __B) +{ + __v4su rshift; + __vector __bool int shmask; + const __v4su shmax = { 32, 32, 32, 32 }; + __v4su result; + +#ifdef __LITTLE_ENDIAN__ + rshift = vec_splat ((__v4su) __B, 0); +#else + rshift = vec_splat ((__v4su) __B, 1); +#endif + shmask = vec_cmplt (rshift, shmax); + result = vec_sr ((__v4su) __A, rshift); + result = vec_sel ((__v4su) shmask, result, shmask); + + return (__m128i) result; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi64 (__m128i __A, __m128i __B) +{ + __v2du rshift; + __vector __bool long long shmask; + const __v2du shmax = { 64, 64 }; + __v2du result; + + rshift = vec_splat ((__v2du) __B, 0); + shmask = vec_cmplt (rshift, shmax); + result = vec_sr ((__v2du) __A, rshift); + result = vec_sel ((__v2du) shmask, result, shmask); + + return (__m128i) result; +} +#endif + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_pd (__m128d __A, __m128d __B) +{ + return (vec_and ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_pd (__m128d __A, 
__m128d __B) +{ + return (vec_andc ((__v2df) __B, (__v2df) __A)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_pd (__m128d __A, __m128d __B) +{ + return (vec_or ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_pd (__m128d __A, __m128d __B) +{ + return (vec_xor ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)vec_and ((__v2di) __A, (__v2di) __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)vec_or ((__v2di) __A, (__v2di) __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi16 (__m128i const __A, int const __N) +{ + return (unsigned short) ((__v8hi)__A)[__N & 7]; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi16 (__m128i const __A, int const __D, int const __N) +{ + __v8hi result = (__v8hi)__A; + + result [(__N & 7)] = __D; + + return (__m128i) result; +} + +extern __inline __m128i 
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
+}
+
+
+#ifdef _ARCH_PWR8
+/* Intrinsic functions that require PowerISA 2.07 minimum. */
+
+/* Creates a 16-bit mask from the most significant bits of the
+   16 8-bit integer values in A. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_epi8 (__m128i __A)
+{
+  __vector unsigned long long result;
+  static const __vector unsigned char perm_mask =
+    {
+	0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
+	0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
+    };
+
+  result = ((__vector unsigned long long)
+	    vec_vbpermq ((__vector unsigned char) __A,
+			 (__vector unsigned char) perm_mask));
+
+#ifdef __LITTLE_ENDIAN__
+  return result[1];
+#else
+  return result[0];
+#endif
+}
+#endif /* _ARCH_PWR8 */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_epu16 (__m128i __A, __m128i __B)
+{
+  __v4su w0, w1;
+  __v16qu xform1 = {
+#ifdef __LITTLE_ENDIAN__
+      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
+      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
+#else
+      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
+      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
+#endif
+    };
+
+  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
+  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
+  return (__m128i) vec_perm (w0, w1, xform1);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflehi_epi16 (__m128i __A, const int __mask)
+{
+  unsigned long element_selector_98 = __mask & 0x03;
+  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
+  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
+  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
+  static const unsigned short permute_selectors[4] =
+    {
+#ifdef __LITTLE_ENDIAN__
+	0x0908, 0x0B0A, 0x0D0C, 0x0F0E
+#else
+	0x0809, 0x0A0B, 0x0C0D, 0x0E0F
+#endif
+    };
+  __v2du pmask =
+#ifdef __LITTLE_ENDIAN__
+      { 0x1716151413121110UL,  0UL};
+#else
+      { 0x1011121314151617UL,  0UL};
+#endif
+  __m64_union t;
+  __v2du a, r;
+
+  t.as_short[0] = permute_selectors[element_selector_98];
+  t.as_short[1] = permute_selectors[element_selector_BA];
+  t.as_short[2] = permute_selectors[element_selector_DC];
+  t.as_short[3] = permute_selectors[element_selector_FE];
+  pmask[1] = t.as_m64;
+  a = (__v2du)__A;
+  r = vec_perm (a, a, (__vector unsigned char)pmask);
+  return (__m128i) r;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflelo_epi16 (__m128i __A, const int __mask)
+{
+  unsigned long element_selector_10 = __mask & 0x03;
+  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
+  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
+  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
+  static const unsigned short
permute_selectors[4] = + { +#ifdef __LITTLE_ENDIAN__ + 0x0100, 0x0302, 0x0504, 0x0706 +#else + 0x0001, 0x0203, 0x0405, 0x0607 +#endif + }; + __v2du pmask = +#ifdef __LITTLE_ENDIAN__ + { 0UL, 0x1f1e1d1c1b1a1918UL}; +#else + { 0UL, 0x18191a1b1c1d1e1fUL}; +#endif + __m64_union t; + __v2du a, r; + t.as_short[0] = permute_selectors[element_selector_10]; + t.as_short[1] = permute_selectors[element_selector_32]; + t.as_short[2] = permute_selectors[element_selector_54]; + t.as_short[3] = permute_selectors[element_selector_76]; + pmask[0] = t.as_m64; + a = (__v2du)__A; + r = vec_perm (a, a, (__vector unsigned char)pmask); + return (__m128i) r; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi32 (__m128i __A, const int __mask) +{ + unsigned long element_selector_10 = __mask & 0x03; + unsigned long element_selector_32 = (__mask >> 2) & 0x03; + unsigned long element_selector_54 = (__mask >> 4) & 0x03; + unsigned long element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned int permute_selectors[4] = + { +#ifdef __LITTLE_ENDIAN__ + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C +#else + 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F +#endif + }; + __v4su t; + + t[0] = permute_selectors[element_selector_10]; + t[1] = permute_selectors[element_selector_32]; + t[2] = permute_selectors[element_selector_54] + 0x10101010; + t[3] = permute_selectors[element_selector_76] + 0x10101010; + return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) +{ + __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL}; + __v16qu mask, tmp; + __m128i_u *p = (__m128i_u*)__C; + + tmp = (__v16qu)_mm_loadu_si128(p); + mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit); + tmp = vec_sel (tmp, (__v16qu)__A, mask); + _mm_storeu_si128 (p, (__m128i)tmp); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B); +} + + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sad_epu8 (__m128i __A, __m128i __B) +{ + __v16qu a, b; + __v16qu vmin, vmax, vabsdiff; + __v4si vsum; + const __v4su zero = { 0, 0, 0, 0 }; + __v4si result; + + a = (__v16qu) __A; + b = (__v16qu) __B; + vmin = vec_min (a, b); + vmax = vec_max (a, b); + vabsdiff = vec_sub (vmax, vmin); + /* Sum four groups of bytes into integers. */ + vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); + /* Sum across four integers with two integer results. */ + result = vec_sum2s (vsum, (__vector signed int) zero); + /* Rotate the sums into the correct position. */ +#ifdef __LITTLE_ENDIAN__ + result = vec_sld (result, result, 4); +#else + result = vec_sld (result, result, 6); +#endif + /* Rotate the sums into the correct position. */ + return (__m128i) result; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si32 (int *__A, int __B) +{ + /* Use the data cache block touch for store transient. 
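+     The dcbtstt hint marks the target cache block as transient, which
+     is the closest PowerPC analogue of the x86 non-temporal store
+     hint; the store itself is still the plain assignment below.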
*/ + __asm__ ( + "dcbtstt 0,%0" + : + : "b" (__A) + : "memory" + ); + *__A = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si64 (long long int *__A, long long int __B) +{ + /* Use the data cache block touch for store transient. */ + __asm__ ( + " dcbtstt 0,%0" + : + : "b" (__A) + : "memory" + ); + *__A = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si128 (__m128i *__A, __m128i __B) +{ + /* Use the data cache block touch for store transient. */ + __asm__ ( + "dcbtstt 0,%0" + : + : "b" (__A) + : "memory" + ); + *__A = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_pd (double *__A, __m128d __B) +{ + /* Use the data cache block touch for store transient. */ + __asm__ ( + "dcbtstt 0,%0" + : + : "b" (__A) + : "memory" + ); + *(__m128d*)__A = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clflush (void const *__A) +{ + /* Use the data cache block flush. */ + __asm__ ( + "dcbf 0,%0" + : + : "b" (__A) + : "memory" + ); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lfence (void) +{ + /* Use light weight sync for load to load ordering. */ + __atomic_thread_fence (__ATOMIC_RELEASE); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mfence (void) +{ + /* Use heavy weight sync for any to any ordering. */ + __atomic_thread_fence (__ATOMIC_SEQ_CST); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_si128 (int __A) +{ + return _mm_set_epi32 (0, 0, 0, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_si128 (long long __A) +{ + return __extension__ (__m128i)(__v2di){ __A, 0LL }; +} + +/* Microsoft intrinsic. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_si128 (long long __A) +{ + return __extension__ (__m128i)(__v2di){ __A, 0LL }; +} + +/* Casts between various SP, DP, INT vector types. Note that these do no + conversion of values, they just change the type. 
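+   For illustration: _mm_castpd_si128 (_mm_set1_pd (1.0)) returns an
+   __m128i whose two 64-bit lanes both hold 0x3ff0000000000000, the
+   IEEE 754 bit pattern of 1.0, because only the static type changes.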
*/
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_ps(__m128d __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_si128(__m128d __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_pd(__m128 __A)
+{
+  return (__m128d) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_si128(__m128 __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_ps(__m128i __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_pd(__m128i __A)
+{
+  return (__m128d) __A;
+}
+
+#endif /* EMMINTRIN_H_ */
diff --git a/lib/include/ppc_wrappers/mm_malloc.h b/lib/include/ppc_wrappers/mm_malloc.h
new file mode 100644
index 000000000..d91d7865c
--- /dev/null
+++ b/lib/include/ppc_wrappers/mm_malloc.h
@@ -0,0 +1,44 @@
+/*===---- mm_malloc.h - Implementation of _mm_malloc and _mm_free ----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM_MALLOC_H_INCLUDED
+#define _MM_MALLOC_H_INCLUDED
+
+#include <stdlib.h>
+
+/* We can't depend on <stdlib.h> since the prototype of posix_memalign
+   may not be visible. */
+#ifndef __cplusplus
+extern int posix_memalign (void **, size_t, size_t);
+#else
+extern "C" int posix_memalign (void **, size_t, size_t) throw ();
+#endif
+
+static __inline void *
+_mm_malloc (size_t size, size_t alignment)
+{
+  /* PowerPC64 ELF V2 ABI requires quadword alignment. */
+  size_t vec_align = sizeof (__vector float);
+  void *ptr;
+
+  if (alignment < vec_align)
+    alignment = vec_align;
+  if (posix_memalign (&ptr, alignment, size) == 0)
+    return ptr;
+  else
+    return NULL;
+}
+
+static __inline void
+_mm_free (void * ptr)
+{
+  free (ptr);
+}
+
+#endif /* _MM_MALLOC_H_INCLUDED */
diff --git a/lib/include/ppc_wrappers/mmintrin.h b/lib/include/ppc_wrappers/mmintrin.h
new file mode 100644
index 000000000..b949653ad
--- /dev/null
+++ b/lib/include/ppc_wrappers/mmintrin.h
@@ -0,0 +1,1443 @@
+/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0. */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header file is intended to help port code using Intel intrinsics
+   explicitly from x86_64 to powerpc64/powerpc64le.
+
+   Since the PowerPC target doesn't support a native 64-bit vector type,
+   we typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
+   works well for _si64 and some _pi32 operations.
+
+   For _pi16 and _pi8 operations, it's better to transfer __m64 into
+   a 128-bit PowerPC vector first.
Power8 introduced direct register
+   move instructions, which help with a more efficient implementation.
+
+   It's the user's responsibility to determine if the results of such a
+   port are acceptable or whether further changes are needed.  Please
+   note that much code using Intel intrinsics CAN BE REWRITTEN in more
+   portable and efficient standard C or GNU C extensions with 64-bit
+   scalar operations, or 128-bit SSE/Altivec operations, which are the
+   recommended approach. */
+#error \
+    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef _MMINTRIN_H_INCLUDED
+#define _MMINTRIN_H_INCLUDED
+
+#include <altivec.h>
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components. */
+typedef __attribute__((__aligned__(8))) unsigned long long __m64;
+
+typedef __attribute__((__aligned__(8))) union {
+  __m64 as_m64;
+  char as_char[8];
+  signed char as_signed_char[8];
+  short as_short[4];
+  int as_int[2];
+  long long as_long_long;
+  float as_float[2];
+  double as_double;
+} __m64_union;
+
+/* Empty the multimedia state. */
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_empty(void) {
+  /* nothing to do on PowerPC. */
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_empty(void) {
+  /* nothing to do on PowerPC. */
+}
+
+/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi32_si64(int __i) {
+  return (__m64)(unsigned int)__i;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_from_int(int __i) {
+  return _mm_cvtsi32_si64(__i);
+}
+
+/* Convert the lower 32 bits of the __m64 object into an integer. */
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi64_si32(__m64 __i) {
+  return ((int)__i);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_to_int(__m64 __i) {
+  return _mm_cvtsi64_si32(__i);
+}
+
+/* Convert I to a __m64 object. */
+
+/* Intel intrinsic. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_from_int64(long long __i) {
+  return (__m64)__i;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi64_m64(long long __i) {
+  return (__m64)__i;
+}
+
+/* Microsoft intrinsic. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi64x_si64(long long __i) {
+  return (__m64)__i;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set_pi64x(long long __i) {
+  return (__m64)__i;
+}
+
+/* Convert the __m64 object to a 64-bit integer. */
+
+/* Intel intrinsic. */
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_to_int64(__m64 __i) {
+  return (long long)__i;
+}
+
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtm64_si64(__m64 __i) {
+  return (long long)__i;
+}
+
+/* Microsoft intrinsic.
*/ +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_si64x(__m64 __i) { + return (long long)__i; +} + +#ifdef _ARCH_PWR8 +/* Pack the four 16-bit values from M1 into the lower four 8-bit values of + the result, and the four 16-bit values from M2 into the upper four 8-bit + values of the result, all with signed saturation. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_pi16(__m64 __m1, __m64 __m2) { + __vector signed short vm1; + __vector signed char vresult; + + vm1 = (__vector signed short)(__vector unsigned long long) +#ifdef __LITTLE_ENDIAN__ + {__m1, __m2}; +#else + {__m2, __m1}; +#endif + vresult = vec_packs(vm1, vm1); + return (__m64)((__vector long long)vresult)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_packsswb(__m64 __m1, __m64 __m2) { + return _mm_packs_pi16(__m1, __m2); +} + +/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of + the result, and the two 32-bit values from M2 into the upper two 16-bit + values of the result, all with signed saturation. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_pi32(__m64 __m1, __m64 __m2) { + __vector signed int vm1; + __vector signed short vresult; + + vm1 = (__vector signed int)(__vector unsigned long long) +#ifdef __LITTLE_ENDIAN__ + {__m1, __m2}; +#else + {__m2, __m1}; +#endif + vresult = vec_packs(vm1, vm1); + return (__m64)((__vector long long)vresult)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_packssdw(__m64 __m1, __m64 __m2) { + return _mm_packs_pi32(__m1, __m2); +} + +/* Pack the four 16-bit values from M1 into the lower four 8-bit values of + the result, and the four 16-bit values from M2 into the upper four 8-bit + values of the result, all with unsigned saturation. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_pu16(__m64 __m1, __m64 __m2) { + __vector unsigned char r; + __vector signed short vm1 = (__vector signed short)(__vector long long) +#ifdef __LITTLE_ENDIAN__ + {__m1, __m2}; +#else + {__m2, __m1}; +#endif + const __vector signed short __zero = {0}; + __vector __bool short __select = vec_cmplt(vm1, __zero); + r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1); + __vector __bool char packsel = vec_pack(__select, __select); + r = vec_sel(r, (const __vector unsigned char)__zero, packsel); + return (__m64)((__vector long long)r)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_packuswb(__m64 __m1, __m64 __m2) { + return _mm_packs_pu16(__m1, __m2); +} +#endif /* end ARCH_PWR8 */ + +/* Interleave the four 8-bit values from the high half of M1 with the four + 8-bit values from the high half of M2. 
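+   For illustration (hypothetical values): with bytes
+   m1 = { a0, a1, ..., a7 } and m2 = { b0, b1, ..., b7 }, the result
+   is { a4, b4, a5, b5, a6, b6, a7, b7 }.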
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector unsigned char a, b, c; + + a = (__vector unsigned char)vec_splats(__m1); + b = (__vector unsigned char)vec_splats(__m2); + c = vec_mergel(a, b); + return (__m64)((__vector long long)c)[1]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = m1.as_char[4]; + res.as_char[1] = m2.as_char[4]; + res.as_char[2] = m1.as_char[5]; + res.as_char[3] = m2.as_char[5]; + res.as_char[4] = m1.as_char[6]; + res.as_char[5] = m2.as_char[6]; + res.as_char[6] = m1.as_char[7]; + res.as_char[7] = m2.as_char[7]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckhbw(__m64 __m1, __m64 __m2) { + return _mm_unpackhi_pi8(__m1, __m2); +} + +/* Interleave the two 16-bit values from the high half of M1 with the two + 16-bit values from the high half of M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = m1.as_short[2]; + res.as_short[1] = m2.as_short[2]; + res.as_short[2] = m1.as_short[3]; + res.as_short[3] = m2.as_short[3]; + + return (__m64)res.as_m64; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckhwd(__m64 __m1, __m64 __m2) { + return _mm_unpackhi_pi16(__m1, __m2); +} +/* Interleave the 32-bit value from the high half of M1 with the 32-bit + value from the high half of M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_int[0] = m1.as_int[1]; + res.as_int[1] = m2.as_int[1]; + + return (__m64)res.as_m64; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckhdq(__m64 __m1, __m64 __m2) { + return _mm_unpackhi_pi32(__m1, __m2); +} +/* Interleave the four 8-bit values from the low half of M1 with the four + 8-bit values from the low half of M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector unsigned char a, b, c; + + a = (__vector unsigned char)vec_splats(__m1); + b = (__vector unsigned char)vec_splats(__m2); + c = vec_mergel(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = m1.as_char[0]; + res.as_char[1] = m2.as_char[0]; + res.as_char[2] = m1.as_char[1]; + res.as_char[3] = m2.as_char[1]; + res.as_char[4] = m1.as_char[2]; + res.as_char[5] = m2.as_char[2]; + res.as_char[6] = m1.as_char[3]; + res.as_char[7] = m2.as_char[3]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpcklbw(__m64 __m1, __m64 __m2) { + return _mm_unpacklo_pi8(__m1, __m2); +} +/* Interleave the two 16-bit values from the low half of M1 with the two + 16-bit values from the low half of M2. 
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = m1.as_short[0]; + res.as_short[1] = m2.as_short[0]; + res.as_short[2] = m1.as_short[1]; + res.as_short[3] = m2.as_short[1]; + + return (__m64)res.as_m64; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpcklwd(__m64 __m1, __m64 __m2) { + return _mm_unpacklo_pi16(__m1, __m2); +} + +/* Interleave the 32-bit value from the low half of M1 with the 32-bit + value from the low half of M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_int[0] = m1.as_int[0]; + res.as_int[1] = m2.as_int[0]; + + return (__m64)res.as_m64; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckldq(__m64 __m1, __m64 __m2) { + return _mm_unpacklo_pi32(__m1, __m2); +} + +/* Add the 8-bit values in M1 to the 8-bit values in M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_pi8(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed char a, b, c; + + a = (__vector signed char)vec_splats(__m1); + b = (__vector signed char)vec_splats(__m2); + c = vec_add(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = m1.as_char[0] + m2.as_char[0]; + res.as_char[1] = m1.as_char[1] + m2.as_char[1]; + res.as_char[2] = m1.as_char[2] + m2.as_char[2]; + res.as_char[3] = m1.as_char[3] + m2.as_char[3]; + res.as_char[4] = m1.as_char[4] + m2.as_char[4]; + res.as_char[5] = m1.as_char[5] + m2.as_char[5]; + res.as_char[6] = m1.as_char[6] + m2.as_char[6]; + res.as_char[7] = m1.as_char[7] + m2.as_char[7]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddb(__m64 __m1, __m64 __m2) { + return _mm_add_pi8(__m1, __m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_pi16(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = vec_add(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = m1.as_short[0] + m2.as_short[0]; + res.as_short[1] = m1.as_short[1] + m2.as_short[1]; + res.as_short[2] = m1.as_short[2] + m2.as_short[2]; + res.as_short[3] = m1.as_short[3] + m2.as_short[3]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddw(__m64 __m1, __m64 __m2) { + return _mm_add_pi16(__m1, __m2); +} + +/* Add the 32-bit values in M1 to the 32-bit values in M2. 
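+
+   As with the x86 paddd instruction, the addition wraps modulo 2^32
+   rather than saturating; a small sketch (hypothetical values):
+
+     __m64 r = _mm_add_pi32(_mm_set_pi32(1, 0x7fffffff),
+                            _mm_set_pi32(2, 1));
+     // r, lowest element first: 0x80000000 (wrapped), 3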
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_pi32(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR9 + __vector signed int a, b, c; + + a = (__vector signed int)vec_splats(__m1); + b = (__vector signed int)vec_splats(__m2); + c = vec_add(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_int[0] = m1.as_int[0] + m2.as_int[0]; + res.as_int[1] = m1.as_int[1] + m2.as_int[1]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddd(__m64 __m1, __m64 __m2) { + return _mm_add_pi32(__m1, __m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_pi8(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed char a, b, c; + + a = (__vector signed char)vec_splats(__m1); + b = (__vector signed char)vec_splats(__m2); + c = vec_sub(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = m1.as_char[0] - m2.as_char[0]; + res.as_char[1] = m1.as_char[1] - m2.as_char[1]; + res.as_char[2] = m1.as_char[2] - m2.as_char[2]; + res.as_char[3] = m1.as_char[3] - m2.as_char[3]; + res.as_char[4] = m1.as_char[4] - m2.as_char[4]; + res.as_char[5] = m1.as_char[5] - m2.as_char[5]; + res.as_char[6] = m1.as_char[6] - m2.as_char[6]; + res.as_char[7] = m1.as_char[7] - m2.as_char[7]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubb(__m64 __m1, __m64 __m2) { + return _mm_sub_pi8(__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_pi16(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = vec_sub(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = m1.as_short[0] - m2.as_short[0]; + res.as_short[1] = m1.as_short[1] - m2.as_short[1]; + res.as_short[2] = m1.as_short[2] - m2.as_short[2]; + res.as_short[3] = m1.as_short[3] - m2.as_short[3]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubw(__m64 __m1, __m64 __m2) { + return _mm_sub_pi16(__m1, __m2); +} + +/* Subtract the 32-bit values in M2 from the 32-bit values in M1. 
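+
+   Note the operand order: each element of M2 is subtracted from the
+   corresponding element of M1, e.g. (hypothetical values):
+
+     __m64 r = _mm_sub_pi32(_mm_set_pi32(10, 20), _mm_set_pi32(1, 2));
+     // r, lowest element first: 18, 9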
*/
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR9
+  __vector signed int a, b, c;
+
+  a = (__vector signed int)vec_splats(__m1);
+  b = (__vector signed int)vec_splats(__m2);
+  c = vec_sub(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
+  res.as_int[1] = m1.as_int[1] - m2.as_int[1];
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psubd(__m64 __m1, __m64 __m2) {
+  return _mm_sub_pi32(__m1, __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_add_si64(__m64 __m1, __m64 __m2) {
+  return (__m1 + __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sub_si64(__m64 __m1, __m64 __m2) {
+  return (__m1 - __m2);
+}
+
+/* Shift the 64-bit value in M left by COUNT. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sll_si64(__m64 __m, __m64 __count) {
+  return (__m << __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psllq(__m64 __m, __m64 __count) {
+  return _mm_sll_si64(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_slli_si64(__m64 __m, const int __count) {
+  return (__m << __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psllqi(__m64 __m, const int __count) {
+  return _mm_slli_si64(__m, __count);
+}
+
+/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srl_si64(__m64 __m, __m64 __count) {
+  return (__m >> __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrlq(__m64 __m, __m64 __count) {
+  return _mm_srl_si64(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srli_si64(__m64 __m, const int __count) {
+  return (__m >> __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrlqi(__m64 __m, const int __count) {
+  return _mm_srli_si64(__m, __count);
+}
+
+/* Bit-wise AND the 64-bit values in M1 and M2. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_and_si64(__m64 __m1, __m64 __m2) {
+  return (__m1 & __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pand(__m64 __m1, __m64 __m2) {
+  return _mm_and_si64(__m1, __m2);
+}
+
+/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
+   64-bit value in M2. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
+  return (~__m1 & __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pandn(__m64 __m1, __m64 __m2) {
+  return _mm_andnot_si64(__m1, __m2);
+}
+
+/* Bit-wise inclusive OR the 64-bit values in M1 and M2.
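+
+   Note that _mm_andnot_si64 above complements its first operand, not
+   its second; a quick sketch (hypothetical values):
+
+     __m64 m = 0x00ffULL;
+     __m64 v = 0x0f0fULL;
+     __m64 a = _mm_andnot_si64(m, v); // (~m & v) == 0x0f00
+     __m64 o = _mm_or_si64(m, v);     // 0x0fff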
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_or_si64(__m64 __m1, __m64 __m2) { + return (__m1 | __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_por(__m64 __m1, __m64 __m2) { + return _mm_or_si64(__m1, __m2); +} + +/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_xor_si64(__m64 __m1, __m64 __m2) { + return (__m1 ^ __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pxor(__m64 __m1, __m64 __m2) { + return _mm_xor_si64(__m1, __m2); +} + +/* Creates a 64-bit zero. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setzero_si64(void) { + return (__m64)0; +} + +/* Compare eight 8-bit values. The result of the comparison is 0xFF if the + test is true and zero if false. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { +#if defined(_ARCH_PWR6) && defined(__powerpc64__) + __m64 res; + __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :); + return (res); +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0; + res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0; + res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0; + res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0; + res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0; + res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0; + res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0; + res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpeqb(__m64 __m1, __m64 __m2) { + return _mm_cmpeq_pi8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed char a, b, c; + + a = (__vector signed char)vec_splats(__m1); + b = (__vector signed char)vec_splats(__m2); + c = (__vector signed char)vec_cmpgt(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0; + res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0; + res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0; + res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0; + res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0; + res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0; + res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0; + res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpgtb(__m64 __m1, __m64 __m2) { + return _mm_cmpgt_pi8(__m1, __m2); +} + +/* Compare four 16-bit values. The result of the comparison is 0xFFFF if + the test is true and zero if false. 
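+
+   The all-ones element masks compose with the bit-wise operations
+   above into branch-free selects; e.g., given __m64 values a and b, a
+   per-element signed maximum could be sketched (hypothetically) as:
+
+     __m64 gt = _mm_cmpgt_pi16(a, b);              // -1 where a > b
+     __m64 mx = _mm_or_si64(_mm_and_si64(gt, a),
+                            _mm_andnot_si64(gt, b));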
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = (__vector signed short)vec_cmpeq(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0; + res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0; + res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0; + res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpeqw(__m64 __m1, __m64 __m2) { + return _mm_cmpeq_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = (__vector signed short)vec_cmpgt(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0; + res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0; + res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0; + res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpgtw(__m64 __m1, __m64 __m2) { + return _mm_cmpgt_pi16(__m1, __m2); +} + +/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if + the test is true and zero if false. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR9 + __vector signed int a, b, c; + + a = (__vector signed int)vec_splats(__m1); + b = (__vector signed int)vec_splats(__m2); + c = (__vector signed int)vec_cmpeq(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0; + res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpeqd(__m64 __m1, __m64 __m2) { + return _mm_cmpeq_pi32(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR9 + __vector signed int a, b, c; + + a = (__vector signed int)vec_splats(__m1); + b = (__vector signed int)vec_splats(__m2); + c = (__vector signed int)vec_cmpgt(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0; + res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? 
-1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpgtd(__m64 __m1, __m64 __m2) { + return _mm_cmpgt_pi32(__m1, __m2); +} + +#if _ARCH_PWR8 +/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed + saturated arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pi8(__m64 __m1, __m64 __m2) { + __vector signed char a, b, c; + + a = (__vector signed char)vec_splats(__m1); + b = (__vector signed char)vec_splats(__m2); + c = vec_adds(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddsb(__m64 __m1, __m64 __m2) { + return _mm_adds_pi8(__m1, __m2); +} +/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed + saturated arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pi16(__m64 __m1, __m64 __m2) { + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = vec_adds(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddsw(__m64 __m1, __m64 __m2) { + return _mm_adds_pi16(__m1, __m2); +} +/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned + saturated arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pu8(__m64 __m1, __m64 __m2) { + __vector unsigned char a, b, c; + + a = (__vector unsigned char)vec_splats(__m1); + b = (__vector unsigned char)vec_splats(__m2); + c = vec_adds(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddusb(__m64 __m1, __m64 __m2) { + return _mm_adds_pu8(__m1, __m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned + saturated arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pu16(__m64 __m1, __m64 __m2) { + __vector unsigned short a, b, c; + + a = (__vector unsigned short)vec_splats(__m1); + b = (__vector unsigned short)vec_splats(__m2); + c = vec_adds(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddusw(__m64 __m1, __m64 __m2) { + return _mm_adds_pu16(__m1, __m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed + saturating arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pi8(__m64 __m1, __m64 __m2) { + __vector signed char a, b, c; + + a = (__vector signed char)vec_splats(__m1); + b = (__vector signed char)vec_splats(__m2); + c = vec_subs(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubsb(__m64 __m1, __m64 __m2) { + return _mm_subs_pi8(__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using + signed saturating arithmetic. 
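+
+   Saturation clamps to the representable range instead of wrapping;
+   for instance (hypothetical values):
+
+     __m64 r = _mm_subs_pi16(_mm_set1_pi16(-32768), _mm_set1_pi16(1));
+     // every element of r stays -32768 (clamped), not +32767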
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pi16(__m64 __m1, __m64 __m2) { + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = vec_subs(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubsw(__m64 __m1, __m64 __m2) { + return _mm_subs_pi16(__m1, __m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using + unsigned saturating arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pu8(__m64 __m1, __m64 __m2) { + __vector unsigned char a, b, c; + + a = (__vector unsigned char)vec_splats(__m1); + b = (__vector unsigned char)vec_splats(__m2); + c = vec_subs(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubusb(__m64 __m1, __m64 __m2) { + return _mm_subs_pu8(__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using + unsigned saturating arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pu16(__m64 __m1, __m64 __m2) { + __vector unsigned short a, b, c; + + a = (__vector unsigned short)vec_splats(__m1); + b = (__vector unsigned short)vec_splats(__m2); + c = vec_subs(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubusw(__m64 __m1, __m64 __m2) { + return _mm_subs_pu16(__m1, __m2); +} + +/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing + four 32-bit intermediate results, which are then summed by pairs to + produce two 32-bit results. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_madd_pi16(__m64 __m1, __m64 __m2) { + __vector signed short a, b; + __vector signed int c; + __vector signed int zero = {0, 0, 0, 0}; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = vec_vmsumshm(a, b, zero); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmaddwd(__m64 __m1, __m64 __m2) { + return _mm_madd_pi16(__m1, __m2); +} +/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in + M2 and produce the high 16 bits of the 32-bit results. 
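+
+   For example, 0x4000 * 0x4000 == 0x10000000, whose high 16 bits are
+   0x1000 (hypothetical values):
+
+     __m64 r = _mm_mulhi_pi16(_mm_set1_pi16(0x4000),
+                              _mm_set1_pi16(0x4000));
+     // every element of r is 0x1000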
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { + __vector signed short a, b; + __vector signed short c; + __vector signed int w0, w1; + __vector unsigned char xform1 = { +#ifdef __LITTLE_ENDIAN__ + 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, + 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F +#else + 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00, + 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15 +#endif + }; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + + w0 = vec_vmulesh(a, b); + w1 = vec_vmulosh(a, b); + c = (__vector signed short)vec_perm(w0, w1, xform1); + + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmulhw(__m64 __m1, __m64 __m2) { + return _mm_mulhi_pi16(__m1, __m2); +} + +/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce + the low 16 bits of the results. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mullo_pi16(__m64 __m1, __m64 __m2) { + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = a * b; + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmullw(__m64 __m1, __m64 __m2) { + return _mm_mullo_pi16(__m1, __m2); +} + +/* Shift four 16-bit values in M left by COUNT. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_pi16(__m64 __m, __m64 __count) { + __vector signed short m, r; + __vector unsigned short c; + + if (__count <= 15) { + m = (__vector signed short)vec_splats(__m); + c = (__vector unsigned short)vec_splats((unsigned short)__count); + r = vec_sl(m, (__vector unsigned short)c); + return (__m64)((__vector long long)r)[0]; + } else + return (0); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psllw(__m64 __m, __m64 __count) { + return _mm_sll_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_pi16(__m64 __m, int __count) { + /* Promote int to long then invoke mm_sll_pi16. */ + return _mm_sll_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psllwi(__m64 __m, int __count) { + return _mm_slli_pi16(__m, __count); +} + +/* Shift two 32-bit values in M left by COUNT. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_pi32(__m64 __m, __m64 __count) { + __m64_union m, res; + + m.as_m64 = __m; + + res.as_int[0] = m.as_int[0] << __count; + res.as_int[1] = m.as_int[1] << __count; + return (res.as_m64); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pslld(__m64 __m, __m64 __count) { + return _mm_sll_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_pi32(__m64 __m, int __count) { + /* Promote int to long then invoke mm_sll_pi32. 
*/ + return _mm_sll_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pslldi(__m64 __m, int __count) { + return _mm_slli_pi32(__m, __count); +} + +/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sra_pi16(__m64 __m, __m64 __count) { + __vector signed short m, r; + __vector unsigned short c; + + if (__count <= 15) { + m = (__vector signed short)vec_splats(__m); + c = (__vector unsigned short)vec_splats((unsigned short)__count); + r = vec_sra(m, (__vector unsigned short)c); + return (__m64)((__vector long long)r)[0]; + } else + return (0); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psraw(__m64 __m, __m64 __count) { + return _mm_sra_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srai_pi16(__m64 __m, int __count) { + /* Promote int to long then invoke mm_sra_pi32. */ + return _mm_sra_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrawi(__m64 __m, int __count) { + return _mm_srai_pi16(__m, __count); +} + +/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sra_pi32(__m64 __m, __m64 __count) { + __m64_union m, res; + + m.as_m64 = __m; + + res.as_int[0] = m.as_int[0] >> __count; + res.as_int[1] = m.as_int[1] >> __count; + return (res.as_m64); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrad(__m64 __m, __m64 __count) { + return _mm_sra_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srai_pi32(__m64 __m, int __count) { + /* Promote int to long then invoke mm_sra_pi32. */ + return _mm_sra_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psradi(__m64 __m, int __count) { + return _mm_srai_pi32(__m, __count); +} + +/* Shift four 16-bit values in M right by COUNT; shift in zeros. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_pi16(__m64 __m, __m64 __count) { + __vector unsigned short m, r; + __vector unsigned short c; + + if (__count <= 15) { + m = (__vector unsigned short)vec_splats(__m); + c = (__vector unsigned short)vec_splats((unsigned short)__count); + r = vec_sr(m, (__vector unsigned short)c); + return (__m64)((__vector long long)r)[0]; + } else + return (0); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrlw(__m64 __m, __m64 __count) { + return _mm_srl_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_pi16(__m64 __m, int __count) { + /* Promote int to long then invoke mm_sra_pi32. */ + return _mm_srl_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrlwi(__m64 __m, int __count) { + return _mm_srli_pi16(__m, __count); +} + +/* Shift two 32-bit values in M right by COUNT; shift in zeros. 
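+
+   Unlike _mm_sra_pi32 above, zeros rather than copies of the sign bit
+   enter from the left; e.g. (hypothetical value):
+
+     __m64 r = _mm_srli_pi32(_mm_set1_pi32(-4), 1);
+     // each element becomes 0x7ffffffe, not -2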
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_pi32(__m64 __m, __m64 __count) { + __m64_union m, res; + + m.as_m64 = __m; + + res.as_int[0] = (unsigned int)m.as_int[0] >> __count; + res.as_int[1] = (unsigned int)m.as_int[1] >> __count; + return (res.as_m64); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrld(__m64 __m, __m64 __count) { + return _mm_srl_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_pi32(__m64 __m, int __count) { + /* Promote int to long then invoke mm_srl_pi32. */ + return _mm_srl_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrldi(__m64 __m, int __count) { + return _mm_srli_pi32(__m, __count); +} +#endif /* _ARCH_PWR8 */ + +/* Creates a vector of two 32-bit values; I0 is least significant. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_pi32(int __i1, int __i0) { + __m64_union res; + + res.as_int[0] = __i0; + res.as_int[1] = __i1; + return (res.as_m64); +} + +/* Creates a vector of four 16-bit values; W0 is least significant. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) { + __m64_union res; + + res.as_short[0] = __w0; + res.as_short[1] = __w1; + res.as_short[2] = __w2; + res.as_short[3] = __w3; + return (res.as_m64); +} + +/* Creates a vector of eight 8-bit values; B0 is least significant. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, + char __b2, char __b1, char __b0) { + __m64_union res; + + res.as_char[0] = __b0; + res.as_char[1] = __b1; + res.as_char[2] = __b2; + res.as_char[3] = __b3; + res.as_char[4] = __b4; + res.as_char[5] = __b5; + res.as_char[6] = __b6; + res.as_char[7] = __b7; + return (res.as_m64); +} + +/* Similar, but with the arguments in reverse order. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_pi32(int __i0, int __i1) { + __m64_union res; + + res.as_int[0] = __i0; + res.as_int[1] = __i1; + return (res.as_m64); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { + return _mm_set_pi16(__w3, __w2, __w1, __w0); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, + char __b5, char __b6, char __b7) { + return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); +} + +/* Creates a vector of two 32-bit values, both elements containing I. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_pi32(int __i) { + __m64_union res; + + res.as_int[0] = __i; + res.as_int[1] = __i; + return (res.as_m64); +} + +/* Creates a vector of four 16-bit values, all elements containing W. 
*/
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set1_pi16(short __w) {
+#if _ARCH_PWR9
+  __vector signed short w;
+
+  w = (__vector signed short)vec_splats(__w);
+  return (__m64)((__vector long long)w)[0];
+#else
+  __m64_union res;
+
+  res.as_short[0] = __w;
+  res.as_short[1] = __w;
+  res.as_short[2] = __w;
+  res.as_short[3] = __w;
+  return (res.as_m64);
+#endif
+}
+
+/* Creates a vector of eight 8-bit values, all elements containing B. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set1_pi8(signed char __b) {
+#if _ARCH_PWR8
+  __vector signed char b;
+
+  b = (__vector signed char)vec_splats(__b);
+  return (__m64)((__vector long long)b)[0];
+#else
+  __m64_union res;
+
+  res.as_char[0] = __b;
+  res.as_char[1] = __b;
+  res.as_char[2] = __b;
+  res.as_char[3] = __b;
+  res.as_char[4] = __b;
+  res.as_char[5] = __b;
+  res.as_char[6] = __b;
+  res.as_char[7] = __b;
+  return (res.as_m64);
+#endif
+}
+#endif /* _MMINTRIN_H_INCLUDED */
diff --git a/lib/include/ppc_wrappers/xmmintrin.h b/lib/include/ppc_wrappers/xmmintrin.h
new file mode 100644
index 000000000..1b322b665
--- /dev/null
+++ b/lib/include/ppc_wrappers/xmmintrin.h
@@ -0,0 +1,1838 @@
+/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0. */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header file is to help porting code using Intel intrinsics
+   explicitly from x86_64 to powerpc64/powerpc64le.
+
+   Since X86 SSE intrinsics mainly handles __m128 type, PowerPC
+   VMX/VSX ISA is a good match for vector float SIMD operations.
+   However scalar float operations in vector (XMM) registers require
+   the POWER8 VSX ISA (2.07) level. There are differences for data
+   format and placement of float scalars in the vector register, which
+   require extra steps to match SSE scalar float semantics on POWER.
+
+   It should be noted that there's much difference between X86_64's
+   MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to use
+   portable <fenv.h> instead of accessing MXCSR directly.
+
+   Most SSE scalar float intrinsic operations can be performed more
+   efficiently as C language float scalar operations or optimized to
+   use vector SIMD operations. We recommend this for new applications. */
+#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef _XMMINTRIN_H_INCLUDED
+#define _XMMINTRIN_H_INCLUDED
+
+/* Define four value permute mask */
+#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
+
+#include <altivec.h>
+
+/* Avoid collisions between altivec.h and strict adherence to C++ and
+   C11 standards. This should eventually be done inside altivec.h itself,
+   but only after testing a full distro build. */
+#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
+                                 (defined(__STDC_VERSION__) && \
+                                   __STDC_VERSION__ >= 201112L))
+#undef vector
+#undef pixel
+#undef bool
+#endif
+
+/* We need type definitions from the MMX header file. */
+#include <mmintrin.h>
+
+/* Get _mm_malloc () and _mm_free ().
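+
+   A short usage sketch (hosted environments only; hypothetical sizes,
+   error checking omitted):
+
+     float *buf = _mm_malloc(64 * sizeof(float), 16); // 16-byte aligned
+     ... use buf ...
+     _mm_free(buf);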
*/
+#if __STDC_HOSTED__
+#include <mm_malloc.h>
+#endif
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components. */
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* Unaligned version of the same type. */
+typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
+                                       __aligned__ (1)));
+
+/* Internal data types for implementing the intrinsics. */
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+/* Create an undefined vector. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_ps (void)
+{
+  __m128 __Y = __Y;
+  return __Y;
+}
+
+/* Create a vector of zeros. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_ps (void)
+{
+  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
+}
+
+/* Load four SPFP values from P. The address must be 16-byte aligned. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps (float const *__P)
+{
+  return ((__m128)vec_ld(0, (__v4sf*)__P));
+}
+
+/* Load four SPFP values from P. The address need not be 16-byte aligned. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_ps (float const *__P)
+{
+  return (vec_vsx_ld(0, __P));
+}
+
+/* Load four SPFP values in reverse order. The address must be aligned. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadr_ps (float const *__P)
+{
+  __v4sf __tmp;
+  __m128 result;
+  static const __vector unsigned char permute_vector =
+    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
+      0x17, 0x10, 0x11, 0x12, 0x13 };
+
+  __tmp = vec_ld (0, (__v4sf *) __P);
+  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
+  return result;
+}
+
+/* Create a vector with all four elements equal to F. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_ps (float __F)
+{
+  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps1 (float __F)
+{
+  return _mm_set1_ps (__F);
+}
+
+/* Create the vector [Z Y X W]. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
+}
+
+/* Create the vector [W X Y Z]. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_ps (float __Z, float __Y, float __X, float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
+}
+
+/* Store four SPFP values. The address must be 16-byte aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps (float *__P, __m128 __A)
+{
+  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
+}
+
+/* Store four SPFP values. The address need not be 16-byte aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_ps (float *__P, __m128 __A)
+{
+  *(__m128_u *)__P = __A;
+}
+
+/* Store four SPFP values in reverse order. The address must be aligned.
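+
+   E.g. storing {0,1,2,3} writes 3,2,1,0 to memory (hypothetical
+   values; a 16-byte aligned destination is assumed):
+
+     float out[4] __attribute__((aligned(16)));
+     _mm_storer_ps(out, _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f));
+     // out[] == {3.0f, 2.0f, 1.0f, 0.0f}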
*/ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storer_ps (float *__P, __m128 __A) +{ + __v4sf __tmp; + static const __vector unsigned char permute_vector = + { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, + 0x17, 0x10, 0x11, 0x12, 0x13 }; + + __tmp = (__m128) vec_perm (__A, __A, permute_vector); + + _mm_store_ps (__P, __tmp); +} + +/* Store the lower SPFP value across four words. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store1_ps (float *__P, __m128 __A) +{ + __v4sf __va = vec_splat((__v4sf)__A, 0); + _mm_store_ps (__P, __va); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ps1 (float *__P, __m128 __A) +{ + _mm_store1_ps (__P, __A); +} + +/* Create a vector with element 0 as F and the rest zero. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_ss (float __F) +{ + return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f }; +} + +/* Sets the low SPFP value of A from the low value of B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + + return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask)); +} + +/* Create a vector with element 0 as *P and the rest zero. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ss (float const *__P) +{ + return _mm_set_ss (*__P); +} + +/* Stores the lower SPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ss (float *__P, __m128 __A) +{ + *__P = ((__v4sf)__A)[0]; +} + +/* Perform the respective operation on the lower SPFP (single-precision + floating-point) values of A and B; the upper three SPFP values are + passed through from A. */ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_ss (__m128 __A, __m128 __B) +{ +#ifdef _ARCH_PWR7 + __m128 a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + results. So to insure we don't generate spurious exceptions + (from the upper double values) we splat the lower double + before we to the operation. */ + a = vec_splat (__A, 0); + b = vec_splat (__B, 0); + c = a + b; + /* Then we merge the lower float result with the original upper + float elements from __A. */ + return (vec_sel (__A, c, mask)); +#else + __A[0] = __A[0] + __B[0]; + return (__A); +#endif +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_ss (__m128 __A, __m128 __B) +{ +#ifdef _ARCH_PWR7 + __m128 a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + results. So to insure we don't generate spurious exceptions + (from the upper double values) we splat the lower double + before we to the operation. */ + a = vec_splat (__A, 0); + b = vec_splat (__B, 0); + c = a - b; + /* Then we merge the lower float result with the original upper + float elements from __A. 
*/ + return (vec_sel (__A, c, mask)); +#else + __A[0] = __A[0] - __B[0]; + return (__A); +#endif +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_ss (__m128 __A, __m128 __B) +{ +#ifdef _ARCH_PWR7 + __m128 a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + results. So to insure we don't generate spurious exceptions + (from the upper double values) we splat the lower double + before we to the operation. */ + a = vec_splat (__A, 0); + b = vec_splat (__B, 0); + c = a * b; + /* Then we merge the lower float result with the original upper + float elements from __A. */ + return (vec_sel (__A, c, mask)); +#else + __A[0] = __A[0] * __B[0]; + return (__A); +#endif +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_ss (__m128 __A, __m128 __B) +{ +#ifdef _ARCH_PWR7 + __m128 a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + results. So to insure we don't generate spurious exceptions + (from the upper double values) we splat the lower double + before we to the operation. */ + a = vec_splat (__A, 0); + b = vec_splat (__B, 0); + c = a / b; + /* Then we merge the lower float result with the original upper + float elements from __A. */ + return (vec_sel (__A, c, mask)); +#else + __A[0] = __A[0] / __B[0]; + return (__A); +#endif +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_ss (__m128 __A) +{ + __m128 a, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + * results. So to insure we don't generate spurious exceptions + * (from the upper double values) we splat the lower double + * before we to the operation. */ + a = vec_splat (__A, 0); + c = vec_sqrt (a); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return (vec_sel (__A, c, mask)); +} + +/* Perform the respective operation on the four SPFP values in A and B. 
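+
+   In contrast to the _ss forms above, all four lanes participate; a
+   small sketch (hypothetical values):
+
+     __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // {1,2,3,4}
+     __m128 s = _mm_add_ps(a, a);                   // {2,4,6,8}
+     __m128 d = _mm_div_ps(s, _mm_set1_ps(2.0f));   // {1,2,3,4} again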
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A + (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A - (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A * (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A / (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_ps (__m128 __A) +{ + return (vec_sqrt ((__v4sf)__A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_ps (__m128 __A) +{ + return (vec_re ((__v4sf)__A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_ps (__m128 __A) +{ + return (vec_rsqrte (__A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_ss (__m128 __A) +{ + __m128 a, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + * results. So to insure we don't generate spurious exceptions + * (from the upper double values) we splat the lower double + * before we to the operation. */ + a = vec_splat (__A, 0); + c = _mm_rcp_ps (a); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return (vec_sel (__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_ss (__m128 __A) +{ + __m128 a, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + * results. So to insure we don't generate spurious exceptions + * (from the upper double values) we splat the lower double + * before we to the operation. */ + a = vec_splat (__A, 0); + c = vec_rsqrte (a); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return (vec_sel (__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_ss (__m128 __A, __m128 __B) +{ + __v4sf a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower float) + * results. So to insure we don't generate spurious exceptions + * (from the upper float values) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf)__A, 0); + b = vec_splat ((__v4sf)__B, 0); + c = vec_min (a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return (vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_ss (__m128 __A, __m128 __B) +{ + __v4sf a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower float) + * results. So to insure we don't generate spurious exceptions + * (from the upper float values) we splat the lower float + * before we to the operation. 
*/ + a = vec_splat (__A, 0); + b = vec_splat (__B, 0); + c = vec_max (a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return (vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_ps (__m128 __A, __m128 __B) +{ + __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A); + return vec_sel (__B, __A, m); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_ps (__m128 __A, __m128 __B) +{ + __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B); + return vec_sel (__B, __A, m); +} + +/* Perform logical bit-wise operations on 128-bit values. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B)); +// return __builtin_ia32_andps (__A, __B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B)); +} + +/* Perform a comparison on the four SPFP values of A and B. For each + element, if the comparison is true, place a mask of all ones in the + result, otherwise a mask of zeros. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_ps (__m128 __A, __m128 __B) +{ + __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B); + return ((__m128)vec_nor (temp, temp)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); +} + 
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_ps (__m128 __A, __m128 __B) +{ + __vector unsigned int a, b; + __vector unsigned int c, d; + static const __vector unsigned int float_exp_mask = + { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; + + a = (__vector unsigned int) vec_abs ((__v4sf)__A); + b = (__vector unsigned int) vec_abs ((__v4sf)__B); + c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); + d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); + return ((__m128 ) vec_and (c, d)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_ps (__m128 __A, __m128 __B) +{ + __vector unsigned int a, b; + __vector unsigned int c, d; + static const __vector unsigned int float_exp_mask = + { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; + + a = (__vector unsigned int) vec_abs ((__v4sf)__A); + b = (__vector unsigned int) vec_abs ((__v4sf)__B); + c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); + d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); + return ((__m128 ) vec_or (c, d)); +} + +/* Perform a comparison on the lower SPFP values of A and B. If the + comparison is true, place a mask of all ones in the result, otherwise a + mask of zeros. The upper three SPFP values are passed through from A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmpeq(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmplt(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. 
*/ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmple(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmpgt(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmpge(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmpeq(a, b); + c = vec_nor (c, c); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmpge(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. 
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpgt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A. */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to ensure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we do the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmple(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A. */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to ensure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we do the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmplt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A. */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ss (__m128 __A, __m128 __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
+  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
+  c = vec_and (c, d);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A. */
+  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ss (__m128 __A, __m128 __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
+  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
+  c = vec_or (c, d);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A. */
+  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
+}
+
+/* Compare the lower SPFP values of A and B and return 1 if true
+   and 0 if false.
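+   As an illustrative example (not upstream text): with __A = {3,9,9,9}
+   and __B = {2,0,0,0}, _mm_comigt_ss below returns 1; only element 0 of
+   each operand ever participates in these scalar compares.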
+   */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] == __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] < __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] <= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] > __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] >= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] != __B[0]);
+}
+
+/* FIXME
+ * The _mm_ucomi??_ss implementations below are exactly the same as
+ * _mm_comi??_ss because GCC for PowerPC only generates unordered
+ * compares (scalar and vector).
+ * Technically _mm_comieq_ss et al. should be using the ordered
+ * compare and signal for QNaNs.
+ * _mm_ucomieq_sd et al. should be OK as is.
+ */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] == __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] < __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] <= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] > __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] >= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] != __B[0]);
+}
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_f32 (__m128 __A)
+{
+  return ((__v4sf)__A)[0];
+}
+
+/* Convert the lower SPFP value to a 32-bit integer according to the current
+   rounding mode. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si32 (__m128 __A)
+{
+  __m64 res = 0;
+#ifdef _ARCH_PWR8
+  double dtmp;
+  __asm__(
+#ifdef __LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
+      "fctiw %2,%2;\n"
+      "mfvsrd %1,%x2;\n"
+      : "+wa" (__A),
+        "=r" (res),
+        "=f" (dtmp)
+      : );
+#else
+  res = __builtin_rint(__A[0]);
+#endif
+  return (res);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ss2si (__m128 __A)
+{
+  return _mm_cvtss_si32 (__A);
+}
+
+/* Convert the lower SPFP value to a 64-bit integer according to the
+   current rounding mode. */
+
+/* Intel intrinsic.
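+   An explanatory note on the _ARCH_PWR8 asm below (a reading of the
+   code, not upstream text): on little-endian, xxsldwi rotates element 0
+   into the doubleword position the scalar converts read, xscvspdp widens
+   it to double precision, fctid rounds to a 64-bit integer under the
+   current rounding mode, and mfvsrd moves the result into a GPR.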
+   */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64 (__m128 __A)
+{
+  __m64 res = 0;
+#ifdef _ARCH_PWR8
+  double dtmp;
+  __asm__(
+#ifdef __LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
+      "fctid %2,%2;\n"
+      "mfvsrd %1,%x2;\n"
+      : "+wa" (__A),
+        "=r" (res),
+        "=f" (dtmp)
+      : );
+#else
+  res = __builtin_llrint(__A[0]);
+#endif
+  return (res);
+}
+
+/* Microsoft intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64x (__m128 __A)
+{
+  return _mm_cvtss_si64 ((__v4sf) __A);
+}
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint
+{
+  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
+  _MM_HINT_ET0 = 7,
+  _MM_HINT_ET1 = 6,
+  _MM_HINT_T0 = 3,
+  _MM_HINT_T1 = 2,
+  _MM_HINT_T2 = 1,
+  _MM_HINT_NTA = 0
+};
+
+/* Loads one cache line from address P to a location "closer" to the
+   processor. The selector I specifies the type of prefetch operation. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_prefetch (const void *__P, enum _mm_hint __I)
+{
+  /* Current PowerPC hardware ignores the hint parameter. */
+  __builtin_prefetch (__P);
+}
+
+/* Convert the two lower SPFP values to 32-bit integers according to the
+   current rounding mode. Return the integers in packed form. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi32 (__m128 __A)
+{
+  __v4sf temp, rounded;
+  __vector unsigned long long result;
+
+  /* Splat two lower SPFP values to both halves. */
+  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
+  rounded = vec_rint(temp);
+  result = (__vector unsigned long long) vec_cts (rounded, 0);
+
+  return (__m64) ((__vector long long) result)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ps2pi (__m128 __A)
+{
+  return _mm_cvtps_pi32 (__A);
+}
+
+/* Truncate the lower SPFP value to a 32-bit integer. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si32 (__m128 __A)
+{
+  /* Extract the lower float element. */
+  float temp = __A[0];
+  /* truncate to 32-bit integer and return. */
+  return temp;
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ss2si (__m128 __A)
+{
+  return _mm_cvttss_si32 (__A);
+}
+
+/* Intel intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64 (__m128 __A)
+{
+  /* Extract the lower float element. */
+  float temp = __A[0];
+  /* truncate to 64-bit integer and return. */
+  return temp;
+}
+
+/* Microsoft intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64x (__m128 __A)
+{
+  /* Extract the lower float element. */
+  float temp = __A[0];
+  /* truncate to 64-bit integer and return. */
+  return temp;
+}
+
+/* Truncate the two lower SPFP values to 32-bit integers. Return the
+   integers in packed form. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_pi32 (__m128 __A)
+{
+  __v4sf temp;
+  __vector unsigned long long result;
+
+  /* Splat two lower SPFP values to both halves.
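+     (Explanatory note, not upstream text: unlike _mm_cvtps_pi32 above
+     there is no vec_rint here; vec_cts rounds toward zero, which is
+     exactly the truncating behavior this intrinsic requires.)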
*/ + temp = (__v4sf) vec_splat ((__vector long long)__A, 0); + result = (__vector unsigned long long) vec_cts (temp, 0); + + return (__m64) ((__vector long long) result)[0]; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_ps2pi (__m128 __A) +{ + return _mm_cvttps_pi32 (__A); +} + +/* Convert B to a SPFP value and insert it as element zero in A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_ss (__m128 __A, int __B) +{ + float temp = __B; + __A[0] = temp; + + return __A; +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_si2ss (__m128 __A, int __B) +{ + return _mm_cvtsi32_ss (__A, __B); +} + +/* Convert B to a SPFP value and insert it as element zero in A. */ +/* Intel intrinsic. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_ss (__m128 __A, long long __B) +{ + float temp = __B; + __A[0] = temp; + + return __A; +} + +/* Microsoft intrinsic. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_ss (__m128 __A, long long __B) +{ + return _mm_cvtsi64_ss (__A, __B); +} + +/* Convert the two 32-bit values in B to SPFP form and insert them + as the two lower elements in A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi32_ps (__m128 __A, __m64 __B) +{ + __vector signed int vm1; + __vector float vf1; + + vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B}; + vf1 = (__vector float) vec_ctf (vm1, 0); + + return ((__m128) (__vector unsigned long long) + { ((__vector unsigned long long)vf1) [0], + ((__vector unsigned long long)__A) [1]}); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_pi2ps (__m128 __A, __m64 __B) +{ + return _mm_cvtpi32_ps (__A, __B); +} + +/* Convert the four signed 16-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi16_ps (__m64 __A) +{ + __vector signed short vs8; + __vector signed int vi4; + __vector float vf1; + + vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A }; + vi4 = vec_vupklsh (vs8); + vf1 = (__vector float) vec_ctf (vi4, 0); + + return (__m128) vf1; +} + +/* Convert the four unsigned 16-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpu16_ps (__m64 __A) +{ + const __vector unsigned short zero = + { 0, 0, 0, 0, 0, 0, 0, 0 }; + __vector unsigned short vs8; + __vector unsigned int vi4; + __vector float vf1; + + vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A }; + vi4 = (__vector unsigned int) vec_mergel +#ifdef __LITTLE_ENDIAN__ + (vs8, zero); +#else + (zero, vs8); +#endif + vf1 = (__vector float) vec_ctf (vi4, 0); + + return (__m128) vf1; +} + +/* Convert the low four signed 8-bit values in A to SPFP form. 
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi8_ps (__m64 __A) +{ + __vector signed char vc16; + __vector signed short vs8; + __vector signed int vi4; + __vector float vf1; + + vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A }; + vs8 = vec_vupkhsb (vc16); + vi4 = vec_vupkhsh (vs8); + vf1 = (__vector float) vec_ctf (vi4, 0); + + return (__m128) vf1; +} + +/* Convert the low four unsigned 8-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + +_mm_cvtpu8_ps (__m64 __A) +{ + const __vector unsigned char zero = + { 0, 0, 0, 0, 0, 0, 0, 0 }; + __vector unsigned char vc16; + __vector unsigned short vs8; + __vector unsigned int vi4; + __vector float vf1; + + vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A }; +#ifdef __LITTLE_ENDIAN__ + vs8 = (__vector unsigned short) vec_mergel (vc16, zero); + vi4 = (__vector unsigned int) vec_mergeh (vs8, + (__vector unsigned short) zero); +#else + vs8 = (__vector unsigned short) vec_mergel (zero, vc16); + vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero, + vs8); +#endif + vf1 = (__vector float) vec_ctf (vi4, 0); + + return (__m128) vf1; +} + +/* Convert the four signed 32-bit values in A and B to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi32x2_ps (__m64 __A, __m64 __B) +{ + __vector signed int vi4; + __vector float vf4; + + vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B }; + vf4 = (__vector float) vec_ctf (vi4, 0); + return (__m128) vf4; +} + +/* Convert the four SPFP values in A to four signed 16-bit integers. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pi16 (__m128 __A) +{ + __v4sf rounded; + __vector signed int temp; + __vector unsigned long long result; + + rounded = vec_rint(__A); + temp = vec_cts (rounded, 0); + result = (__vector unsigned long long) vec_pack (temp, temp); + + return (__m64) ((__vector long long) result)[0]; +} + +/* Convert the four SPFP values in A to four signed 8-bit integers. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pi8 (__m128 __A) +{ + __v4sf rounded; + __vector signed int tmp_i; + static const __vector signed int zero = {0, 0, 0, 0}; + __vector signed short tmp_s; + __vector signed char res_v; + + rounded = vec_rint(__A); + tmp_i = vec_cts (rounded, 0); + tmp_s = vec_pack (tmp_i, zero); + res_v = vec_pack (tmp_s, tmp_s); + return (__m64) ((__vector long long) res_v)[0]; +} + +/* Selects four specific SPFP values from A and B based on MASK. 
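+   For example (illustrative values, not upstream text): __mask = 0x1B
+   yields {__A[3], __A[2], __B[1], __B[0]}; each two-bit field of the
+   mask selects one element, the low two fields picking from A and the
+   high two from B.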
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + +_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) +{ + unsigned long element_selector_10 = __mask & 0x03; + unsigned long element_selector_32 = (__mask >> 2) & 0x03; + unsigned long element_selector_54 = (__mask >> 4) & 0x03; + unsigned long element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned int permute_selectors[4] = + { +#ifdef __LITTLE_ENDIAN__ + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C +#else + 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F +#endif + }; + __vector unsigned int t; + + t[0] = permute_selectors[element_selector_10]; + t[1] = permute_selectors[element_selector_32]; + t[2] = permute_selectors[element_selector_54] + 0x10101010; + t[3] = permute_selectors[element_selector_76] + 0x10101010; + return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t); +} + +/* Selects and interleaves the upper two SPFP values from A and B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_ps (__m128 __A, __m128 __B) +{ + return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B); +} + +/* Selects and interleaves the lower two SPFP values from A and B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_ps (__m128 __A, __m128 __B) +{ + return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B); +} + +/* Sets the upper two SPFP values with 64-bits of data loaded from P; + the lower two values are passed through from A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadh_pi (__m128 __A, __m64 const *__P) +{ + __vector unsigned long long __a = (__vector unsigned long long)__A; + __vector unsigned long long __p = vec_splats(*__P); + __a [1] = __p [1]; + + return (__m128)__a; +} + +/* Stores the upper two SPFP values of A into P. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeh_pi (__m64 *__P, __m128 __A) +{ + __vector unsigned long long __a = (__vector unsigned long long) __A; + + *__P = __a[1]; +} + +/* Moves the upper two values of B into the lower two values of A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movehl_ps (__m128 __A, __m128 __B) +{ + return (__m128) vec_mergel ((__vector unsigned long long)__B, + (__vector unsigned long long)__A); +} + +/* Moves the lower two values of B into the upper two values of A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movelh_ps (__m128 __A, __m128 __B) +{ + return (__m128) vec_mergeh ((__vector unsigned long long)__A, + (__vector unsigned long long)__B); +} + +/* Sets the lower two SPFP values with 64-bits of data loaded from P; + the upper two values are passed through from A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_pi (__m128 __A, __m64 const *__P) +{ + __vector unsigned long long __a = (__vector unsigned long long)__A; + __vector unsigned long long __p = vec_splats(*__P); + __a [0] = __p [0]; + + return (__m128)__a; +} + +/* Stores the lower two SPFP values of A into P. 
*/ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_pi (__m64 *__P, __m128 __A) +{ + __vector unsigned long long __a = (__vector unsigned long long) __A; + + *__P = __a[0]; +} + +#ifdef _ARCH_PWR8 +/* Intrinsic functions that require PowerISA 2.07 minimum. */ + +/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_ps (__m128 __A) +{ + __vector unsigned long long result; + static const __vector unsigned int perm_mask = + { +#ifdef __LITTLE_ENDIAN__ + 0x00204060, 0x80808080, 0x80808080, 0x80808080 +#else + 0x80808080, 0x80808080, 0x80808080, 0x00204060 +#endif + }; + + result = ((__vector unsigned long long) + vec_vbpermq ((__vector unsigned char) __A, + (__vector unsigned char) perm_mask)); + +#ifdef __LITTLE_ENDIAN__ + return result[1]; +#else + return result[0]; +#endif +} +#endif /* _ARCH_PWR8 */ + +/* Create a vector with all four elements equal to *P. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load1_ps (float const *__P) +{ + return _mm_set1_ps (*__P); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ps1 (float const *__P) +{ + return _mm_load1_ps (__P); +} + +/* Extracts one of the four words of A. The selector N must be immediate. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_pi16 (__m64 const __A, int const __N) +{ + unsigned int shiftr = __N & 3; +#ifdef __BIG_ENDIAN__ + shiftr = 3 - shiftr; +#endif + + return ((__A >> (shiftr * 16)) & 0xffff); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pextrw (__m64 const __A, int const __N) +{ + return _mm_extract_pi16 (__A, __N); +} + +/* Inserts word D into one of four words of A. The selector N must be + immediate. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_pi16 (__m64 const __A, int const __D, int const __N) +{ + const int shiftl = (__N & 3) * 16; + const __m64 shiftD = (const __m64) __D << shiftl; + const __m64 mask = 0xffffUL << shiftl; + __m64 result = (__A & (~mask)) | (shiftD & mask); + + return (result); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pinsrw (__m64 const __A, int const __D, int const __N) +{ + return _mm_insert_pi16 (__A, __D, __N); +} + +/* Compute the element-wise maximum of signed 16-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + +_mm_max_pi16 (__m64 __A, __m64 __B) +{ +#if _ARCH_PWR8 + __vector signed short a, b, r; + __vector __bool short c; + + a = (__vector signed short)vec_splats (__A); + b = (__vector signed short)vec_splats (__B); + c = (__vector __bool short)vec_cmpgt (a, b); + r = vec_sel (b, a, c); + return (__m64) ((__vector long long) r)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __A; + m2.as_m64 = __B; + + res.as_short[0] = + (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; + res.as_short[1] = + (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; + res.as_short[2] = + (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; + res.as_short[3] = + (m1.as_short[3] > m2.as_short[3]) ? 
m1.as_short[3] : m2.as_short[3]; + + return (__m64) res.as_m64; +#endif +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmaxsw (__m64 __A, __m64 __B) +{ + return _mm_max_pi16 (__A, __B); +} + +/* Compute the element-wise maximum of unsigned 8-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_pu8 (__m64 __A, __m64 __B) +{ +#if _ARCH_PWR8 + __vector unsigned char a, b, r; + __vector __bool char c; + + a = (__vector unsigned char)vec_splats (__A); + b = (__vector unsigned char)vec_splats (__B); + c = (__vector __bool char)vec_cmpgt (a, b); + r = vec_sel (b, a, c); + return (__m64) ((__vector long long) r)[0]; +#else + __m64_union m1, m2, res; + long i; + + m1.as_m64 = __A; + m2.as_m64 = __B; + + + for (i = 0; i < 8; i++) + res.as_char[i] = + ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ? + m1.as_char[i] : m2.as_char[i]; + + return (__m64) res.as_m64; +#endif +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmaxub (__m64 __A, __m64 __B) +{ + return _mm_max_pu8 (__A, __B); +} + +/* Compute the element-wise minimum of signed 16-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pi16 (__m64 __A, __m64 __B) +{ +#if _ARCH_PWR8 + __vector signed short a, b, r; + __vector __bool short c; + + a = (__vector signed short)vec_splats (__A); + b = (__vector signed short)vec_splats (__B); + c = (__vector __bool short)vec_cmplt (a, b); + r = vec_sel (b, a, c); + return (__m64) ((__vector long long) r)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __A; + m2.as_m64 = __B; + + res.as_short[0] = + (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; + res.as_short[1] = + (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; + res.as_short[2] = + (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; + res.as_short[3] = + (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; + + return (__m64) res.as_m64; +#endif +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pminsw (__m64 __A, __m64 __B) +{ + return _mm_min_pi16 (__A, __B); +} + +/* Compute the element-wise minimum of unsigned 8-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pu8 (__m64 __A, __m64 __B) +{ +#if _ARCH_PWR8 + __vector unsigned char a, b, r; + __vector __bool char c; + + a = (__vector unsigned char)vec_splats (__A); + b = (__vector unsigned char)vec_splats (__B); + c = (__vector __bool char)vec_cmplt (a, b); + r = vec_sel (b, a, c); + return (__m64) ((__vector long long) r)[0]; +#else + __m64_union m1, m2, res; + long i; + + m1.as_m64 = __A; + m2.as_m64 = __B; + + + for (i = 0; i < 8; i++) + res.as_char[i] = + ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ? + m1.as_char[i] : m2.as_char[i]; + + return (__m64) res.as_m64; +#endif +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pminub (__m64 __A, __m64 __B) +{ + return _mm_min_pu8 (__A, __B); +} + +/* Create an 8-bit mask of the signs of 8-bit values. 
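+   An explanatory note on the bpermd trick below (a reading of the code,
+   not upstream text): each byte of the permute-control constant names one
+   bit of __A in big-endian bit order (bit 0 is the MSB), so the selectors
+   0x00,0x08,...,0x38 gather the eight per-byte sign bits into the low
+   byte of the result.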
*/ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_pi8 (__m64 __A) +{ + unsigned long long p = +#ifdef __LITTLE_ENDIAN__ + 0x0008101820283038UL; // permute control for sign bits +#else + 0x3830282018100800UL; // permute control for sign bits +#endif + return __builtin_bpermd (p, __A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmovmskb (__m64 __A) +{ + return _mm_movemask_pi8 (__A); +} + +/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values + in B and produce the high 16 bits of the 32-bit results. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_pu16 (__m64 __A, __m64 __B) +{ + __vector unsigned short a, b; + __vector unsigned short c; + __vector unsigned int w0, w1; + __vector unsigned char xform1 = { +#ifdef __LITTLE_ENDIAN__ + 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, + 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F +#else + 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, + 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15 +#endif + }; + + a = (__vector unsigned short)vec_splats (__A); + b = (__vector unsigned short)vec_splats (__B); + + w0 = vec_vmuleuh (a, b); + w1 = vec_vmulouh (a, b); + c = (__vector unsigned short)vec_perm (w0, w1, xform1); + + return (__m64) ((__vector long long) c)[0]; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmulhuw (__m64 __A, __m64 __B) +{ + return _mm_mulhi_pu16 (__A, __B); +} + +/* Return a combination of the four 16-bit values in A. The selector + must be an immediate. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi16 (__m64 __A, int const __N) +{ + unsigned long element_selector_10 = __N & 0x03; + unsigned long element_selector_32 = (__N >> 2) & 0x03; + unsigned long element_selector_54 = (__N >> 4) & 0x03; + unsigned long element_selector_76 = (__N >> 6) & 0x03; + static const unsigned short permute_selectors[4] = + { +#ifdef __LITTLE_ENDIAN__ + 0x0908, 0x0B0A, 0x0D0C, 0x0F0E +#else + 0x0607, 0x0405, 0x0203, 0x0001 +#endif + }; + __m64_union t; + __vector unsigned long long a, p, r; + +#ifdef __LITTLE_ENDIAN__ + t.as_short[0] = permute_selectors[element_selector_10]; + t.as_short[1] = permute_selectors[element_selector_32]; + t.as_short[2] = permute_selectors[element_selector_54]; + t.as_short[3] = permute_selectors[element_selector_76]; +#else + t.as_short[3] = permute_selectors[element_selector_10]; + t.as_short[2] = permute_selectors[element_selector_32]; + t.as_short[1] = permute_selectors[element_selector_54]; + t.as_short[0] = permute_selectors[element_selector_76]; +#endif + p = vec_splats (t.as_m64); + a = vec_splats (__A); + r = vec_perm (a, a, (__vector unsigned char)p); + return (__m64) ((__vector long long) r)[0]; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pshufw (__m64 __A, int const __N) +{ + return _mm_shuffle_pi16 (__A, __N); +} + +/* Conditionally store byte elements of A into P. The high bit of each + byte in the selector N determines whether the corresponding byte from + A is stored. 
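+   For example (illustrative values, not upstream text): with
+   __N = 0x0000000000008080UL only the two low-order bytes of A are
+   written; all other bytes at P keep their previous contents.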
+   */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
+{
+  __m64 hibit = 0x8080808080808080UL;
+  __m64 mask, tmp;
+  __m64 *p = (__m64*)__P;
+
+  tmp = *p;
+  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
+  tmp = (tmp & (~mask)) | (__A & mask);
+  *p = tmp;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_maskmovq (__m64 __A, __m64 __N, char *__P)
+{
+  _mm_maskmove_si64 (__A, __N, __P);
+}
+
+/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu8 (__m64 __A, __m64 __B)
+{
+  __vector unsigned char a, b, c;
+
+  a = (__vector unsigned char)vec_splats (__A);
+  b = (__vector unsigned char)vec_splats (__B);
+  c = vec_avg (a, b);
+  return (__m64) ((__vector long long) c)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgb (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu8 (__A, __B);
+}
+
+/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu16 (__m64 __A, __m64 __B)
+{
+  __vector unsigned short a, b, c;
+
+  a = (__vector unsigned short)vec_splats (__A);
+  b = (__vector unsigned short)vec_splats (__B);
+  c = vec_avg (a, b);
+  return (__m64) ((__vector long long) c)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgw (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu16 (__A, __B);
+}
+
+/* Compute the sum of the absolute differences of the unsigned 8-bit
+   values in A and B. Return the value in the lower 16-bit word; the
+   upper words are cleared. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_pu8 (__m64 __A, __m64 __B)
+{
+  __vector unsigned char a, b;
+  __vector unsigned char vmin, vmax, vabsdiff;
+  __vector signed int vsum;
+  const __vector unsigned int zero =
+    { 0, 0, 0, 0 };
+  __m64_union result = {0};
+
+  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
+  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
+  vmin = vec_min (a, b);
+  vmax = vec_max (a, b);
+  vabsdiff = vec_sub (vmax, vmin);
+  /* Sum four groups of bytes into integers. */
+  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
+  /* Sum across four integers with integer result. */
+  vsum = vec_sums (vsum, (__vector signed int) zero);
+  /* The sum is in the rightmost 32 bits of the vector result.
+     Transfer to a GPR and truncate to 16 bits. */
+  result.as_short[0] = vsum[3];
+  return result.as_m64;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psadbw (__m64 __A, __m64 __B)
+{
+  return _mm_sad_pu8 (__A, __B);
+}
+
+/* Stores the data in A to the address P without polluting the caches. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pi (__m64 *__P, __m64 __A)
+{
+  /* Use the data cache block touch for store transient. */
+  __asm__ (
+    "  dcbtstt 0,%0"
+    :
+    : "b" (__P)
+    : "memory"
+    );
+  *__P = __A;
+}
+
+/* Likewise. The address must be 16-byte aligned.
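+   (Explanatory note, not upstream text: as in _mm_stream_pi above,
+   dcbtstt merely hints that the cache block is transient; the store
+   itself is the ordinary _mm_store_ps below.)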
+   */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_ps (float *__P, __m128 __A)
+{
+  /* Use the data cache block touch for store transient. */
+  __asm__ (
+    "  dcbtstt 0,%0"
+    :
+    : "b" (__P)
+    : "memory"
+    );
+  _mm_store_ps (__P, __A);
+}
+
+/* Guarantees that every preceding store is globally visible before
+   any subsequent store. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sfence (void)
+{
+  /* Generate a lightweight sync. */
+  __atomic_thread_fence (__ATOMIC_RELEASE);
+}
+
+/* The execution of the next instruction is delayed by an
+   implementation-specific amount of time. The instruction does not
+   modify the architectural state. This is after the pop_options pragma
+   because it does not require SSE support in the processor--the encoding
+   is a nop on processors that do not support it. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_pause (void)
+{
+  /* There is no exact match with this construct, but the following is
+     close to the desired effect. */
+#if _ARCH_PWR8
+  /* On power8 and later processors we can depend on Program Priority
+     (PRI) and the associated "very low" PRI setting. Since we don't know
+     what PRI this thread is running at we: 1) save the current PRI
+     from the PPR SPR into a local GPR, 2) set the PRI to "very low"
+     via the special or 31,31,31 encoding, 3) issue an "isync" to
+     ensure the PRI change takes effect before we execute any more
+     instructions.
+     Now we can execute a lwsync (release barrier) while we execute
+     this thread at "very low" PRI. Finally we restore the original
+     PRI and continue execution. */
+  unsigned long __PPR;
+
+  __asm__ volatile (
+    "  mfppr %0;"
+    "  or 31,31,31;"
+    "  isync;"
+    "  lwsync;"
+    "  isync;"
+    "  mtppr %0;"
+    : "=r" (__PPR)
+    :
+    : "memory"
+    );
+#else
+  /* For older processors where we may not even have Program Priority
+     controls we can only depend on Heavy Weight Sync. */
+  __atomic_thread_fence (__ATOMIC_SEQ_CST);
+#endif
+}
+
+/* Transpose the 4x4 matrix composed of row[0-3]. */
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
+  __v4sf __t0 = vec_vmrghw (__r0, __r1); \
+  __v4sf __t1 = vec_vmrghw (__r2, __r3); \
+  __v4sf __t2 = vec_vmrglw (__r0, __r1); \
+  __v4sf __t3 = vec_vmrglw (__r2, __r3); \
+  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
+			       (__vector long long)__t1); \
+  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
+			       (__vector long long)__t1); \
+  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
+			       (__vector long long)__t3); \
+  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
+			       (__vector long long)__t3); \
+} while (0)
+
+/* For backward source compatibility.
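+
+   An illustrative use of the _MM_TRANSPOSE4_PS macro above (the row
+   values here are hypothetical, not upstream text):
+
+     __v4sf r0 = {0,1,2,3}, r1 = {4,5,6,7},
+            r2 = {8,9,10,11}, r3 = {12,13,14,15};
+     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
+
+   afterwards r0 = {0,4,8,12}, r1 = {1,5,9,13}, r2 = {2,6,10,14},
+   r3 = {3,7,11,15}.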
*/ +//# include + +#endif /* _XMMINTRIN_H_INCLUDED */ diff --git a/lib/include/prfchwintrin.h b/lib/include/prfchwintrin.h index 70851396f..6e8a4ef2e 100644 --- a/lib/include/prfchwintrin.h +++ b/lib/include/prfchwintrin.h @@ -1,22 +1,8 @@ /*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/ptwriteintrin.h b/lib/include/ptwriteintrin.h index 1bb1df0a2..0a04f7c1d 100644 --- a/lib/include/ptwriteintrin.h +++ b/lib/include/ptwriteintrin.h @@ -1,22 +1,8 @@ /*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/rdseedintrin.h b/lib/include/rdseedintrin.h index 419466932..ccb3d2dd2 100644 --- a/lib/include/rdseedintrin.h +++ b/lib/include/rdseedintrin.h @@ -1,22 +1,8 @@ /*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/rtmintrin.h b/lib/include/rtmintrin.h index e6a58d743..36ff58351 100644 --- a/lib/include/rtmintrin.h +++ b/lib/include/rtmintrin.h @@ -1,22 +1,8 @@ /*===---- rtmintrin.h - RTM intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/s390intrin.h b/lib/include/s390intrin.h index d51274c07..73a915c23 100644 --- a/lib/include/s390intrin.h +++ b/lib/include/s390intrin.h @@ -1,22 +1,8 @@ /*===---- s390intrin.h - SystemZ intrinsics --------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/sgxintrin.h b/lib/include/sgxintrin.h index 20aee7661..303a21f6b 100644 --- a/lib/include/sgxintrin.h +++ b/lib/include/sgxintrin.h @@ -1,22 +1,8 @@ /*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -28,6 +14,8 @@ #ifndef __SGXINTRIN_H #define __SGXINTRIN_H +#if __has_extension(gnu_asm) + /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("sgx"))) @@ -67,4 +55,6 @@ _enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) #undef __DEFAULT_FN_ATTRS +#endif /* __has_extension(gnu_asm) */ + #endif diff --git a/lib/include/shaintrin.h b/lib/include/shaintrin.h index 3df4718ce..08b1fb1dc 100644 --- a/lib/include/shaintrin.h +++ b/lib/include/shaintrin.h @@ -1,22 +1,8 @@ /*===---- shaintrin.h - SHA intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/smmintrin.h b/lib/include/smmintrin.h index 4806b3e4e..025830a74 100644 --- a/lib/include/smmintrin.h +++ b/lib/include/smmintrin.h @@ -1,22 +1,8 @@ /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stdalign.h b/lib/include/stdalign.h index 3738d1284..6ad25db45 100644 --- a/lib/include/stdalign.h +++ b/lib/include/stdalign.h @@ -1,22 +1,8 @@ /*===---- stdalign.h - Standard header for alignment ------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stdarg.h b/lib/include/stdarg.h index 101426fff..0bc39408c 100644 --- a/lib/include/stdarg.h +++ b/lib/include/stdarg.h @@ -1,24 +1,8 @@ /*===---- stdarg.h - Variable argument handling ----------------------------=== * - * Copyright (c) 2008 Eli Friedman - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stdatomic.h b/lib/include/stdatomic.h index b4845a74e..665551ea6 100644 --- a/lib/include/stdatomic.h +++ b/lib/include/stdatomic.h @@ -1,22 +1,8 @@ /*===---- stdatomic.h - Standard header for atomic types and operations -----=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stdbool.h b/lib/include/stdbool.h index 5cb66b55d..2525363dd 100644 --- a/lib/include/stdbool.h +++ b/lib/include/stdbool.h @@ -1,24 +1,8 @@ /*===---- stdbool.h - Standard header for booleans -------------------------=== * - * Copyright (c) 2008 Eli Friedman - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stddef.h b/lib/include/stddef.h index 735499671..15acd4427 100644 --- a/lib/include/stddef.h +++ b/lib/include/stddef.h @@ -1,24 +1,8 @@ /*===---- stddef.h - Basic type definitions --------------------------------=== * - * Copyright (c) 2008 Eli Friedman - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stdint.h b/lib/include/stdint.h index 0afcca3a9..192f653e9 100644 --- a/lib/include/stdint.h +++ b/lib/include/stdint.h @@ -1,29 +1,18 @@ /*===---- stdint.h - Standard header for sized integer types --------------===*\ * - * Copyright (c) 2009 Chris Lattner - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ #ifndef __CLANG_STDINT_H +// AIX system headers need stdint.h to be re-enterable while _STD_TYPES_T +// is defined until an inclusion of it without _STD_TYPES_T occurs, in which +// case the header guard macro is defined. +#if !defined(_AIX) || !defined(_STD_TYPES_T) || !defined(__STDC_HOSTED__) #define __CLANG_STDINT_H +#endif /* If we're hosted, fall back to the system's stdint.h, which might have * additional definitions. diff --git a/lib/include/stdnoreturn.h b/lib/include/stdnoreturn.h index a7a301d7e..e83cd8153 100644 --- a/lib/include/stdnoreturn.h +++ b/lib/include/stdnoreturn.h @@ -1,22 +1,8 @@ /*===---- stdnoreturn.h - Standard header for noreturn macro ---------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/tbmintrin.h b/lib/include/tbmintrin.h index 1d0d746a8..f4e848a1c 100644 --- a/lib/include/tbmintrin.h +++ b/lib/include/tbmintrin.h @@ -1,22 +1,8 @@ /*===---- tbmintrin.h - TBM intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/tgmath.h b/lib/include/tgmath.h index 34e26dcc0..7acf18b9d 100644 --- a/lib/include/tgmath.h +++ b/lib/include/tgmath.h @@ -1,24 +1,8 @@ /*===---- tgmath.h - Standard header for type generic math ----------------===*\ * - * Copyright (c) 2009 Howard Hinnant - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ diff --git a/lib/include/tmmintrin.h b/lib/include/tmmintrin.h index 734cd391b..35533e115 100644 --- a/lib/include/tmmintrin.h +++ b/lib/include/tmmintrin.h @@ -1,22 +1,8 @@ /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/unwind.h b/lib/include/unwind.h index 0e8317e5b..029524b7b 100644 --- a/lib/include/unwind.h +++ b/lib/include/unwind.h @@ -1,22 +1,8 @@ /*===---- unwind.h - Stack unwinding ----------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -66,8 +52,8 @@ extern "C" { #pragma GCC visibility push(default) #endif -typedef uintptr_t _Unwind_Word; -typedef intptr_t _Unwind_Sword; +typedef uintptr_t _Unwind_Word __attribute__((__mode__(__unwind_word__))); +typedef intptr_t _Unwind_Sword __attribute__((__mode__(__unwind_word__))); typedef uintptr_t _Unwind_Ptr; typedef uintptr_t _Unwind_Internal_Ptr; typedef uint64_t _Unwind_Exception_Class; diff --git a/lib/include/vadefs.h b/lib/include/vadefs.h index 7fe9a74e3..b61756844 100644 --- a/lib/include/vadefs.h +++ b/lib/include/vadefs.h @@ -1,22 +1,8 @@ /* ===-------- vadefs.h ---------------------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/vaesintrin.h b/lib/include/vaesintrin.h index e4174bb82..c4d5c3e75 100644 --- a/lib/include/vaesintrin.h +++ b/lib/include/vaesintrin.h @@ -1,23 +1,9 @@ /*===------------------ vaesintrin.h - VAES intrinsics ---------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/varargs.h b/lib/include/varargs.h index b5477d0a6..d241b7de3 100644 --- a/lib/include/varargs.h +++ b/lib/include/varargs.h @@ -1,22 +1,8 @@ /*===---- varargs.h - Variable argument handling -------------------------------------=== * -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -* THE SOFTWARE. +* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/vecintrin.h b/lib/include/vecintrin.h index e62738983..c71b76a3e 100644 --- a/lib/include/vecintrin.h +++ b/lib/include/vecintrin.h @@ -1,22 +1,8 @@ /*===---- vecintrin.h - Vector intrinsics ----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -531,6 +517,141 @@ vec_bperm_u128(vector unsigned char __a, vector unsigned char __b) { } #endif +/*-- vec_revb ---------------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector signed short +vec_revb(vector signed short __vec) { + return (vector signed short) + __builtin_s390_vlbrh((vector unsigned short)__vec); +} + +static inline __ATTRS_o_ai vector unsigned short +vec_revb(vector unsigned short __vec) { + return __builtin_s390_vlbrh(__vec); +} + +static inline __ATTRS_o_ai vector signed int +vec_revb(vector signed int __vec) { + return (vector signed int) + __builtin_s390_vlbrf((vector unsigned int)__vec); +} + +static inline __ATTRS_o_ai vector unsigned int +vec_revb(vector unsigned int __vec) { + return __builtin_s390_vlbrf(__vec); +} + +static inline __ATTRS_o_ai vector signed long long +vec_revb(vector signed long long __vec) { + return (vector signed long long) + __builtin_s390_vlbrg((vector unsigned long long)__vec); +} + +static inline __ATTRS_o_ai vector unsigned long long +vec_revb(vector unsigned long long __vec) { + return __builtin_s390_vlbrg(__vec); +} + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_revb(vector float __vec) { + return (vector float) + __builtin_s390_vlbrf((vector unsigned int)__vec); +} +#endif + +static inline __ATTRS_o_ai vector double +vec_revb(vector double __vec) { + return (vector double) + __builtin_s390_vlbrg((vector unsigned long long)__vec); +} + +/*-- vec_reve ---------------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector signed char +vec_reve(vector signed char __vec) { + return (vector signed char) { __vec[15], __vec[14], __vec[13], __vec[12], + __vec[11], __vec[10], __vec[9], __vec[8], + __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector unsigned char +vec_reve(vector unsigned char __vec) { + return (vector unsigned char) { __vec[15], __vec[14], __vec[13], __vec[12], + __vec[11], __vec[10], __vec[9], __vec[8], + __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector bool char +vec_reve(vector bool char __vec) { + return (vector bool char) { __vec[15], __vec[14], __vec[13], __vec[12], + __vec[11], __vec[10], __vec[9], __vec[8], + __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector signed short +vec_reve(vector signed short __vec) { + return (vector signed short) { __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector unsigned short +vec_reve(vector unsigned short __vec) { + return (vector unsigned short) { __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector bool short +vec_reve(vector bool short __vec) { + return (vector bool short) { __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector signed int +vec_reve(vector signed int __vec) { + return (vector signed int) { __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector unsigned int +vec_reve(vector unsigned int __vec) { + return (vector unsigned int) { __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static 
inline __ATTRS_o_ai vector bool int +vec_reve(vector bool int __vec) { + return (vector bool int) { __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector signed long long +vec_reve(vector signed long long __vec) { + return (vector signed long long) { __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector unsigned long long +vec_reve(vector unsigned long long __vec) { + return (vector unsigned long long) { __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector bool long long +vec_reve(vector bool long long __vec) { + return (vector bool long long) { __vec[1], __vec[0] }; +} + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_reve(vector float __vec) { + return (vector float) { __vec[3], __vec[2], __vec[1], __vec[0] }; +} +#endif + +static inline __ATTRS_o_ai vector double +vec_reve(vector double __vec) { + return (vector double) { __vec[1], __vec[0] }; +} + /*-- vec_sel ----------------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed char @@ -6849,6 +6970,56 @@ vec_sldw(vector double __a, vector double __b, int __c) __builtin_s390_vsldb((vector unsigned char)(X), \ (vector unsigned char)(Y), (Z) * 4)) +/*-- vec_sldb ---------------------------------------------------------------*/ + +#if __ARCH__ >= 13 + +extern __ATTRS_o vector signed char +vec_sldb(vector signed char __a, vector signed char __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned char +vec_sldb(vector unsigned char __a, vector unsigned char __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector signed short +vec_sldb(vector signed short __a, vector signed short __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned short +vec_sldb(vector unsigned short __a, vector unsigned short __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector signed int +vec_sldb(vector signed int __a, vector signed int __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned int +vec_sldb(vector unsigned int __a, vector unsigned int __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector signed long long +vec_sldb(vector signed long long __a, vector signed long long __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned long long +vec_sldb(vector unsigned long long __a, vector unsigned long long __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector float +vec_sldb(vector float __a, vector float __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector double +vec_sldb(vector double __a, vector double __b, int __c) + __constant_range(__c, 0, 7); + +#define vec_sldb(X, Y, Z) ((__typeof__((vec_sldb)((X), (Y), (Z)))) \ + __builtin_s390_vsld((vector unsigned char)(X), \ + (vector unsigned char)(Y), (Z))) + +#endif + /*-- vec_sral ---------------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed char @@ -7579,6 +7750,56 @@ vec_srb(vector double __a, vector unsigned long long __b) { (vector unsigned char)__a, (vector unsigned char)__b); } +/*-- vec_srdb ---------------------------------------------------------------*/ + +#if __ARCH__ >= 13 + +extern __ATTRS_o vector signed char +vec_srdb(vector signed char __a, vector signed char __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned char +vec_srdb(vector unsigned char __a, vector unsigned char __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o 
vector signed short +vec_srdb(vector signed short __a, vector signed short __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned short +vec_srdb(vector unsigned short __a, vector unsigned short __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector signed int +vec_srdb(vector signed int __a, vector signed int __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned int +vec_srdb(vector unsigned int __a, vector unsigned int __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector signed long long +vec_srdb(vector signed long long __a, vector signed long long __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned long long +vec_srdb(vector unsigned long long __a, vector unsigned long long __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector float +vec_srdb(vector float __a, vector float __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector double +vec_srdb(vector double __a, vector double __b, int __c) + __constant_range(__c, 0, 7); + +#define vec_srdb(X, Y, Z) ((__typeof__((vec_srdb)((X), (Y), (Z)))) \ + __builtin_s390_vsrd((vector unsigned char)(X), \ + (vector unsigned char)(Y), (Z))) + +#endif + /*-- vec_abs ----------------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed char @@ -8725,6 +8946,22 @@ vec_double(vector unsigned long long __a) { return __builtin_convertvector(__a, vector double); } +/*-- vec_float --------------------------------------------------------------*/ + +#if __ARCH__ >= 13 + +static inline __ATTRS_o_ai vector float +vec_float(vector signed int __a) { + return __builtin_convertvector(__a, vector float); +} + +static inline __ATTRS_o_ai vector float +vec_float(vector unsigned int __a) { + return __builtin_convertvector(__a, vector float); +} + +#endif + /*-- vec_signed -------------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed long long @@ -8732,6 +8969,13 @@ vec_signed(vector double __a) { return __builtin_convertvector(__a, vector signed long long); } +#if __ARCH__ >= 13 +static inline __ATTRS_o_ai vector signed int +vec_signed(vector float __a) { + return __builtin_convertvector(__a, vector signed int); +} +#endif + /*-- vec_unsigned -----------------------------------------------------------*/ static inline __ATTRS_o_ai vector unsigned long long @@ -8739,6 +8983,13 @@ vec_unsigned(vector double __a) { return __builtin_convertvector(__a, vector unsigned long long); } +#if __ARCH__ >= 13 +static inline __ATTRS_o_ai vector unsigned int +vec_unsigned(vector float __a) { + return __builtin_convertvector(__a, vector unsigned int); +} +#endif + /*-- vec_roundp -------------------------------------------------------------*/ #if __ARCH__ >= 12 @@ -10456,6 +10707,147 @@ vec_find_any_ne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b, return __builtin_s390_vfaezfs(__a, __b, 8, __cc); } +/*-- vec_search_string_cc ---------------------------------------------------*/ + +#if __ARCH__ >= 13 + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector signed char __a, vector signed char __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsb((vector unsigned char)__a, + (vector unsigned char)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector bool char __a, vector bool char __b, + vector unsigned char __c, int *__cc) { + return 
__builtin_s390_vstrsb((vector unsigned char)__a, + (vector unsigned char)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector unsigned char __a, vector unsigned char __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsb(__a, __b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector signed short __a, vector signed short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsh((vector unsigned short)__a, + (vector unsigned short)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector bool short __a, vector bool short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsh((vector unsigned short)__a, + (vector unsigned short)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector unsigned short __a, vector unsigned short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsh(__a, __b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector signed int __a, vector signed int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsf((vector unsigned int)__a, + (vector unsigned int)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector bool int __a, vector bool int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsf((vector unsigned int)__a, + (vector unsigned int)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector unsigned int __a, vector unsigned int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsf(__a, __b, __c, __cc); +} + +#endif + +/*-- vec_search_string_until_zero_cc ----------------------------------------*/ + +#if __ARCH__ >= 13 + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector signed char __a, + vector signed char __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszb((vector unsigned char)__a, + (vector unsigned char)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector bool char __a, + vector bool char __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszb((vector unsigned char)__a, + (vector unsigned char)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector unsigned char __a, + vector unsigned char __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszb(__a, __b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector signed short __a, + vector signed short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszh((vector unsigned short)__a, + (vector unsigned short)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector bool short __a, + vector bool short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszh((vector unsigned short)__a, + (vector unsigned short)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector unsigned short __a, + vector unsigned short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszh(__a, __b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned 
char +vec_search_string_until_zero_cc(vector signed int __a, + vector signed int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszf((vector unsigned int)__a, + (vector unsigned int)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector bool int __a, + vector bool int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszf((vector unsigned int)__a, + (vector unsigned int)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector unsigned int __a, + vector unsigned int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszf(__a, __b, __c, __cc); +} + +#endif + #undef __constant_pow2_range #undef __constant_range #undef __constant diff --git a/lib/include/vpclmulqdqintrin.h b/lib/include/vpclmulqdqintrin.h index 86174a457..470d83254 100644 --- a/lib/include/vpclmulqdqintrin.h +++ b/lib/include/vpclmulqdqintrin.h @@ -1,23 +1,9 @@ /*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/waitpkgintrin.h b/lib/include/waitpkgintrin.h index e29d6cfa5..7ecada4cf 100644 --- a/lib/include/waitpkgintrin.h +++ b/lib/include/waitpkgintrin.h @@ -1,22 +1,8 @@ /*===----------------------- waitpkgintrin.h - WAITPKG --------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/wbnoinvdintrin.h b/lib/include/wbnoinvdintrin.h index cad83368d..cac0347ef 100644 --- a/lib/include/wbnoinvdintrin.h +++ b/lib/include/wbnoinvdintrin.h @@ -1,22 +1,8 @@ /*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic-------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/wmmintrin.h b/lib/include/wmmintrin.h index 569a8d838..f932ca810 100644 --- a/lib/include/wmmintrin.h +++ b/lib/include/wmmintrin.h @@ -1,22 +1,8 @@ /*===---- wmmintrin.h - AES intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/x86intrin.h b/lib/include/x86intrin.h index 728c58c3e..a8b36622d 100644 --- a/lib/include/x86intrin.h +++ b/lib/include/x86intrin.h @@ -1,22 +1,8 @@ /*===---- x86intrin.h - X86 intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/xmmintrin.h b/lib/include/xmmintrin.h index 17af17267..75ff37655 100644 --- a/lib/include/xmmintrin.h +++ b/lib/include/xmmintrin.h @@ -1,22 +1,8 @@ /*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -28,7 +14,9 @@ typedef int __v4si __attribute__((__vector_size__(16))); typedef float __v4sf __attribute__((__vector_size__(16))); -typedef float __m128 __attribute__((__vector_size__(16))); +typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); + +typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1))); /* Unsigned types */ typedef unsigned int __v4su __attribute__((__vector_size__(16))); @@ -1752,7 +1740,7 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p) { struct __loadu_ps { - __m128 __v; + __m128_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_ps*)__p)->__v; } @@ -1931,7 +1919,11 @@ _mm_setzero_ps(void) static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a) { - __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a); + typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_storeh_pi_struct { + __mm_storeh_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3); } /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a @@ -1948,7 +1940,11 @@ _mm_storeh_pi(__m64 *__p, __m128 __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a) { - __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a); + typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_storeh_pi_struct { + __mm_storeh_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1); } /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a @@ -1987,7 +1983,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a) { struct __storeu_ps { - __m128 __v; + __m128_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_ps*)__p)->__v = __a; } diff --git a/lib/include/xopintrin.h b/lib/include/xopintrin.h index 9d540a2ab..5cedde41b 100644 --- a/lib/include/xopintrin.h +++ b/lib/include/xopintrin.h @@ -1,22 +1,8 @@ /*===---- xopintrin.h - XOP intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/xsavecintrin.h b/lib/include/xsavecintrin.h index 25577a95f..5524947fa 100644 --- a/lib/include/xsavecintrin.h +++ b/lib/include/xsavecintrin.h @@ -1,22 +1,8 @@ /*===---- xsavecintrin.h - XSAVEC intrinsic --------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/xsaveintrin.h b/lib/include/xsaveintrin.h index 16f3a78d3..9429db6dd 100644 --- a/lib/include/xsaveintrin.h +++ b/lib/include/xsaveintrin.h @@ -1,22 +1,8 @@ /*===---- xsaveintrin.h - XSAVE intrinsic ----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -28,6 +14,10 @@ #ifndef __XSAVEINTRIN_H #define __XSAVEINTRIN_H +#ifdef _MSC_VER +#define _XCR_XFEATURE_ENABLED_MASK 0 +#endif + /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsave"))) @@ -41,6 +31,20 @@ _xrstor(void *__p, unsigned long long __m) { __builtin_ia32_xrstor(__p, __m); } +#ifndef _MSC_VER +#define _xgetbv(A) __builtin_ia32_xgetbv((long long)(A)) +#define _xsetbv(A, B) __builtin_ia32_xsetbv((unsigned int)(A), (unsigned long long)(B)) +#else +#ifdef __cplusplus +extern "C" { +#endif +unsigned __int64 __cdecl _xgetbv(unsigned int); +void __cdecl _xsetbv(unsigned int, unsigned __int64); +#ifdef __cplusplus +} +#endif +#endif /* _MSC_VER */ + #ifdef __x86_64__ static __inline__ void __DEFAULT_FN_ATTRS _xsave64(void *__p, unsigned long long __m) { @@ -51,6 +55,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _xrstor64(void *__p, unsigned long long __m) { __builtin_ia32_xrstor64(__p, __m); } + #endif #undef __DEFAULT_FN_ATTRS diff --git a/lib/include/xsaveoptintrin.h b/lib/include/xsaveoptintrin.h index 792cf92d4..89a4c44db 100644 --- a/lib/include/xsaveoptintrin.h +++ b/lib/include/xsaveoptintrin.h @@ -1,22 +1,8 @@ /*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ----------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/xsavesintrin.h b/lib/include/xsavesintrin.h index fe2bc4b93..3f99219a2 100644 --- a/lib/include/xsavesintrin.h +++ b/lib/include/xsavesintrin.h @@ -1,22 +1,8 @@ /*===---- xsavesintrin.h - XSAVES intrinsic --------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/xtestintrin.h b/lib/include/xtestintrin.h index 924424386..7d19e3733 100644 --- a/lib/include/xtestintrin.h +++ b/lib/include/xtestintrin.h @@ -1,22 +1,8 @@ /*===---- xtestintrin.h - XTEST intrinsic ----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */
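
Usage note (not part of the upstream patch): the sketch below is a minimal, hypothetical example exercising two of the interfaces touched by this header update -- the unaligned __m128 load/store path in xmmintrin.h, which now goes through the 1-byte-aligned __m128_u type, and the _xgetbv() wrapper added to xsaveintrin.h. The file name and compile flags are illustrative assumptions; something like `clang -msse -mxsave xcr0_demo.c` should build it on x86-64.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* _mm_loadu_ps/_mm_storeu_ps accept any alignment; with this patch
     * they are implemented by reading/writing through __m128_u, which
     * is declared with __attribute__((__aligned__(1))). */
    float src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float dst[5] = {0};
    __m128 v = _mm_loadu_ps(src);
    _mm_storeu_ps(dst + 1, v);   /* dst+1 is only 4-byte aligned */
    printf("%g %g %g %g\n", dst[1], dst[2], dst[3], dst[4]);

    /* _xgetbv(0) reads XCR0 (MSVC names the index
     * _XCR_XFEATURE_ENABLED_MASK), reporting which XSAVE state
     * components the OS has enabled: bit 0 = x87, bit 1 = SSE,
     * bit 2 = AVX. */
    unsigned long long xcr0 = _xgetbv(0);
    printf("XCR0 = %#llx\n", xcr0);
    return 0;
}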