From 2117fbdae35dddf368c4ce5bb39cc73fa0f78d4c Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 19 Jul 2019 16:50:45 -0400 Subject: [PATCH] update C headers to llvm9 upstream commit 1931d3cb20a00da732c5210b123656632982fde0 --- lib/include/__clang_cuda_builtin_vars.h | 20 +- lib/include/__clang_cuda_cmath.h | 49 +- lib/include/__clang_cuda_complex_builtins.h | 20 +- lib/include/__clang_cuda_device_functions.h | 93 +- lib/include/__clang_cuda_intrinsics.h | 20 +- lib/include/__clang_cuda_libdevice_declares.h | 890 ++++--- .../__clang_cuda_math_forward_declares.h | 70 +- lib/include/__clang_cuda_runtime_wrapper.h | 32 +- lib/include/__stddef_max_align_t.h | 22 +- lib/include/__wmmintrin_aes.h | 20 +- lib/include/__wmmintrin_pclmul.h | 20 +- lib/include/adxintrin.h | 20 +- lib/include/altivec.h | 20 +- lib/include/ammintrin.h | 20 +- lib/include/arm64intr.h | 20 +- lib/include/arm_acle.h | 38 +- lib/include/arm_neon.h | 396 +-- lib/include/armintr.h | 20 +- lib/include/avx2intrin.h | 32 +- lib/include/avx512bf16intrin.h | 279 ++ lib/include/avx512bitalgintrin.h | 20 +- lib/include/avx512bwintrin.h | 44 +- lib/include/avx512cdintrin.h | 52 +- lib/include/avx512dqintrin.h | 20 +- lib/include/avx512erintrin.h | 20 +- lib/include/avx512fintrin.h | 115 +- lib/include/avx512ifmaintrin.h | 20 +- lib/include/avx512ifmavlintrin.h | 20 +- lib/include/avx512pfintrin.h | 20 +- lib/include/avx512vbmi2intrin.h | 20 +- lib/include/avx512vbmiintrin.h | 20 +- lib/include/avx512vbmivlintrin.h | 20 +- lib/include/avx512vlbf16intrin.h | 474 ++++ lib/include/avx512vlbitalgintrin.h | 20 +- lib/include/avx512vlbwintrin.h | 36 +- lib/include/avx512vlcdintrin.h | 86 +- lib/include/avx512vldqintrin.h | 52 +- lib/include/avx512vlintrin.h | 77 +- lib/include/avx512vlvbmi2intrin.h | 20 +- lib/include/avx512vlvnniintrin.h | 20 +- lib/include/avx512vlvp2intersectintrin.h | 121 + lib/include/avx512vnniintrin.h | 20 +- lib/include/avx512vp2intersectintrin.h | 77 + lib/include/avx512vpopcntdqintrin.h | 20 +- lib/include/avx512vpopcntdqvlintrin.h | 20 +- lib/include/avxintrin.h | 50 +- lib/include/bmi2intrin.h | 20 +- lib/include/bmiintrin.h | 20 +- lib/include/cetintrin.h | 20 +- lib/include/cldemoteintrin.h | 20 +- lib/include/clflushoptintrin.h | 20 +- lib/include/clwbintrin.h | 20 +- lib/include/clzerointrin.h | 20 +- lib/include/cpuid.h | 24 +- lib/include/emmintrin.h | 55 +- lib/include/enqcmdintrin.h | 63 + lib/include/f16cintrin.h | 26 +- lib/include/float.h | 28 +- lib/include/fma4intrin.h | 20 +- lib/include/fmaintrin.h | 20 +- lib/include/fxsrintrin.h | 20 +- lib/include/gfniintrin.h | 20 +- lib/include/htmintrin.h | 20 +- lib/include/htmxlintrin.h | 20 +- lib/include/ia32intrin.h | 320 ++- lib/include/immintrin.h | 62 +- lib/include/intrin.h | 42 +- lib/include/inttypes.h | 25 +- lib/include/invpcidintrin.h | 20 +- lib/include/iso646.h | 22 +- lib/include/limits.h | 22 +- lib/include/lwpintrin.h | 20 +- lib/include/lzcntintrin.h | 20 +- lib/include/mm3dnow.h | 20 +- lib/include/mm_malloc.h | 20 +- lib/include/mmintrin.h | 22 +- lib/include/module.modulemap | 21 +- lib/include/movdirintrin.h | 20 +- lib/include/msa.h | 20 +- lib/include/mwaitxintrin.h | 20 +- lib/include/nmmintrin.h | 20 +- lib/include/opencl-c-base.h | 578 ++++ lib/include/opencl-c.h | 698 +---- .../openmp_wrappers/__clang_openmp_math.h | 35 + .../__clang_openmp_math_declares.h | 33 + lib/include/openmp_wrappers/cmath | 16 + lib/include/openmp_wrappers/math.h | 17 + lib/include/pconfigintrin.h | 24 +- lib/include/pkuintrin.h | 20 +- 
lib/include/pmmintrin.h | 20 +- lib/include/popcntintrin.h | 52 +- lib/include/ppc_wrappers/emmintrin.h | 2318 +++++++++++++++++ lib/include/ppc_wrappers/mm_malloc.h | 44 + lib/include/ppc_wrappers/mmintrin.h | 1443 ++++++++++ lib/include/ppc_wrappers/xmmintrin.h | 1838 +++++++++++++ lib/include/prfchwintrin.h | 20 +- lib/include/ptwriteintrin.h | 20 +- lib/include/rdseedintrin.h | 20 +- lib/include/rtmintrin.h | 20 +- lib/include/s390intrin.h | 20 +- lib/include/sgxintrin.h | 24 +- lib/include/shaintrin.h | 20 +- lib/include/smmintrin.h | 20 +- lib/include/stdalign.h | 20 +- lib/include/stdarg.h | 22 +- lib/include/stdatomic.h | 20 +- lib/include/stdbool.h | 22 +- lib/include/stddef.h | 22 +- lib/include/stdint.h | 27 +- lib/include/stdnoreturn.h | 20 +- lib/include/tbmintrin.h | 20 +- lib/include/tgmath.h | 22 +- lib/include/tmmintrin.h | 20 +- lib/include/unwind.h | 24 +- lib/include/vadefs.h | 20 +- lib/include/vaesintrin.h | 20 +- lib/include/varargs.h | 20 +- lib/include/vecintrin.h | 426 ++- lib/include/vpclmulqdqintrin.h | 20 +- lib/include/waitpkgintrin.h | 20 +- lib/include/wbnoinvdintrin.h | 20 +- lib/include/wmmintrin.h | 20 +- lib/include/x86intrin.h | 20 +- lib/include/xmmintrin.h | 40 +- lib/include/xopintrin.h | 20 +- lib/include/xsavecintrin.h | 20 +- lib/include/xsaveintrin.h | 39 +- lib/include/xsaveoptintrin.h | 20 +- lib/include/xsavesintrin.h | 20 +- lib/include/xtestintrin.h | 20 +- 130 files changed, 9519 insertions(+), 3542 deletions(-) create mode 100644 lib/include/avx512bf16intrin.h create mode 100644 lib/include/avx512vlbf16intrin.h create mode 100644 lib/include/avx512vlvp2intersectintrin.h create mode 100644 lib/include/avx512vp2intersectintrin.h create mode 100644 lib/include/enqcmdintrin.h create mode 100644 lib/include/opencl-c-base.h create mode 100644 lib/include/openmp_wrappers/__clang_openmp_math.h create mode 100644 lib/include/openmp_wrappers/__clang_openmp_math_declares.h create mode 100644 lib/include/openmp_wrappers/cmath create mode 100644 lib/include/openmp_wrappers/math.h create mode 100644 lib/include/ppc_wrappers/emmintrin.h create mode 100644 lib/include/ppc_wrappers/mm_malloc.h create mode 100644 lib/include/ppc_wrappers/mmintrin.h create mode 100644 lib/include/ppc_wrappers/xmmintrin.h diff --git a/lib/include/__clang_cuda_builtin_vars.h b/lib/include/__clang_cuda_builtin_vars.h index 290c4b298..2ba1521f2 100644 --- a/lib/include/__clang_cuda_builtin_vars.h +++ b/lib/include/__clang_cuda_builtin_vars.h @@ -1,22 +1,8 @@ /*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/__clang_cuda_cmath.h b/lib/include/__clang_cuda_cmath.h index 5331ba401..834a2e3fd 100644 --- a/lib/include/__clang_cuda_cmath.h +++ b/lib/include/__clang_cuda_cmath.h @@ -1,22 +1,8 @@ /*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -44,12 +30,32 @@ // implementation. Declaring in the global namespace and pulling into namespace // std covers all of the known knowns. +#ifdef _OPENMP +#define __DEVICE__ static __attribute__((always_inline)) +#else #define __DEVICE__ static __device__ __inline__ __attribute__((always_inline)) +#endif +// For C++ 17 we need to include noexcept attribute to be compatible +// with the header-defined version. This may be removed once +// variant is supported. +#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L +#define __NOEXCEPT noexcept +#else +#define __NOEXCEPT +#endif + +#if !(defined(_OPENMP) && defined(__cplusplus)) __DEVICE__ long long abs(long long __n) { return ::llabs(__n); } __DEVICE__ long abs(long __n) { return ::labs(__n); } __DEVICE__ float abs(float __x) { return ::fabsf(__x); } __DEVICE__ double abs(double __x) { return ::fabs(__x); } +#endif +// TODO: remove once variant is supported.
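(Before the const-qualified abs overloads that the TODO above refers to, a brief aside on how these cmath wrappers are consumed. The kernel below is a hypothetical sketch, not part of the patch; it assumes clang's CUDA mode, where __clang_cuda_runtime_wrapper.h pulls this header in automatically.)

/* Sketch: device code can call the standard math names directly. Here
 * fabs(float) resolves to the __DEVICE__ float overload defined above,
 * which forwards to ::fabsf and ultimately to libdevice's __nv_fabsf. */
__global__ void abs_kernel(float *out, const float *in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = fabs(in[i]); /* float overload; no silent promotion to double */
}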
+#if defined(_OPENMP) && defined(__cplusplus) +__DEVICE__ const float abs(const float __x) { return ::fabsf((float)__x); } +__DEVICE__ const double abs(const double __x) { return ::fabs((double)__x); } +#endif __DEVICE__ float acos(float __x) { return ::acosf(__x); } __DEVICE__ float asin(float __x) { return ::asinf(__x); } __DEVICE__ float atan(float __x) { return ::atanf(__x); } @@ -58,9 +64,11 @@ __DEVICE__ float ceil(float __x) { return ::ceilf(__x); } __DEVICE__ float cos(float __x) { return ::cosf(__x); } __DEVICE__ float cosh(float __x) { return ::coshf(__x); } __DEVICE__ float exp(float __x) { return ::expf(__x); } -__DEVICE__ float fabs(float __x) { return ::fabsf(__x); } +__DEVICE__ float fabs(float __x) __NOEXCEPT { return ::fabsf(__x); } __DEVICE__ float floor(float __x) { return ::floorf(__x); } __DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); } +// TODO: remove when variant is supported +#ifndef _OPENMP __DEVICE__ int fpclassify(float __x) { return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, __x); @@ -69,6 +77,7 @@ __DEVICE__ int fpclassify(double __x) { return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, __x); } +#endif __DEVICE__ float frexp(float __arg, int *__exp) { return ::frexpf(__arg, __exp); } @@ -448,7 +457,10 @@ using ::remainderf; using ::remquof; using ::rintf; using ::roundf; +// TODO: remove once variant is supported +#ifndef _OPENMP using ::scalblnf; +#endif using ::scalbnf; using ::sinf; using ::sinhf; @@ -467,6 +479,7 @@ _GLIBCXX_END_NAMESPACE_VERSION } // namespace std #endif +#undef __NOEXCEPT #undef __DEVICE__ #endif diff --git a/lib/include/__clang_cuda_complex_builtins.h b/lib/include/__clang_cuda_complex_builtins.h index beef7deff..576a958b1 100644 --- a/lib/include/__clang_cuda_complex_builtins.h +++ b/lib/include/__clang_cuda_complex_builtins.h @@ -1,22 +1,8 @@ /*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/__clang_cuda_device_functions.h b/lib/include/__clang_cuda_device_functions.h index 67bbc68b1..50ad674f9 100644 --- a/lib/include/__clang_cuda_device_functions.h +++ b/lib/include/__clang_cuda_device_functions.h @@ -1,22 +1,8 @@ /*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -24,15 +10,21 @@ #ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__ #define __CLANG_CUDA_DEVICE_FUNCTIONS_H__ +#ifndef _OPENMP #if CUDA_VERSION < 9000 #error This file is intended to be used with CUDA-9+ only. #endif +#endif // __DEVICE__ is a helper macro with common set of attributes for the wrappers // we implement in this file. We need static in order to avoid emitting unused // functions and __forceinline__ helps inlining these wrappers at -O1. #pragma push_macro("__DEVICE__") +#ifdef _OPENMP +#define __DEVICE__ static __attribute__((always_inline)) +#else #define __DEVICE__ static __device__ __forceinline__ +#endif // libdevice provides fast low precision and slow full-precision implementations // for some functions. Which one gets selected depends on @@ -45,6 +37,15 @@ #define __FAST_OR_SLOW(fast, slow) slow #endif +// For C++ 17 we need to include noexcept attribute to be compatible +// with the header-defined version. This may be removed once +// variant is supported.
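(Stepping back briefly to the __FAST_OR_SLOW definition a few lines up: a minimal sketch of the dispatch. The selector macro __CLANG_CUDA_APPROX_TRANSCENDENTALS__ sits in context lines the hunk elides; it is taken from the upstream header, and the logf wrapper shown mirrors one changed later in this patch.)

/* Sketch: clang predefines __CLANG_CUDA_APPROX_TRANSCENDENTALS__ when
 * -ffast-math or -fcuda-approx-transcendentals is in effect, selecting
 * the fast low-precision libdevice entry point; otherwise the fully
 * precise one is used. */
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
#define __FAST_OR_SLOW(fast, slow) fast
#else
#define __FAST_OR_SLOW(fast, slow) slow
#endif

/* A wrapper then expands to either __nv_fast_logf or __nv_logf: */
__DEVICE__ float logf(float __a) {
  return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
}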
+#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L +#define __NOEXCEPT noexcept +#else +#define __NOEXCEPT +#endif + __DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); } __DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); } __DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); } @@ -52,8 +53,13 @@ __DEVICE__ unsigned int __brev(unsigned int __a) { return __nv_brev(__a); } __DEVICE__ unsigned long long __brevll(unsigned long long __a) { return __nv_brevll(__a); } +#if defined(__cplusplus) __DEVICE__ void __brkpt() { asm volatile("brkpt;"); } __DEVICE__ void __brkpt(int __a) { __brkpt(); } +#else +__DEVICE__ void __attribute__((overloadable)) __brkpt(void) { asm volatile("brkpt;"); } +__DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); } +#endif __DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b, unsigned int __c) { return __nv_byte_perm(__a, __b, __c); @@ -237,6 +243,9 @@ __DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); } __DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); } __DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); } __DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); } +#ifdef _MSC_VER +__DEVICE__ int __finitel(long double __a); +#endif __DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); } __DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); } __DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); } @@ -445,8 +454,14 @@ __DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); } __DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); } __DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); } __DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); } +#ifdef _MSC_VER +__DEVICE__ int __isinfl(long double __a); +#endif __DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); } __DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); } +#ifdef _MSC_VER +__DEVICE__ int __isnanl(long double __a); +#endif __DEVICE__ double __ll2double_rd(long long __a) { return __nv_ll2double_rd(__a); } @@ -520,8 +535,8 @@ __DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) { __DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); } __DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); } __DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); } -__DEVICE__ void __sincosf(float __a, float *__sptr, float *__cptr) { - return __nv_fast_sincosf(__a, __sptr, __cptr); +__DEVICE__ void __sincosf(float __a, float *__s, float *__c) { + return __nv_fast_sincosf(__a, __s, __c); } __DEVICE__ float __sinf(float __a) { return __nv_fast_sinf(__a); } __DEVICE__ int __syncthreads_and(int __a) { return __nvvm_bar0_and(__a); } @@ -1468,7 +1483,8 @@ __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { return r; } #endif // CUDA_VERSION >= 9020 -__DEVICE__ int abs(int __a) { return __nv_abs(__a); } +__DEVICE__ int abs(int __a) __NOEXCEPT { return __nv_abs(__a); } +__DEVICE__ double fabs(double __a) __NOEXCEPT { return __nv_fabs(__a); } __DEVICE__ double acos(double __a) { return __nv_acos(__a); } __DEVICE__ float acosf(float __a) { return __nv_acosf(__a); } __DEVICE__ double acosh(double __a) { return __nv_acosh(__a); } @@ -1487,8 +1503,10 @@ __DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); } __DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); } __DEVICE__ double ceil(double __a) 
{ return __nv_ceil(__a); } __DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); } +#ifndef _OPENMP __DEVICE__ int clock() { return __nvvm_read_ptx_sreg_clock(); } __DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); } +#endif __DEVICE__ double copysign(double __a, double __b) { return __nv_copysign(__a, __b); } @@ -1525,7 +1543,6 @@ __DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); } __DEVICE__ float expf(float __a) { return __nv_expf(__a); } __DEVICE__ double expm1(double __a) { return __nv_expm1(__a); } __DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); } -__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); } __DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); } __DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); } __DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); } @@ -1563,16 +1580,16 @@ __DEVICE__ double j1(double __a) { return __nv_j1(__a); } __DEVICE__ float j1f(float __a) { return __nv_j1f(__a); } __DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); } __DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); } -#if defined(__LP64__) -__DEVICE__ long labs(long __a) { return llabs(__a); }; +#if defined(__LP64__) || defined(_WIN64) +__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_llabs(__a); }; #else -__DEVICE__ long labs(long __a) { return __nv_abs(__a); }; +__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_abs(__a); }; #endif __DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); } __DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); } __DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); } __DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); } -__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); } +__DEVICE__ long long llabs(long long __a) __NOEXCEPT { return __nv_llabs(__a); } __DEVICE__ long long llmax(long long __a, long long __b) { return __nv_llmax(__a, __b); } @@ -1597,7 +1614,7 @@ __DEVICE__ float logbf(float __a) { return __nv_logbf(__a); } __DEVICE__ float logf(float __a) { return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a); } -#if defined(__LP64__) +#if defined(__LP64__) || defined(_WIN64) __DEVICE__ long lrint(double __a) { return llrint(__a); } __DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); } __DEVICE__ long lround(double __a) { return llround(__a); } @@ -1609,12 +1626,16 @@ __DEVICE__ long lround(double __a) { return round(__a); } __DEVICE__ long lroundf(float __a) { return roundf(__a); } #endif __DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); } +// These functions shouldn't be declared when including this header +// for math function resolution purposes. 
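(The comment above introduces the #ifndef _OPENMP guard that follows. A short sketch of the reasoning, with explanatory comments added; the wrapper body is copied from the guarded block below.)

/* Sketch: under OpenMP this header is included only so that math calls
 * resolve to device-capable definitions, and the host's <string.h>
 * already declares memcpy/memset. Redeclaring them here as static
 * inline wrappers could clash with those declarations, so the guard
 * compiles them out except in CUDA mode. */
#ifndef _OPENMP
__DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) {
  return __builtin_memcpy(__a, __b, __c);
}
#endif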
+#ifndef _OPENMP __DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) { return __builtin_memcpy(__a, __b, __c); } __DEVICE__ void *memset(void *__a, int __b, size_t __c) { return __builtin_memset(__a, __b, __c); } +#endif __DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); } __DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); } __DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); } @@ -1698,6 +1719,8 @@ __DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); } __DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); } __DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); } __DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); } +// TODO: remove once variant is supported +#ifndef _OPENMP __DEVICE__ double scalbln(double __a, long __b) { if (__b > INT_MAX) return __a > 0 ? HUGE_VAL : -HUGE_VAL; @@ -1712,18 +1735,19 @@ __DEVICE__ float scalblnf(float __a, long __b) { return __a > 0 ? 0.f : -0.f; return scalbnf(__a, (int)__b); } +#endif __DEVICE__ double sin(double __a) { return __nv_sin(__a); } -__DEVICE__ void sincos(double __a, double *__sptr, double *__cptr) { - return __nv_sincos(__a, __sptr, __cptr); +__DEVICE__ void sincos(double __a, double *__s, double *__c) { + return __nv_sincos(__a, __s, __c); } -__DEVICE__ void sincosf(float __a, float *__sptr, float *__cptr) { - return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __sptr, __cptr); +__DEVICE__ void sincosf(float __a, float *__s, float *__c) { + return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c); } -__DEVICE__ void sincospi(double __a, double *__sptr, double *__cptr) { - return __nv_sincospi(__a, __sptr, __cptr); +__DEVICE__ void sincospi(double __a, double *__s, double *__c) { + return __nv_sincospi(__a, __s, __c); } -__DEVICE__ void sincospif(float __a, float *__sptr, float *__cptr) { - return __nv_sincospif(__a, __sptr, __cptr); +__DEVICE__ void sincospif(float __a, float *__s, float *__c) { + return __nv_sincospif(__a, __s, __c); } __DEVICE__ float sinf(float __a) { return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a); @@ -1763,6 +1787,7 @@ __DEVICE__ float y1f(float __a) { return __nv_y1f(__a); } __DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); } __DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); } +#undef __NOEXCEPT #pragma pop_macro("__DEVICE__") #pragma pop_macro("__FAST_OR_SLOW") #endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__ diff --git a/lib/include/__clang_cuda_intrinsics.h b/lib/include/__clang_cuda_intrinsics.h index 3c0cde94e..2970d17f8 100644 --- a/lib/include/__clang_cuda_intrinsics.h +++ b/lib/include/__clang_cuda_intrinsics.h @@ -1,22 +1,8 @@ /*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/__clang_cuda_libdevice_declares.h b/lib/include/__clang_cuda_libdevice_declares.h index 71df7f849..4d7035339 100644 --- a/lib/include/__clang_cuda_libdevice_declares.h +++ b/lib/include/__clang_cuda_libdevice_declares.h @@ -1,22 +1,8 @@ /*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -24,443 +10,453 @@ #ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__ #define __CLANG_CUDA_LIBDEVICE_DECLARES_H__ +#if defined(__cplusplus) extern "C" { +#endif -__device__ int __nv_abs(int __a); -__device__ double __nv_acos(double __a); -__device__ float __nv_acosf(float __a); -__device__ double __nv_acosh(double __a); -__device__ float __nv_acoshf(float __a); -__device__ double __nv_asin(double __a); -__device__ float __nv_asinf(float __a); -__device__ double __nv_asinh(double __a); -__device__ float __nv_asinhf(float __a); -__device__ double __nv_atan2(double __a, double __b); -__device__ float __nv_atan2f(float __a, float __b); -__device__ double __nv_atan(double __a); -__device__ float __nv_atanf(float __a); -__device__ double __nv_atanh(double __a); -__device__ float __nv_atanhf(float __a); -__device__ int __nv_brev(int __a); -__device__ long long __nv_brevll(long long __a); -__device__ int __nv_byte_perm(int __a, int __b, int __c); -__device__ double __nv_cbrt(double __a); -__device__ float __nv_cbrtf(float __a); -__device__ double __nv_ceil(double __a); -__device__ float __nv_ceilf(float __a); -__device__ int __nv_clz(int __a); -__device__ int __nv_clzll(long long __a); -__device__ double __nv_copysign(double __a, double __b); -__device__ float __nv_copysignf(float __a, float __b); -__device__ double __nv_cos(double __a); -__device__ float __nv_cosf(float __a); -__device__ double __nv_cosh(double __a); -__device__ float __nv_coshf(float __a); -__device__ double __nv_cospi(double __a); -__device__ float __nv_cospif(float __a); -__device__ double __nv_cyl_bessel_i0(double __a); -__device__ float __nv_cyl_bessel_i0f(float __a); -__device__ double __nv_cyl_bessel_i1(double __a); -__device__ float __nv_cyl_bessel_i1f(float __a); -__device__ double __nv_dadd_rd(double __a, double __b); -__device__ double __nv_dadd_rn(double __a, double __b); -__device__ double __nv_dadd_ru(double __a, double __b); -__device__ double __nv_dadd_rz(double __a, double __b); -__device__ double __nv_ddiv_rd(double __a, double __b); -__device__ double __nv_ddiv_rn(double __a, double __b); -__device__ double __nv_ddiv_ru(double __a, double __b); -__device__ double __nv_ddiv_rz(double __a, double __b); -__device__ double __nv_dmul_rd(double __a, double __b); -__device__ double __nv_dmul_rn(double __a, double __b); -__device__ double __nv_dmul_ru(double __a, double __b); -__device__ double __nv_dmul_rz(double __a, double __b); -__device__ float __nv_double2float_rd(double __a); -__device__ float __nv_double2float_rn(double __a); -__device__ float __nv_double2float_ru(double __a); -__device__ float __nv_double2float_rz(double __a); -__device__ int __nv_double2hiint(double __a); -__device__ int __nv_double2int_rd(double __a); -__device__ int __nv_double2int_rn(double __a); -__device__ int __nv_double2int_ru(double __a); -__device__ int __nv_double2int_rz(double __a); -__device__ long long __nv_double2ll_rd(double __a); -__device__ long long __nv_double2ll_rn(double __a); -__device__ long long __nv_double2ll_ru(double __a); -__device__ long long __nv_double2ll_rz(double __a); -__device__ int __nv_double2loint(double __a); -__device__ unsigned int __nv_double2uint_rd(double __a); -__device__ unsigned int __nv_double2uint_rn(double __a); -__device__ unsigned int __nv_double2uint_ru(double __a); -__device__ unsigned int __nv_double2uint_rz(double __a); -__device__ unsigned long long 
__nv_double2ull_rd(double __a); -__device__ unsigned long long __nv_double2ull_rn(double __a); -__device__ unsigned long long __nv_double2ull_ru(double __a); -__device__ unsigned long long __nv_double2ull_rz(double __a); -__device__ unsigned long long __nv_double_as_longlong(double __a); -__device__ double __nv_drcp_rd(double __a); -__device__ double __nv_drcp_rn(double __a); -__device__ double __nv_drcp_ru(double __a); -__device__ double __nv_drcp_rz(double __a); -__device__ double __nv_dsqrt_rd(double __a); -__device__ double __nv_dsqrt_rn(double __a); -__device__ double __nv_dsqrt_ru(double __a); -__device__ double __nv_dsqrt_rz(double __a); -__device__ double __nv_dsub_rd(double __a, double __b); -__device__ double __nv_dsub_rn(double __a, double __b); -__device__ double __nv_dsub_ru(double __a, double __b); -__device__ double __nv_dsub_rz(double __a, double __b); -__device__ double __nv_erfc(double __a); -__device__ float __nv_erfcf(float __a); -__device__ double __nv_erfcinv(double __a); -__device__ float __nv_erfcinvf(float __a); -__device__ double __nv_erfcx(double __a); -__device__ float __nv_erfcxf(float __a); -__device__ double __nv_erf(double __a); -__device__ float __nv_erff(float __a); -__device__ double __nv_erfinv(double __a); -__device__ float __nv_erfinvf(float __a); -__device__ double __nv_exp10(double __a); -__device__ float __nv_exp10f(float __a); -__device__ double __nv_exp2(double __a); -__device__ float __nv_exp2f(float __a); -__device__ double __nv_exp(double __a); -__device__ float __nv_expf(float __a); -__device__ double __nv_expm1(double __a); -__device__ float __nv_expm1f(float __a); -__device__ double __nv_fabs(double __a); -__device__ float __nv_fabsf(float __a); -__device__ float __nv_fadd_rd(float __a, float __b); -__device__ float __nv_fadd_rn(float __a, float __b); -__device__ float __nv_fadd_ru(float __a, float __b); -__device__ float __nv_fadd_rz(float __a, float __b); -__device__ float __nv_fast_cosf(float __a); -__device__ float __nv_fast_exp10f(float __a); -__device__ float __nv_fast_expf(float __a); -__device__ float __nv_fast_fdividef(float __a, float __b); -__device__ float __nv_fast_log10f(float __a); -__device__ float __nv_fast_log2f(float __a); -__device__ float __nv_fast_logf(float __a); -__device__ float __nv_fast_powf(float __a, float __b); -__device__ void __nv_fast_sincosf(float __a, float *__sptr, float *__cptr); -__device__ float __nv_fast_sinf(float __a); -__device__ float __nv_fast_tanf(float __a); -__device__ double __nv_fdim(double __a, double __b); -__device__ float __nv_fdimf(float __a, float __b); -__device__ float __nv_fdiv_rd(float __a, float __b); -__device__ float __nv_fdiv_rn(float __a, float __b); -__device__ float __nv_fdiv_ru(float __a, float __b); -__device__ float __nv_fdiv_rz(float __a, float __b); -__device__ int __nv_ffs(int __a); -__device__ int __nv_ffsll(long long __a); -__device__ int __nv_finitef(float __a); -__device__ unsigned short __nv_float2half_rn(float __a); -__device__ int __nv_float2int_rd(float __a); -__device__ int __nv_float2int_rn(float __a); -__device__ int __nv_float2int_ru(float __a); -__device__ int __nv_float2int_rz(float __a); -__device__ long long __nv_float2ll_rd(float __a); -__device__ long long __nv_float2ll_rn(float __a); -__device__ long long __nv_float2ll_ru(float __a); -__device__ long long __nv_float2ll_rz(float __a); -__device__ unsigned int __nv_float2uint_rd(float __a); -__device__ unsigned int __nv_float2uint_rn(float __a); -__device__ unsigned int __nv_float2uint_ru(float __a); 
-__device__ unsigned int __nv_float2uint_rz(float __a); -__device__ unsigned long long __nv_float2ull_rd(float __a); -__device__ unsigned long long __nv_float2ull_rn(float __a); -__device__ unsigned long long __nv_float2ull_ru(float __a); -__device__ unsigned long long __nv_float2ull_rz(float __a); -__device__ int __nv_float_as_int(float __a); -__device__ unsigned int __nv_float_as_uint(float __a); -__device__ double __nv_floor(double __a); -__device__ float __nv_floorf(float __a); -__device__ double __nv_fma(double __a, double __b, double __c); -__device__ float __nv_fmaf(float __a, float __b, float __c); -__device__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c); -__device__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c); -__device__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c); -__device__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c); -__device__ float __nv_fmaf_rd(float __a, float __b, float __c); -__device__ float __nv_fmaf_rn(float __a, float __b, float __c); -__device__ float __nv_fmaf_ru(float __a, float __b, float __c); -__device__ float __nv_fmaf_rz(float __a, float __b, float __c); -__device__ double __nv_fma_rd(double __a, double __b, double __c); -__device__ double __nv_fma_rn(double __a, double __b, double __c); -__device__ double __nv_fma_ru(double __a, double __b, double __c); -__device__ double __nv_fma_rz(double __a, double __b, double __c); -__device__ double __nv_fmax(double __a, double __b); -__device__ float __nv_fmaxf(float __a, float __b); -__device__ double __nv_fmin(double __a, double __b); -__device__ float __nv_fminf(float __a, float __b); -__device__ double __nv_fmod(double __a, double __b); -__device__ float __nv_fmodf(float __a, float __b); -__device__ float __nv_fmul_rd(float __a, float __b); -__device__ float __nv_fmul_rn(float __a, float __b); -__device__ float __nv_fmul_ru(float __a, float __b); -__device__ float __nv_fmul_rz(float __a, float __b); -__device__ float __nv_frcp_rd(float __a); -__device__ float __nv_frcp_rn(float __a); -__device__ float __nv_frcp_ru(float __a); -__device__ float __nv_frcp_rz(float __a); -__device__ double __nv_frexp(double __a, int *__b); -__device__ float __nv_frexpf(float __a, int *__b); -__device__ float __nv_frsqrt_rn(float __a); -__device__ float __nv_fsqrt_rd(float __a); -__device__ float __nv_fsqrt_rn(float __a); -__device__ float __nv_fsqrt_ru(float __a); -__device__ float __nv_fsqrt_rz(float __a); -__device__ float __nv_fsub_rd(float __a, float __b); -__device__ float __nv_fsub_rn(float __a, float __b); -__device__ float __nv_fsub_ru(float __a, float __b); -__device__ float __nv_fsub_rz(float __a, float __b); -__device__ int __nv_hadd(int __a, int __b); -__device__ float __nv_half2float(unsigned short __h); -__device__ double __nv_hiloint2double(int __a, int __b); -__device__ double __nv_hypot(double __a, double __b); -__device__ float __nv_hypotf(float __a, float __b); -__device__ int __nv_ilogb(double __a); -__device__ int __nv_ilogbf(float __a); -__device__ double __nv_int2double_rn(int __a); -__device__ float __nv_int2float_rd(int __a); -__device__ float __nv_int2float_rn(int __a); -__device__ float __nv_int2float_ru(int __a); -__device__ float __nv_int2float_rz(int __a); -__device__ float __nv_int_as_float(int __a); -__device__ int __nv_isfinited(double __a); -__device__ int __nv_isinfd(double __a); -__device__ int __nv_isinff(float __a); -__device__ int __nv_isnand(double __a); -__device__ int __nv_isnanf(float __a); -__device__ double __nv_j0(double __a); 
-__device__ float __nv_j0f(float __a); -__device__ double __nv_j1(double __a); -__device__ float __nv_j1f(float __a); -__device__ float __nv_jnf(int __a, float __b); -__device__ double __nv_jn(int __a, double __b); -__device__ double __nv_ldexp(double __a, int __b); -__device__ float __nv_ldexpf(float __a, int __b); -__device__ double __nv_lgamma(double __a); -__device__ float __nv_lgammaf(float __a); -__device__ double __nv_ll2double_rd(long long __a); -__device__ double __nv_ll2double_rn(long long __a); -__device__ double __nv_ll2double_ru(long long __a); -__device__ double __nv_ll2double_rz(long long __a); -__device__ float __nv_ll2float_rd(long long __a); -__device__ float __nv_ll2float_rn(long long __a); -__device__ float __nv_ll2float_ru(long long __a); -__device__ float __nv_ll2float_rz(long long __a); -__device__ long long __nv_llabs(long long __a); -__device__ long long __nv_llmax(long long __a, long long __b); -__device__ long long __nv_llmin(long long __a, long long __b); -__device__ long long __nv_llrint(double __a); -__device__ long long __nv_llrintf(float __a); -__device__ long long __nv_llround(double __a); -__device__ long long __nv_llroundf(float __a); -__device__ double __nv_log10(double __a); -__device__ float __nv_log10f(float __a); -__device__ double __nv_log1p(double __a); -__device__ float __nv_log1pf(float __a); -__device__ double __nv_log2(double __a); -__device__ float __nv_log2f(float __a); -__device__ double __nv_logb(double __a); -__device__ float __nv_logbf(float __a); -__device__ double __nv_log(double __a); -__device__ float __nv_logf(float __a); -__device__ double __nv_longlong_as_double(long long __a); -__device__ int __nv_max(int __a, int __b); -__device__ int __nv_min(int __a, int __b); -__device__ double __nv_modf(double __a, double *__b); -__device__ float __nv_modff(float __a, float *__b); -__device__ int __nv_mul24(int __a, int __b); -__device__ long long __nv_mul64hi(long long __a, long long __b); -__device__ int __nv_mulhi(int __a, int __b); -__device__ double __nv_nan(const signed char *__a); -__device__ float __nv_nanf(const signed char *__a); -__device__ double __nv_nearbyint(double __a); -__device__ float __nv_nearbyintf(float __a); -__device__ double __nv_nextafter(double __a, double __b); -__device__ float __nv_nextafterf(float __a, float __b); -__device__ double __nv_norm3d(double __a, double __b, double __c); -__device__ float __nv_norm3df(float __a, float __b, float __c); -__device__ double __nv_norm4d(double __a, double __b, double __c, double __d); -__device__ float __nv_norm4df(float __a, float __b, float __c, float __d); -__device__ double __nv_normcdf(double __a); -__device__ float __nv_normcdff(float __a); -__device__ double __nv_normcdfinv(double __a); -__device__ float __nv_normcdfinvf(float __a); -__device__ float __nv_normf(int __a, const float *__b); -__device__ double __nv_norm(int __a, const double *__b); -__device__ int __nv_popc(int __a); -__device__ int __nv_popcll(long long __a); -__device__ double __nv_pow(double __a, double __b); -__device__ float __nv_powf(float __a, float __b); -__device__ double __nv_powi(double __a, int __b); -__device__ float __nv_powif(float __a, int __b); -__device__ double __nv_rcbrt(double __a); -__device__ float __nv_rcbrtf(float __a); -__device__ double __nv_rcp64h(double __a); -__device__ double __nv_remainder(double __a, double __b); -__device__ float __nv_remainderf(float __a, float __b); -__device__ double __nv_remquo(double __a, double __b, int *__c); -__device__ float __nv_remquof(float 
__a, float __b, int *__c); -__device__ int __nv_rhadd(int __a, int __b); -__device__ double __nv_rhypot(double __a, double __b); -__device__ float __nv_rhypotf(float __a, float __b); -__device__ double __nv_rint(double __a); -__device__ float __nv_rintf(float __a); -__device__ double __nv_rnorm3d(double __a, double __b, double __c); -__device__ float __nv_rnorm3df(float __a, float __b, float __c); -__device__ double __nv_rnorm4d(double __a, double __b, double __c, double __d); -__device__ float __nv_rnorm4df(float __a, float __b, float __c, float __d); -__device__ float __nv_rnormf(int __a, const float *__b); -__device__ double __nv_rnorm(int __a, const double *__b); -__device__ double __nv_round(double __a); -__device__ float __nv_roundf(float __a); -__device__ double __nv_rsqrt(double __a); -__device__ float __nv_rsqrtf(float __a); -__device__ int __nv_sad(int __a, int __b, int __c); -__device__ float __nv_saturatef(float __a); -__device__ double __nv_scalbn(double __a, int __b); -__device__ float __nv_scalbnf(float __a, int __b); -__device__ int __nv_signbitd(double __a); -__device__ int __nv_signbitf(float __a); -__device__ void __nv_sincos(double __a, double *__b, double *__c); -__device__ void __nv_sincosf(float __a, float *__b, float *__c); -__device__ void __nv_sincospi(double __a, double *__b, double *__c); -__device__ void __nv_sincospif(float __a, float *__b, float *__c); -__device__ double __nv_sin(double __a); -__device__ float __nv_sinf(float __a); -__device__ double __nv_sinh(double __a); -__device__ float __nv_sinhf(float __a); -__device__ double __nv_sinpi(double __a); -__device__ float __nv_sinpif(float __a); -__device__ double __nv_sqrt(double __a); -__device__ float __nv_sqrtf(float __a); -__device__ double __nv_tan(double __a); -__device__ float __nv_tanf(float __a); -__device__ double __nv_tanh(double __a); -__device__ float __nv_tanhf(float __a); -__device__ double __nv_tgamma(double __a); -__device__ float __nv_tgammaf(float __a); -__device__ double __nv_trunc(double __a); -__device__ float __nv_truncf(float __a); -__device__ int __nv_uhadd(unsigned int __a, unsigned int __b); -__device__ double __nv_uint2double_rn(unsigned int __i); -__device__ float __nv_uint2float_rd(unsigned int __a); -__device__ float __nv_uint2float_rn(unsigned int __a); -__device__ float __nv_uint2float_ru(unsigned int __a); -__device__ float __nv_uint2float_rz(unsigned int __a); -__device__ float __nv_uint_as_float(unsigned int __a); -__device__ double __nv_ull2double_rd(unsigned long long __a); -__device__ double __nv_ull2double_rn(unsigned long long __a); -__device__ double __nv_ull2double_ru(unsigned long long __a); -__device__ double __nv_ull2double_rz(unsigned long long __a); -__device__ float __nv_ull2float_rd(unsigned long long __a); -__device__ float __nv_ull2float_rn(unsigned long long __a); -__device__ float __nv_ull2float_ru(unsigned long long __a); -__device__ float __nv_ull2float_rz(unsigned long long __a); -__device__ unsigned long long __nv_ullmax(unsigned long long __a, +#if defined(_OPENMP) +#define __DEVICE__ +#elif defined(__CUDA__) +#define __DEVICE__ __device__ +#endif + +__DEVICE__ int __nv_abs(int __a); +__DEVICE__ double __nv_acos(double __a); +__DEVICE__ float __nv_acosf(float __a); +__DEVICE__ double __nv_acosh(double __a); +__DEVICE__ float __nv_acoshf(float __a); +__DEVICE__ double __nv_asin(double __a); +__DEVICE__ float __nv_asinf(float __a); +__DEVICE__ double __nv_asinh(double __a); +__DEVICE__ float __nv_asinhf(float __a); +__DEVICE__ double __nv_atan2(double 
__a, double __b); +__DEVICE__ float __nv_atan2f(float __a, float __b); +__DEVICE__ double __nv_atan(double __a); +__DEVICE__ float __nv_atanf(float __a); +__DEVICE__ double __nv_atanh(double __a); +__DEVICE__ float __nv_atanhf(float __a); +__DEVICE__ int __nv_brev(int __a); +__DEVICE__ long long __nv_brevll(long long __a); +__DEVICE__ int __nv_byte_perm(int __a, int __b, int __c); +__DEVICE__ double __nv_cbrt(double __a); +__DEVICE__ float __nv_cbrtf(float __a); +__DEVICE__ double __nv_ceil(double __a); +__DEVICE__ float __nv_ceilf(float __a); +__DEVICE__ int __nv_clz(int __a); +__DEVICE__ int __nv_clzll(long long __a); +__DEVICE__ double __nv_copysign(double __a, double __b); +__DEVICE__ float __nv_copysignf(float __a, float __b); +__DEVICE__ double __nv_cos(double __a); +__DEVICE__ float __nv_cosf(float __a); +__DEVICE__ double __nv_cosh(double __a); +__DEVICE__ float __nv_coshf(float __a); +__DEVICE__ double __nv_cospi(double __a); +__DEVICE__ float __nv_cospif(float __a); +__DEVICE__ double __nv_cyl_bessel_i0(double __a); +__DEVICE__ float __nv_cyl_bessel_i0f(float __a); +__DEVICE__ double __nv_cyl_bessel_i1(double __a); +__DEVICE__ float __nv_cyl_bessel_i1f(float __a); +__DEVICE__ double __nv_dadd_rd(double __a, double __b); +__DEVICE__ double __nv_dadd_rn(double __a, double __b); +__DEVICE__ double __nv_dadd_ru(double __a, double __b); +__DEVICE__ double __nv_dadd_rz(double __a, double __b); +__DEVICE__ double __nv_ddiv_rd(double __a, double __b); +__DEVICE__ double __nv_ddiv_rn(double __a, double __b); +__DEVICE__ double __nv_ddiv_ru(double __a, double __b); +__DEVICE__ double __nv_ddiv_rz(double __a, double __b); +__DEVICE__ double __nv_dmul_rd(double __a, double __b); +__DEVICE__ double __nv_dmul_rn(double __a, double __b); +__DEVICE__ double __nv_dmul_ru(double __a, double __b); +__DEVICE__ double __nv_dmul_rz(double __a, double __b); +__DEVICE__ float __nv_double2float_rd(double __a); +__DEVICE__ float __nv_double2float_rn(double __a); +__DEVICE__ float __nv_double2float_ru(double __a); +__DEVICE__ float __nv_double2float_rz(double __a); +__DEVICE__ int __nv_double2hiint(double __a); +__DEVICE__ int __nv_double2int_rd(double __a); +__DEVICE__ int __nv_double2int_rn(double __a); +__DEVICE__ int __nv_double2int_ru(double __a); +__DEVICE__ int __nv_double2int_rz(double __a); +__DEVICE__ long long __nv_double2ll_rd(double __a); +__DEVICE__ long long __nv_double2ll_rn(double __a); +__DEVICE__ long long __nv_double2ll_ru(double __a); +__DEVICE__ long long __nv_double2ll_rz(double __a); +__DEVICE__ int __nv_double2loint(double __a); +__DEVICE__ unsigned int __nv_double2uint_rd(double __a); +__DEVICE__ unsigned int __nv_double2uint_rn(double __a); +__DEVICE__ unsigned int __nv_double2uint_ru(double __a); +__DEVICE__ unsigned int __nv_double2uint_rz(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rd(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rn(double __a); +__DEVICE__ unsigned long long __nv_double2ull_ru(double __a); +__DEVICE__ unsigned long long __nv_double2ull_rz(double __a); +__DEVICE__ unsigned long long __nv_double_as_longlong(double __a); +__DEVICE__ double __nv_drcp_rd(double __a); +__DEVICE__ double __nv_drcp_rn(double __a); +__DEVICE__ double __nv_drcp_ru(double __a); +__DEVICE__ double __nv_drcp_rz(double __a); +__DEVICE__ double __nv_dsqrt_rd(double __a); +__DEVICE__ double __nv_dsqrt_rn(double __a); +__DEVICE__ double __nv_dsqrt_ru(double __a); +__DEVICE__ double __nv_dsqrt_rz(double __a); +__DEVICE__ double __nv_dsub_rd(double __a, double 
__b); +__DEVICE__ double __nv_dsub_rn(double __a, double __b); +__DEVICE__ double __nv_dsub_ru(double __a, double __b); +__DEVICE__ double __nv_dsub_rz(double __a, double __b); +__DEVICE__ double __nv_erfc(double __a); +__DEVICE__ float __nv_erfcf(float __a); +__DEVICE__ double __nv_erfcinv(double __a); +__DEVICE__ float __nv_erfcinvf(float __a); +__DEVICE__ double __nv_erfcx(double __a); +__DEVICE__ float __nv_erfcxf(float __a); +__DEVICE__ double __nv_erf(double __a); +__DEVICE__ float __nv_erff(float __a); +__DEVICE__ double __nv_erfinv(double __a); +__DEVICE__ float __nv_erfinvf(float __a); +__DEVICE__ double __nv_exp10(double __a); +__DEVICE__ float __nv_exp10f(float __a); +__DEVICE__ double __nv_exp2(double __a); +__DEVICE__ float __nv_exp2f(float __a); +__DEVICE__ double __nv_exp(double __a); +__DEVICE__ float __nv_expf(float __a); +__DEVICE__ double __nv_expm1(double __a); +__DEVICE__ float __nv_expm1f(float __a); +__DEVICE__ double __nv_fabs(double __a); +__DEVICE__ float __nv_fabsf(float __a); +__DEVICE__ float __nv_fadd_rd(float __a, float __b); +__DEVICE__ float __nv_fadd_rn(float __a, float __b); +__DEVICE__ float __nv_fadd_ru(float __a, float __b); +__DEVICE__ float __nv_fadd_rz(float __a, float __b); +__DEVICE__ float __nv_fast_cosf(float __a); +__DEVICE__ float __nv_fast_exp10f(float __a); +__DEVICE__ float __nv_fast_expf(float __a); +__DEVICE__ float __nv_fast_fdividef(float __a, float __b); +__DEVICE__ float __nv_fast_log10f(float __a); +__DEVICE__ float __nv_fast_log2f(float __a); +__DEVICE__ float __nv_fast_logf(float __a); +__DEVICE__ float __nv_fast_powf(float __a, float __b); +__DEVICE__ void __nv_fast_sincosf(float __a, float *__s, float *__c); +__DEVICE__ float __nv_fast_sinf(float __a); +__DEVICE__ float __nv_fast_tanf(float __a); +__DEVICE__ double __nv_fdim(double __a, double __b); +__DEVICE__ float __nv_fdimf(float __a, float __b); +__DEVICE__ float __nv_fdiv_rd(float __a, float __b); +__DEVICE__ float __nv_fdiv_rn(float __a, float __b); +__DEVICE__ float __nv_fdiv_ru(float __a, float __b); +__DEVICE__ float __nv_fdiv_rz(float __a, float __b); +__DEVICE__ int __nv_ffs(int __a); +__DEVICE__ int __nv_ffsll(long long __a); +__DEVICE__ int __nv_finitef(float __a); +__DEVICE__ unsigned short __nv_float2half_rn(float __a); +__DEVICE__ int __nv_float2int_rd(float __a); +__DEVICE__ int __nv_float2int_rn(float __a); +__DEVICE__ int __nv_float2int_ru(float __a); +__DEVICE__ int __nv_float2int_rz(float __a); +__DEVICE__ long long __nv_float2ll_rd(float __a); +__DEVICE__ long long __nv_float2ll_rn(float __a); +__DEVICE__ long long __nv_float2ll_ru(float __a); +__DEVICE__ long long __nv_float2ll_rz(float __a); +__DEVICE__ unsigned int __nv_float2uint_rd(float __a); +__DEVICE__ unsigned int __nv_float2uint_rn(float __a); +__DEVICE__ unsigned int __nv_float2uint_ru(float __a); +__DEVICE__ unsigned int __nv_float2uint_rz(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rd(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rn(float __a); +__DEVICE__ unsigned long long __nv_float2ull_ru(float __a); +__DEVICE__ unsigned long long __nv_float2ull_rz(float __a); +__DEVICE__ int __nv_float_as_int(float __a); +__DEVICE__ unsigned int __nv_float_as_uint(float __a); +__DEVICE__ double __nv_floor(double __a); +__DEVICE__ float __nv_floorf(float __a); +__DEVICE__ double __nv_fma(double __a, double __b, double __c); +__DEVICE__ float __nv_fmaf(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c); +__DEVICE__ float 
__nv_fmaf_ieee_rn(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rd(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rn(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_ru(float __a, float __b, float __c); +__DEVICE__ float __nv_fmaf_rz(float __a, float __b, float __c); +__DEVICE__ double __nv_fma_rd(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_rn(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_ru(double __a, double __b, double __c); +__DEVICE__ double __nv_fma_rz(double __a, double __b, double __c); +__DEVICE__ double __nv_fmax(double __a, double __b); +__DEVICE__ float __nv_fmaxf(float __a, float __b); +__DEVICE__ double __nv_fmin(double __a, double __b); +__DEVICE__ float __nv_fminf(float __a, float __b); +__DEVICE__ double __nv_fmod(double __a, double __b); +__DEVICE__ float __nv_fmodf(float __a, float __b); +__DEVICE__ float __nv_fmul_rd(float __a, float __b); +__DEVICE__ float __nv_fmul_rn(float __a, float __b); +__DEVICE__ float __nv_fmul_ru(float __a, float __b); +__DEVICE__ float __nv_fmul_rz(float __a, float __b); +__DEVICE__ float __nv_frcp_rd(float __a); +__DEVICE__ float __nv_frcp_rn(float __a); +__DEVICE__ float __nv_frcp_ru(float __a); +__DEVICE__ float __nv_frcp_rz(float __a); +__DEVICE__ double __nv_frexp(double __a, int *__b); +__DEVICE__ float __nv_frexpf(float __a, int *__b); +__DEVICE__ float __nv_frsqrt_rn(float __a); +__DEVICE__ float __nv_fsqrt_rd(float __a); +__DEVICE__ float __nv_fsqrt_rn(float __a); +__DEVICE__ float __nv_fsqrt_ru(float __a); +__DEVICE__ float __nv_fsqrt_rz(float __a); +__DEVICE__ float __nv_fsub_rd(float __a, float __b); +__DEVICE__ float __nv_fsub_rn(float __a, float __b); +__DEVICE__ float __nv_fsub_ru(float __a, float __b); +__DEVICE__ float __nv_fsub_rz(float __a, float __b); +__DEVICE__ int __nv_hadd(int __a, int __b); +__DEVICE__ float __nv_half2float(unsigned short __h); +__DEVICE__ double __nv_hiloint2double(int __a, int __b); +__DEVICE__ double __nv_hypot(double __a, double __b); +__DEVICE__ float __nv_hypotf(float __a, float __b); +__DEVICE__ int __nv_ilogb(double __a); +__DEVICE__ int __nv_ilogbf(float __a); +__DEVICE__ double __nv_int2double_rn(int __a); +__DEVICE__ float __nv_int2float_rd(int __a); +__DEVICE__ float __nv_int2float_rn(int __a); +__DEVICE__ float __nv_int2float_ru(int __a); +__DEVICE__ float __nv_int2float_rz(int __a); +__DEVICE__ float __nv_int_as_float(int __a); +__DEVICE__ int __nv_isfinited(double __a); +__DEVICE__ int __nv_isinfd(double __a); +__DEVICE__ int __nv_isinff(float __a); +__DEVICE__ int __nv_isnand(double __a); +__DEVICE__ int __nv_isnanf(float __a); +__DEVICE__ double __nv_j0(double __a); +__DEVICE__ float __nv_j0f(float __a); +__DEVICE__ double __nv_j1(double __a); +__DEVICE__ float __nv_j1f(float __a); +__DEVICE__ float __nv_jnf(int __a, float __b); +__DEVICE__ double __nv_jn(int __a, double __b); +__DEVICE__ double __nv_ldexp(double __a, int __b); +__DEVICE__ float __nv_ldexpf(float __a, int __b); +__DEVICE__ double __nv_lgamma(double __a); +__DEVICE__ float __nv_lgammaf(float __a); +__DEVICE__ double __nv_ll2double_rd(long long __a); +__DEVICE__ double __nv_ll2double_rn(long long __a); +__DEVICE__ double __nv_ll2double_ru(long long __a); +__DEVICE__ double __nv_ll2double_rz(long long __a); +__DEVICE__ float __nv_ll2float_rd(long long __a); +__DEVICE__ float __nv_ll2float_rn(long long 
__a); +__DEVICE__ float __nv_ll2float_ru(long long __a); +__DEVICE__ float __nv_ll2float_rz(long long __a); +__DEVICE__ long long __nv_llabs(long long __a); +__DEVICE__ long long __nv_llmax(long long __a, long long __b); +__DEVICE__ long long __nv_llmin(long long __a, long long __b); +__DEVICE__ long long __nv_llrint(double __a); +__DEVICE__ long long __nv_llrintf(float __a); +__DEVICE__ long long __nv_llround(double __a); +__DEVICE__ long long __nv_llroundf(float __a); +__DEVICE__ double __nv_log10(double __a); +__DEVICE__ float __nv_log10f(float __a); +__DEVICE__ double __nv_log1p(double __a); +__DEVICE__ float __nv_log1pf(float __a); +__DEVICE__ double __nv_log2(double __a); +__DEVICE__ float __nv_log2f(float __a); +__DEVICE__ double __nv_logb(double __a); +__DEVICE__ float __nv_logbf(float __a); +__DEVICE__ double __nv_log(double __a); +__DEVICE__ float __nv_logf(float __a); +__DEVICE__ double __nv_longlong_as_double(long long __a); +__DEVICE__ int __nv_max(int __a, int __b); +__DEVICE__ int __nv_min(int __a, int __b); +__DEVICE__ double __nv_modf(double __a, double *__b); +__DEVICE__ float __nv_modff(float __a, float *__b); +__DEVICE__ int __nv_mul24(int __a, int __b); +__DEVICE__ long long __nv_mul64hi(long long __a, long long __b); +__DEVICE__ int __nv_mulhi(int __a, int __b); +__DEVICE__ double __nv_nan(const signed char *__a); +__DEVICE__ float __nv_nanf(const signed char *__a); +__DEVICE__ double __nv_nearbyint(double __a); +__DEVICE__ float __nv_nearbyintf(float __a); +__DEVICE__ double __nv_nextafter(double __a, double __b); +__DEVICE__ float __nv_nextafterf(float __a, float __b); +__DEVICE__ double __nv_norm3d(double __a, double __b, double __c); +__DEVICE__ float __nv_norm3df(float __a, float __b, float __c); +__DEVICE__ double __nv_norm4d(double __a, double __b, double __c, double __d); +__DEVICE__ float __nv_norm4df(float __a, float __b, float __c, float __d); +__DEVICE__ double __nv_normcdf(double __a); +__DEVICE__ float __nv_normcdff(float __a); +__DEVICE__ double __nv_normcdfinv(double __a); +__DEVICE__ float __nv_normcdfinvf(float __a); +__DEVICE__ float __nv_normf(int __a, const float *__b); +__DEVICE__ double __nv_norm(int __a, const double *__b); +__DEVICE__ int __nv_popc(int __a); +__DEVICE__ int __nv_popcll(long long __a); +__DEVICE__ double __nv_pow(double __a, double __b); +__DEVICE__ float __nv_powf(float __a, float __b); +__DEVICE__ double __nv_powi(double __a, int __b); +__DEVICE__ float __nv_powif(float __a, int __b); +__DEVICE__ double __nv_rcbrt(double __a); +__DEVICE__ float __nv_rcbrtf(float __a); +__DEVICE__ double __nv_rcp64h(double __a); +__DEVICE__ double __nv_remainder(double __a, double __b); +__DEVICE__ float __nv_remainderf(float __a, float __b); +__DEVICE__ double __nv_remquo(double __a, double __b, int *__c); +__DEVICE__ float __nv_remquof(float __a, float __b, int *__c); +__DEVICE__ int __nv_rhadd(int __a, int __b); +__DEVICE__ double __nv_rhypot(double __a, double __b); +__DEVICE__ float __nv_rhypotf(float __a, float __b); +__DEVICE__ double __nv_rint(double __a); +__DEVICE__ float __nv_rintf(float __a); +__DEVICE__ double __nv_rnorm3d(double __a, double __b, double __c); +__DEVICE__ float __nv_rnorm3df(float __a, float __b, float __c); +__DEVICE__ double __nv_rnorm4d(double __a, double __b, double __c, double __d); +__DEVICE__ float __nv_rnorm4df(float __a, float __b, float __c, float __d); +__DEVICE__ float __nv_rnormf(int __a, const float *__b); +__DEVICE__ double __nv_rnorm(int __a, const double *__b); +__DEVICE__ double __nv_round(double 
__a); +__DEVICE__ float __nv_roundf(float __a); +__DEVICE__ double __nv_rsqrt(double __a); +__DEVICE__ float __nv_rsqrtf(float __a); +__DEVICE__ int __nv_sad(int __a, int __b, int __c); +__DEVICE__ float __nv_saturatef(float __a); +__DEVICE__ double __nv_scalbn(double __a, int __b); +__DEVICE__ float __nv_scalbnf(float __a, int __b); +__DEVICE__ int __nv_signbitd(double __a); +__DEVICE__ int __nv_signbitf(float __a); +__DEVICE__ void __nv_sincos(double __a, double *__b, double *__c); +__DEVICE__ void __nv_sincosf(float __a, float *__b, float *__c); +__DEVICE__ void __nv_sincospi(double __a, double *__b, double *__c); +__DEVICE__ void __nv_sincospif(float __a, float *__b, float *__c); +__DEVICE__ double __nv_sin(double __a); +__DEVICE__ float __nv_sinf(float __a); +__DEVICE__ double __nv_sinh(double __a); +__DEVICE__ float __nv_sinhf(float __a); +__DEVICE__ double __nv_sinpi(double __a); +__DEVICE__ float __nv_sinpif(float __a); +__DEVICE__ double __nv_sqrt(double __a); +__DEVICE__ float __nv_sqrtf(float __a); +__DEVICE__ double __nv_tan(double __a); +__DEVICE__ float __nv_tanf(float __a); +__DEVICE__ double __nv_tanh(double __a); +__DEVICE__ float __nv_tanhf(float __a); +__DEVICE__ double __nv_tgamma(double __a); +__DEVICE__ float __nv_tgammaf(float __a); +__DEVICE__ double __nv_trunc(double __a); +__DEVICE__ float __nv_truncf(float __a); +__DEVICE__ int __nv_uhadd(unsigned int __a, unsigned int __b); +__DEVICE__ double __nv_uint2double_rn(unsigned int __i); +__DEVICE__ float __nv_uint2float_rd(unsigned int __a); +__DEVICE__ float __nv_uint2float_rn(unsigned int __a); +__DEVICE__ float __nv_uint2float_ru(unsigned int __a); +__DEVICE__ float __nv_uint2float_rz(unsigned int __a); +__DEVICE__ float __nv_uint_as_float(unsigned int __a); +__DEVICE__ double __nv_ull2double_rd(unsigned long long __a); +__DEVICE__ double __nv_ull2double_rn(unsigned long long __a); +__DEVICE__ double __nv_ull2double_ru(unsigned long long __a); +__DEVICE__ double __nv_ull2double_rz(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rd(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rn(unsigned long long __a); +__DEVICE__ float __nv_ull2float_ru(unsigned long long __a); +__DEVICE__ float __nv_ull2float_rz(unsigned long long __a); +__DEVICE__ unsigned long long __nv_ullmax(unsigned long long __a, unsigned long long __b); -__device__ unsigned long long __nv_ullmin(unsigned long long __a, +__DEVICE__ unsigned long long __nv_ullmin(unsigned long long __a, unsigned long long __b); -__device__ unsigned int __nv_umax(unsigned int __a, unsigned int __b); -__device__ unsigned int __nv_umin(unsigned int __a, unsigned int __b); -__device__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b); -__device__ unsigned long long __nv_umul64hi(unsigned long long __a, +__DEVICE__ unsigned int __nv_umax(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_umin(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned long long __nv_umul64hi(unsigned long long __a, unsigned long long __b); -__device__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b); -__device__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b); -__device__ unsigned int __nv_usad(unsigned int __a, unsigned int __b, +__DEVICE__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b); +__DEVICE__ unsigned int __nv_usad(unsigned int __a, unsigned int __b, 
unsigned int __c); #if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020 -__device__ int __nv_vabs2(int __a); -__device__ int __nv_vabs4(int __a); -__device__ int __nv_vabsdiffs2(int __a, int __b); -__device__ int __nv_vabsdiffs4(int __a, int __b); -__device__ int __nv_vabsdiffu2(int __a, int __b); -__device__ int __nv_vabsdiffu4(int __a, int __b); -__device__ int __nv_vabsss2(int __a); -__device__ int __nv_vabsss4(int __a); -__device__ int __nv_vadd2(int __a, int __b); -__device__ int __nv_vadd4(int __a, int __b); -__device__ int __nv_vaddss2(int __a, int __b); -__device__ int __nv_vaddss4(int __a, int __b); -__device__ int __nv_vaddus2(int __a, int __b); -__device__ int __nv_vaddus4(int __a, int __b); -__device__ int __nv_vavgs2(int __a, int __b); -__device__ int __nv_vavgs4(int __a, int __b); -__device__ int __nv_vavgu2(int __a, int __b); -__device__ int __nv_vavgu4(int __a, int __b); -__device__ int __nv_vcmpeq2(int __a, int __b); -__device__ int __nv_vcmpeq4(int __a, int __b); -__device__ int __nv_vcmpges2(int __a, int __b); -__device__ int __nv_vcmpges4(int __a, int __b); -__device__ int __nv_vcmpgeu2(int __a, int __b); -__device__ int __nv_vcmpgeu4(int __a, int __b); -__device__ int __nv_vcmpgts2(int __a, int __b); -__device__ int __nv_vcmpgts4(int __a, int __b); -__device__ int __nv_vcmpgtu2(int __a, int __b); -__device__ int __nv_vcmpgtu4(int __a, int __b); -__device__ int __nv_vcmples2(int __a, int __b); -__device__ int __nv_vcmples4(int __a, int __b); -__device__ int __nv_vcmpleu2(int __a, int __b); -__device__ int __nv_vcmpleu4(int __a, int __b); -__device__ int __nv_vcmplts2(int __a, int __b); -__device__ int __nv_vcmplts4(int __a, int __b); -__device__ int __nv_vcmpltu2(int __a, int __b); -__device__ int __nv_vcmpltu4(int __a, int __b); -__device__ int __nv_vcmpne2(int __a, int __b); -__device__ int __nv_vcmpne4(int __a, int __b); -__device__ int __nv_vhaddu2(int __a, int __b); -__device__ int __nv_vhaddu4(int __a, int __b); -__device__ int __nv_vmaxs2(int __a, int __b); -__device__ int __nv_vmaxs4(int __a, int __b); -__device__ int __nv_vmaxu2(int __a, int __b); -__device__ int __nv_vmaxu4(int __a, int __b); -__device__ int __nv_vmins2(int __a, int __b); -__device__ int __nv_vmins4(int __a, int __b); -__device__ int __nv_vminu2(int __a, int __b); -__device__ int __nv_vminu4(int __a, int __b); -__device__ int __nv_vneg2(int __a); -__device__ int __nv_vneg4(int __a); -__device__ int __nv_vnegss2(int __a); -__device__ int __nv_vnegss4(int __a); -__device__ int __nv_vsads2(int __a, int __b); -__device__ int __nv_vsads4(int __a, int __b); -__device__ int __nv_vsadu2(int __a, int __b); -__device__ int __nv_vsadu4(int __a, int __b); -__device__ int __nv_vseteq2(int __a, int __b); -__device__ int __nv_vseteq4(int __a, int __b); -__device__ int __nv_vsetges2(int __a, int __b); -__device__ int __nv_vsetges4(int __a, int __b); -__device__ int __nv_vsetgeu2(int __a, int __b); -__device__ int __nv_vsetgeu4(int __a, int __b); -__device__ int __nv_vsetgts2(int __a, int __b); -__device__ int __nv_vsetgts4(int __a, int __b); -__device__ int __nv_vsetgtu2(int __a, int __b); -__device__ int __nv_vsetgtu4(int __a, int __b); -__device__ int __nv_vsetles2(int __a, int __b); -__device__ int __nv_vsetles4(int __a, int __b); -__device__ int __nv_vsetleu2(int __a, int __b); -__device__ int __nv_vsetleu4(int __a, int __b); -__device__ int __nv_vsetlts2(int __a, int __b); -__device__ int __nv_vsetlts4(int __a, int __b); -__device__ int __nv_vsetltu2(int __a, int __b); -__device__ int __nv_vsetltu4(int __a, 
int __b); -__device__ int __nv_vsetne2(int __a, int __b); -__device__ int __nv_vsetne4(int __a, int __b); -__device__ int __nv_vsub2(int __a, int __b); -__device__ int __nv_vsub4(int __a, int __b); -__device__ int __nv_vsubss2(int __a, int __b); -__device__ int __nv_vsubss4(int __a, int __b); -__device__ int __nv_vsubus2(int __a, int __b); -__device__ int __nv_vsubus4(int __a, int __b); +__DEVICE__ int __nv_vabs2(int __a); +__DEVICE__ int __nv_vabs4(int __a); +__DEVICE__ int __nv_vabsdiffs2(int __a, int __b); +__DEVICE__ int __nv_vabsdiffs4(int __a, int __b); +__DEVICE__ int __nv_vabsdiffu2(int __a, int __b); +__DEVICE__ int __nv_vabsdiffu4(int __a, int __b); +__DEVICE__ int __nv_vabsss2(int __a); +__DEVICE__ int __nv_vabsss4(int __a); +__DEVICE__ int __nv_vadd2(int __a, int __b); +__DEVICE__ int __nv_vadd4(int __a, int __b); +__DEVICE__ int __nv_vaddss2(int __a, int __b); +__DEVICE__ int __nv_vaddss4(int __a, int __b); +__DEVICE__ int __nv_vaddus2(int __a, int __b); +__DEVICE__ int __nv_vaddus4(int __a, int __b); +__DEVICE__ int __nv_vavgs2(int __a, int __b); +__DEVICE__ int __nv_vavgs4(int __a, int __b); +__DEVICE__ int __nv_vavgu2(int __a, int __b); +__DEVICE__ int __nv_vavgu4(int __a, int __b); +__DEVICE__ int __nv_vcmpeq2(int __a, int __b); +__DEVICE__ int __nv_vcmpeq4(int __a, int __b); +__DEVICE__ int __nv_vcmpges2(int __a, int __b); +__DEVICE__ int __nv_vcmpges4(int __a, int __b); +__DEVICE__ int __nv_vcmpgeu2(int __a, int __b); +__DEVICE__ int __nv_vcmpgeu4(int __a, int __b); +__DEVICE__ int __nv_vcmpgts2(int __a, int __b); +__DEVICE__ int __nv_vcmpgts4(int __a, int __b); +__DEVICE__ int __nv_vcmpgtu2(int __a, int __b); +__DEVICE__ int __nv_vcmpgtu4(int __a, int __b); +__DEVICE__ int __nv_vcmples2(int __a, int __b); +__DEVICE__ int __nv_vcmples4(int __a, int __b); +__DEVICE__ int __nv_vcmpleu2(int __a, int __b); +__DEVICE__ int __nv_vcmpleu4(int __a, int __b); +__DEVICE__ int __nv_vcmplts2(int __a, int __b); +__DEVICE__ int __nv_vcmplts4(int __a, int __b); +__DEVICE__ int __nv_vcmpltu2(int __a, int __b); +__DEVICE__ int __nv_vcmpltu4(int __a, int __b); +__DEVICE__ int __nv_vcmpne2(int __a, int __b); +__DEVICE__ int __nv_vcmpne4(int __a, int __b); +__DEVICE__ int __nv_vhaddu2(int __a, int __b); +__DEVICE__ int __nv_vhaddu4(int __a, int __b); +__DEVICE__ int __nv_vmaxs2(int __a, int __b); +__DEVICE__ int __nv_vmaxs4(int __a, int __b); +__DEVICE__ int __nv_vmaxu2(int __a, int __b); +__DEVICE__ int __nv_vmaxu4(int __a, int __b); +__DEVICE__ int __nv_vmins2(int __a, int __b); +__DEVICE__ int __nv_vmins4(int __a, int __b); +__DEVICE__ int __nv_vminu2(int __a, int __b); +__DEVICE__ int __nv_vminu4(int __a, int __b); +__DEVICE__ int __nv_vneg2(int __a); +__DEVICE__ int __nv_vneg4(int __a); +__DEVICE__ int __nv_vnegss2(int __a); +__DEVICE__ int __nv_vnegss4(int __a); +__DEVICE__ int __nv_vsads2(int __a, int __b); +__DEVICE__ int __nv_vsads4(int __a, int __b); +__DEVICE__ int __nv_vsadu2(int __a, int __b); +__DEVICE__ int __nv_vsadu4(int __a, int __b); +__DEVICE__ int __nv_vseteq2(int __a, int __b); +__DEVICE__ int __nv_vseteq4(int __a, int __b); +__DEVICE__ int __nv_vsetges2(int __a, int __b); +__DEVICE__ int __nv_vsetges4(int __a, int __b); +__DEVICE__ int __nv_vsetgeu2(int __a, int __b); +__DEVICE__ int __nv_vsetgeu4(int __a, int __b); +__DEVICE__ int __nv_vsetgts2(int __a, int __b); +__DEVICE__ int __nv_vsetgts4(int __a, int __b); +__DEVICE__ int __nv_vsetgtu2(int __a, int __b); +__DEVICE__ int __nv_vsetgtu4(int __a, int __b); +__DEVICE__ int __nv_vsetles2(int __a, int __b); +__DEVICE__ 
int __nv_vsetles4(int __a, int __b); +__DEVICE__ int __nv_vsetleu2(int __a, int __b); +__DEVICE__ int __nv_vsetleu4(int __a, int __b); +__DEVICE__ int __nv_vsetlts2(int __a, int __b); +__DEVICE__ int __nv_vsetlts4(int __a, int __b); +__DEVICE__ int __nv_vsetltu2(int __a, int __b); +__DEVICE__ int __nv_vsetltu4(int __a, int __b); +__DEVICE__ int __nv_vsetne2(int __a, int __b); +__DEVICE__ int __nv_vsetne4(int __a, int __b); +__DEVICE__ int __nv_vsub2(int __a, int __b); +__DEVICE__ int __nv_vsub4(int __a, int __b); +__DEVICE__ int __nv_vsubss2(int __a, int __b); +__DEVICE__ int __nv_vsubss4(int __a, int __b); +__DEVICE__ int __nv_vsubus2(int __a, int __b); +__DEVICE__ int __nv_vsubus4(int __a, int __b); #endif // CUDA_VERSION -__device__ double __nv_y0(double __a); -__device__ float __nv_y0f(float __a); -__device__ double __nv_y1(double __a); -__device__ float __nv_y1f(float __a); -__device__ float __nv_ynf(int __a, float __b); -__device__ double __nv_yn(int __a, double __b); +__DEVICE__ double __nv_y0(double __a); +__DEVICE__ float __nv_y0f(float __a); +__DEVICE__ double __nv_y1(double __a); +__DEVICE__ float __nv_y1f(float __a); +__DEVICE__ float __nv_ynf(int __a, float __b); +__DEVICE__ double __nv_yn(int __a, double __b); +#if defined(__cplusplus) } // extern "C" +#endif #endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__ diff --git a/lib/include/__clang_cuda_math_forward_declares.h b/lib/include/__clang_cuda_math_forward_declares.h index c31b1f4cd..0afe4db55 100644 --- a/lib/include/__clang_cuda_math_forward_declares.h +++ b/lib/include/__clang_cuda_math_forward_declares.h @@ -1,22 +1,8 @@ /*===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -34,14 +20,37 @@ // would preclude the use of our own __device__ overloads for these functions. 
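The libdevice hunks above make two mechanical changes: every hard-coded __device__ qualifier becomes a __DEVICE__ macro, and the extern "C" wrapper is now closed conditionally on __cplusplus, so the declarations remain usable from plain C and from the OpenMP wrappers. A minimal sketch of how these declarations are presumably meant to expand, assuming the macro setup near the top of this header (not visible in this hunk):

/* Sketch only; the authoritative definitions live in the header itself. */
#if defined(__cplusplus)
extern "C" {
#endif

#if defined(_OPENMP)
#define __DEVICE__            /* plain C prototype for OpenMP offloading */
#elif defined(__CUDA__)
#define __DEVICE__ __device__ /* device-side declaration under CUDA */
#endif

__DEVICE__ float __nv_sqrtf(float __a); /* CUDA: __device__ float __nv_sqrtf(float __a); */

#if defined(__cplusplus)
} /* extern "C" */
#endif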
#pragma push_macro("__DEVICE__") +#ifdef _OPENMP +#define __DEVICE__ static __inline__ __attribute__((always_inline)) +#else #define __DEVICE__ \ static __inline__ __attribute__((always_inline)) __attribute__((device)) +#endif -__DEVICE__ double abs(double); -__DEVICE__ float abs(float); -__DEVICE__ int abs(int); +// For C++ 17 we need to include noexcept attribute to be compatible +// with the header-defined version. This may be removed once +// variant is supported. +#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L +#define __NOEXCEPT noexcept +#else +#define __NOEXCEPT +#endif + +#if !(defined(_OPENMP) && defined(__cplusplus)) __DEVICE__ long abs(long); __DEVICE__ long long abs(long long); +__DEVICE__ double abs(double); +__DEVICE__ float abs(float); +#endif +// While providing the CUDA declarations and definitions for math functions, +// we may manually define additional functions. +// TODO: Once variant is supported the additional functions will have +// to be removed. +#if defined(_OPENMP) && defined(__cplusplus) +__DEVICE__ const double abs(const double); +__DEVICE__ const float abs(const float); +#endif +__DEVICE__ int abs(int) __NOEXCEPT; __DEVICE__ double acos(double); __DEVICE__ float acos(float); __DEVICE__ double acosh(double); @@ -76,8 +85,8 @@ __DEVICE__ double exp(double); __DEVICE__ float exp(float); __DEVICE__ double expm1(double); __DEVICE__ float expm1(float); -__DEVICE__ double fabs(double); -__DEVICE__ float fabs(float); +__DEVICE__ double fabs(double) __NOEXCEPT; +__DEVICE__ float fabs(float) __NOEXCEPT; __DEVICE__ double fdim(double, double); __DEVICE__ float fdim(float, float); __DEVICE__ double floor(double); @@ -98,12 +107,18 @@ __DEVICE__ double hypot(double, double); __DEVICE__ float hypot(float, float); __DEVICE__ int ilogb(double); __DEVICE__ int ilogb(float); +#ifdef _MSC_VER +__DEVICE__ bool isfinite(long double); +#endif __DEVICE__ bool isfinite(double); __DEVICE__ bool isfinite(float); __DEVICE__ bool isgreater(double, double); __DEVICE__ bool isgreaterequal(double, double); __DEVICE__ bool isgreaterequal(float, float); __DEVICE__ bool isgreater(float, float); +#ifdef _MSC_VER +__DEVICE__ bool isinf(long double); +#endif __DEVICE__ bool isinf(double); __DEVICE__ bool isinf(float); __DEVICE__ bool isless(double, double); @@ -112,18 +127,21 @@ __DEVICE__ bool islessequal(float, float); __DEVICE__ bool isless(float, float); __DEVICE__ bool islessgreater(double, double); __DEVICE__ bool islessgreater(float, float); +#ifdef _MSC_VER +__DEVICE__ bool isnan(long double); +#endif __DEVICE__ bool isnan(double); __DEVICE__ bool isnan(float); __DEVICE__ bool isnormal(double); __DEVICE__ bool isnormal(float); __DEVICE__ bool isunordered(double, double); __DEVICE__ bool isunordered(float, float); -__DEVICE__ long labs(long); +__DEVICE__ long labs(long) __NOEXCEPT; __DEVICE__ double ldexp(double, int); __DEVICE__ float ldexp(float, int); __DEVICE__ double lgamma(double); __DEVICE__ float lgamma(float); -__DEVICE__ long long llabs(long long); +__DEVICE__ long long llabs(long long) __NOEXCEPT; __DEVICE__ long long llrint(double); __DEVICE__ long long llrint(float); __DEVICE__ double log10(double); @@ -134,6 +152,9 @@ __DEVICE__ double log2(double); __DEVICE__ float log2(float); __DEVICE__ double logb(double); __DEVICE__ float logb(float); +#if defined(_OPENMP) && defined(__cplusplus) +__DEVICE__ long double log(long double); +#endif __DEVICE__ double log(double); __DEVICE__ float log(float); __DEVICE__ long lrint(double); @@ -281,6 +302,7 @@ 
_GLIBCXX_END_NAMESPACE_VERSION } // namespace std #endif +#undef __NOEXCEPT #pragma pop_macro("__DEVICE__") #endif diff --git a/lib/include/__clang_cuda_runtime_wrapper.h b/lib/include/__clang_cuda_runtime_wrapper.h index f05c0454a..3e362dd96 100644 --- a/lib/include/__clang_cuda_runtime_wrapper.h +++ b/lib/include/__clang_cuda_runtime_wrapper.h @@ -1,22 +1,8 @@ /*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -62,7 +48,7 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 10000 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 10010 #error "Unsupported CUDA version!" #endif @@ -426,5 +412,15 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const { #pragma pop_macro("__USE_FAST_MATH__") #pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__") +// CUDA runtime uses this undocumented function to access kernel launch +// configuration. The declaration is in crt/device_functions.h but that file +// includes a lot of other stuff we don't want. Instead, we'll provide our own +// declaration for it here. 
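Before the guarded declaration below, a word on how it is used: under the <<<grid, block, sharedMem, stream>>> launch syntax, the compiler-generated host code is expected to record the configuration via __cudaPushCallConfiguration before invoking the kernel's host stub, where the runtime retrieves it. A hedged sketch of that sequence; launch_scale and scale_kernel_stub are invented names for illustration, and the real lowering is the compiler's business, not this header's:

/* Assumes dim3 and the __cudaPushCallConfiguration declaration just below. */
extern void scale_kernel_stub(float *__data, unsigned __n); /* hypothetical stub */

static void launch_scale(float *__data, unsigned __n) {
  dim3 __grid((__n + 255) / 256); /* one block per 256 elements */
  dim3 __block(256);
  /* Record the launch configuration for the runtime to pick up inside the
     stub; sharedMem and stream fall back to their defaults (0). */
  __cudaPushCallConfiguration(__grid, __block);
  scale_kernel_stub(__data, __n); /* completes the actual launch */
}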
+#if CUDA_VERSION >= 9020 +extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim, + size_t sharedMem = 0, + void *stream = 0); +#endif + #endif // __CUDA__ #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__ diff --git a/lib/include/__stddef_max_align_t.h b/lib/include/__stddef_max_align_t.h index 1e10ca986..e3b439285 100644 --- a/lib/include/__stddef_max_align_t.h +++ b/lib/include/__stddef_max_align_t.h @@ -1,24 +1,8 @@ /*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---=== * - * Copyright (c) 2014 Chandler Carruth - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/__wmmintrin_aes.h b/lib/include/__wmmintrin_aes.h index 70c355efc..f540319c7 100644 --- a/lib/include/__wmmintrin_aes.h +++ b/lib/include/__wmmintrin_aes.h @@ -1,22 +1,8 @@ /*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/__wmmintrin_pclmul.h b/lib/include/__wmmintrin_pclmul.h index e0f928796..fef4b93db 100644 --- a/lib/include/__wmmintrin_pclmul.h +++ b/lib/include/__wmmintrin_pclmul.h @@ -1,22 +1,8 @@ /*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/adxintrin.h b/lib/include/adxintrin.h index d6c454db8..72b9ed08f 100644 --- a/lib/include/adxintrin.h +++ b/lib/include/adxintrin.h @@ -1,22 +1,8 @@ /*===---- adxintrin.h - ADX intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/altivec.h b/lib/include/altivec.h index 2dc6adb90..4008440b2 100644 --- a/lib/include/altivec.h +++ b/lib/include/altivec.h @@ -1,22 +1,8 @@ /*===---- altivec.h - Standard header for type generic math ---------------===*\ * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ diff --git a/lib/include/ammintrin.h b/lib/include/ammintrin.h index 680b4465e..3806be6eb 100644 --- a/lib/include/ammintrin.h +++ b/lib/include/ammintrin.h @@ -1,22 +1,8 @@ /*===---- ammintrin.h - SSE4a intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/arm64intr.h b/lib/include/arm64intr.h index be5228361..4943b2db6 100644 --- a/lib/include/arm64intr.h +++ b/lib/include/arm64intr.h @@ -1,22 +1,8 @@ /*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/arm_acle.h b/lib/include/arm_acle.h index ab2589798..096cc261a 100644 --- a/lib/include/arm_acle.h +++ b/lib/include/arm_acle.h @@ -1,22 +1,8 @@ /*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -611,6 +597,14 @@ __crc32cd(uint32_t __a, uint64_t __b) { } #endif +/* Armv8.3-A Javascript conversion intrinsic */ +#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_JCVT) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +__jcvt(double __a) { + return __builtin_arm_jcvt(__a); +} +#endif + /* 10.1 Special register intrinsics */ #define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg) #define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg) @@ -619,6 +613,16 @@ __crc32cd(uint32_t __a, uint64_t __b) { #define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v) #define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v) +// Memory Tagging Extensions (MTE) Intrinsics +#if __ARM_FEATURE_MEMORY_TAGGING +#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask) +#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset) +#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded) +#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr) +#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr) +#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb) +#endif + #if defined(__cplusplus) } #endif diff --git a/lib/include/arm_neon.h b/lib/include/arm_neon.h index d6765b36d..694bdfc9c 100644 --- a/lib/include/arm_neon.h +++ b/lib/include/arm_neon.h @@ -44247,13 +44247,13 @@ __ai float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) #endif #if defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ -__ai float32x4_t vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlalq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; } #else -__ai float32x4_t vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); @@ -44262,7 +44262,7 @@ __ai float32x4_t vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_ __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } -__ai float32x4_t __noswap_vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t __noswap_vfmlalq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlalq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; @@ -44270,13 +44270,13 @@ __ai float32x4_t __noswap_vfmlalq_high_u32(float32x4_t __p0, float16x8_t __p1, f #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x2_t vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlal_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; } #else -__ai float32x2_t vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, 
float16x4_t __p2) { +__ai float32x2_t vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); @@ -44285,7 +44285,7 @@ __ai float32x2_t vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai float32x2_t __noswap_vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t __noswap_vfmlal_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlal_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; @@ -44293,13 +44293,13 @@ __ai float32x2_t __noswap_vfmlal_high_u32(float32x2_t __p0, float16x4_t __p1, fl #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x4_t vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlalq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; } #else -__ai float32x4_t vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); @@ -44308,7 +44308,7 @@ __ai float32x4_t vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } -__ai float32x4_t __noswap_vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t __noswap_vfmlalq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlalq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; @@ -44316,13 +44316,13 @@ __ai float32x4_t __noswap_vfmlalq_low_u32(float32x4_t __p0, float16x8_t __p1, fl #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x2_t vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlal_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; } #else -__ai float32x2_t vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); @@ -44331,7 +44331,7 @@ __ai float32x2_t vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai float32x2_t __noswap_vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t __noswap_vfmlal_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) 
__builtin_neon_vfmlal_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; @@ -44339,13 +44339,13 @@ __ai float32x2_t __noswap_vfmlal_low_u32(float32x2_t __p0, float16x4_t __p1, flo #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x4_t vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlslq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; } #else -__ai float32x4_t vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); @@ -44354,7 +44354,7 @@ __ai float32x4_t vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_ __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } -__ai float32x4_t __noswap_vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t __noswap_vfmlslq_high_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlslq_high_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; @@ -44362,13 +44362,13 @@ __ai float32x4_t __noswap_vfmlslq_high_u32(float32x4_t __p0, float16x8_t __p1, f #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x2_t vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlsl_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; } #else -__ai float32x2_t vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); @@ -44377,7 +44377,7 @@ __ai float32x2_t vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai float32x2_t __noswap_vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t __noswap_vfmlsl_high_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlsl_high_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; @@ -44385,13 +44385,13 @@ __ai float32x2_t __noswap_vfmlsl_high_u32(float32x2_t __p0, float16x4_t __p1, fl #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x4_t vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlslq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; } #else -__ai float32x4_t vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { 
float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); @@ -44400,7 +44400,7 @@ __ai float32x4_t vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } -__ai float32x4_t __noswap_vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float32x4_t __noswap_vfmlslq_low_f16(float32x4_t __p0, float16x8_t __p1, float16x8_t __p2) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vfmlslq_low_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); return __ret; @@ -44408,13 +44408,13 @@ __ai float32x4_t __noswap_vfmlslq_low_u32(float32x4_t __p0, float16x8_t __p1, fl #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x2_t vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlsl_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; } #else -__ai float32x2_t vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); @@ -44423,7 +44423,7 @@ __ai float32x2_t vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai float32x2_t __noswap_vfmlsl_low_u32(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float32x2_t __noswap_vfmlsl_low_f16(float32x2_t __p0, float16x4_t __p1, float16x4_t __p2) { float32x2_t __ret; __ret = (float32x2_t) __builtin_neon_vfmlsl_low_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); return __ret; @@ -64095,15 +64095,15 @@ __ai uint8x16_t vqtbl1q_u8(uint8x16_t __p0, uint8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, uint8x16_t __p1) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 32); return __ret; } #else -__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, uint8x16_t __p1) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64129,15 +64129,15 @@ __ai uint8x8_t vqtbl1_u8(uint8x16_t __p0, uint8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbl1_s8(int8x16_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl1_s8(int8x16_t __p0, uint8x8_t __p1) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 0); return __ret; } #else 
-__ai int8x8_t vqtbl1_s8(int8x16_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl1_s8(int8x16_t __p0, uint8x8_t __p1) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64203,17 +64203,17 @@ __ai uint8x16_t vqtbl2q_u8(uint8x16x2_t __p0, uint8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, uint8x16_t __p1) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 32); return __ret; } #else -__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, uint8x16_t __p1) { int8x16x2_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64241,17 +64241,17 @@ __ai uint8x8_t vqtbl2_u8(uint8x16x2_t __p0, uint8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, uint8x8_t __p1) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 0); return __ret; } #else -__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, uint8x8_t __p1) { int8x16x2_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64320,18 +64320,18 @@ __ai uint8x16_t vqtbl3q_u8(uint8x16x3_t __p0, uint8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, uint8x16_t __p1) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 32); return __ret; } #else -__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, 
uint8x16_t __p1) { int8x16x3_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64360,18 +64360,18 @@ __ai uint8x8_t vqtbl3_u8(uint8x16x3_t __p0, uint8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, uint8x8_t __p1) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 0); return __ret; } #else -__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, uint8x8_t __p1) { int8x16x3_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64443,19 +64443,19 @@ __ai uint8x16_t vqtbl4q_u8(uint8x16x4_t __p0, uint8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, uint8x16_t __p1) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 32); return __ret; } #else -__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, int8x16_t __p1) { +__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, uint8x16_t __p1) { int8x16x4_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 
0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64485,19 +64485,19 @@ __ai uint8x8_t vqtbl4_u8(uint8x16x4_t __p0, uint8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, uint8x8_t __p1) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 0); return __ret; } #else -__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, int8x8_t __p1) { +__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, uint8x8_t __p1) { int8x16x4_t __rev0; __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64560,16 +64560,16 @@ __ai uint8x16_t vqtbx1q_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, uint8x16_t __p2) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 32); return __ret; } #else -__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, uint8x16_t __p2) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64596,16 +64596,16 @@ __ai uint8x8_t vqtbx1_u8(uint8x8_t __p0, uint8x16_t __p1, uint8x8_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, uint8x8_t __p2) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 0); return __ret; } #else -__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, 
int8x8_t __p2) { +__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, uint8x8_t __p2) { int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64674,18 +64674,18 @@ __ai uint8x16_t vqtbx2q_u8(uint8x16_t __p0, uint8x16x2_t __p1, uint8x16_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, uint8x16_t __p2) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 32); return __ret; } #else -__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, uint8x16_t __p2) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x2_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64714,18 +64714,18 @@ __ai uint8x8_t vqtbx2_u8(uint8x8_t __p0, uint8x16x2_t __p1, uint8x8_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, uint8x8_t __p2) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 0); return __ret; } #else -__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, uint8x8_t __p2) { int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x2_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64797,19 +64797,19 @@ __ai 
uint8x16_t vqtbx3q_u8(uint8x16_t __p0, uint8x16x3_t __p1, uint8x16_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, uint8x16_t __p2) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 32); return __ret; } #else -__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, uint8x16_t __p2) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x3_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64839,19 +64839,19 @@ __ai uint8x8_t vqtbx3_u8(uint8x8_t __p0, uint8x16x3_t __p1, uint8x8_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, uint8x8_t __p2) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 0); return __ret; } #else -__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, uint8x8_t __p2) { int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x3_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64926,20 +64926,20 @@ __ai uint8x16_t vqtbx4q_u8(uint8x16_t __p0, uint8x16x4_t __p1, uint8x16_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, uint8x16_t __p2) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, 
(int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 32); return __ret; } #else -__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, int8x16_t __p2) { +__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, uint8x16_t __p2) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x4_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -64970,20 +64970,20 @@ __ai uint8x8_t vqtbx4_u8(uint8x8_t __p0, uint8x16x4_t __p1, uint8x8_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, uint8x8_t __p2) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 0); return __ret; } #else -__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, int8x8_t __p2) { +__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, uint8x8_t __p2) { int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); int8x16x4_t __rev1; __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -66293,13 +66293,13 @@ __ai int64_t vshld_s64(int64_t __p0, int64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) { +__ai uint8_t vsqaddb_u8(uint8_t __p0, int8_t __p1) { uint8_t __ret; __ret = (uint8_t) __builtin_neon_vsqaddb_u8(__p0, __p1); return __ret; } #else -__ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) { +__ai uint8_t 
vsqaddb_u8(uint8_t __p0, int8_t __p1) { uint8_t __ret; __ret = (uint8_t) __builtin_neon_vsqaddb_u8(__p0, __p1); return __ret; @@ -66307,13 +66307,13 @@ __ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) { +__ai uint32_t vsqadds_u32(uint32_t __p0, int32_t __p1) { uint32_t __ret; __ret = (uint32_t) __builtin_neon_vsqadds_u32(__p0, __p1); return __ret; } #else -__ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) { +__ai uint32_t vsqadds_u32(uint32_t __p0, int32_t __p1) { uint32_t __ret; __ret = (uint32_t) __builtin_neon_vsqadds_u32(__p0, __p1); return __ret; @@ -66321,13 +66321,13 @@ __ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vsqaddd_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vsqaddd_u64(__p0, __p1); return __ret; } #else -__ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vsqaddd_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vsqaddd_u64(__p0, __p1); return __ret; @@ -66335,13 +66335,13 @@ __ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) { +__ai uint16_t vsqaddh_u16(uint16_t __p0, int16_t __p1) { uint16_t __ret; __ret = (uint16_t) __builtin_neon_vsqaddh_u16(__p0, __p1); return __ret; } #else -__ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) { +__ai uint16_t vsqaddh_u16(uint16_t __p0, int16_t __p1) { uint16_t __ret; __ret = (uint16_t) __builtin_neon_vsqaddh_u16(__p0, __p1); return __ret; @@ -66349,15 +66349,15 @@ __ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) { +__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, int8x16_t __p1) { uint8x16_t __ret; __ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48); return __ret; } #else -__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) { +__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, int8x16_t __p1) { uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); uint8x16_t __ret; __ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -66366,15 +66366,15 @@ __ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) { +__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, int32x4_t __p1) { uint32x4_t __ret; __ret = (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50); return __ret; } #else -__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) { +__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, int32x4_t __p1) { uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + int32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); uint32x4_t __ret; __ret 
= (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50); __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); @@ -66383,15 +66383,15 @@ __ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) { +__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, int64x2_t __p1) { uint64x2_t __ret; __ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51); return __ret; } #else -__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) { +__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, int64x2_t __p1) { uint64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - uint64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + int64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); uint64x2_t __ret; __ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51); __ret = __builtin_shufflevector(__ret, __ret, 1, 0); @@ -66400,15 +66400,15 @@ __ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) { +__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, int16x8_t __p1) { uint16x8_t __ret; __ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49); return __ret; } #else -__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) { +__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, int16x8_t __p1) { uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - uint16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); uint16x8_t __ret; __ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -66417,15 +66417,15 @@ __ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) { +__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, int8x8_t __p1) { uint8x8_t __ret; __ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16); return __ret; } #else -__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) { +__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, int8x8_t __p1) { uint8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); uint8x8_t __ret; __ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -66434,15 +66434,15 @@ __ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) { +__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, int32x2_t __p1) { uint32x2_t __ret; __ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18); return __ret; } #else -__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) { +__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, int32x2_t __p1) { uint32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - uint32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + int32x2_t __rev1; 
__rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); uint32x2_t __ret; __ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18); __ret = __builtin_shufflevector(__ret, __ret, 1, 0); @@ -66451,13 +66451,13 @@ __ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) { +__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, int64x1_t __p1) { uint64x1_t __ret; __ret = (uint64x1_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19); return __ret; } #else -__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) { +__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, int64x1_t __p1) { uint64x1_t __ret; __ret = (uint64x1_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19); return __ret; @@ -66465,15 +66465,15 @@ __ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, uint16x4_t __p1) { +__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, int16x4_t __p1) { uint16x4_t __ret; __ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17); return __ret; } #else -__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, uint16x4_t __p1) { +__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, int16x4_t __p1) { uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + int16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); uint16x4_t __ret; __ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17); __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); @@ -68919,13 +68919,13 @@ __ai int64_t vtstd_s64(int64_t __p0, int64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) { +__ai int8_t vuqaddb_s8(int8_t __p0, uint8_t __p1) { int8_t __ret; __ret = (int8_t) __builtin_neon_vuqaddb_s8(__p0, __p1); return __ret; } #else -__ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) { +__ai int8_t vuqaddb_s8(int8_t __p0, uint8_t __p1) { int8_t __ret; __ret = (int8_t) __builtin_neon_vuqaddb_s8(__p0, __p1); return __ret; @@ -68933,13 +68933,13 @@ __ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) { +__ai int32_t vuqadds_s32(int32_t __p0, uint32_t __p1) { int32_t __ret; __ret = (int32_t) __builtin_neon_vuqadds_s32(__p0, __p1); return __ret; } #else -__ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) { +__ai int32_t vuqadds_s32(int32_t __p0, uint32_t __p1) { int32_t __ret; __ret = (int32_t) __builtin_neon_vuqadds_s32(__p0, __p1); return __ret; @@ -68947,13 +68947,13 @@ __ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) { +__ai int64_t vuqaddd_s64(int64_t __p0, uint64_t __p1) { int64_t __ret; __ret = (int64_t) __builtin_neon_vuqaddd_s64(__p0, __p1); return __ret; } #else -__ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) { +__ai int64_t vuqaddd_s64(int64_t __p0, uint64_t __p1) { int64_t __ret; __ret = (int64_t) __builtin_neon_vuqaddd_s64(__p0, __p1); return __ret; @@ -68961,13 +68961,13 @@ __ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) { +__ai int16_t vuqaddh_s16(int16_t __p0, uint16_t __p1) { int16_t __ret; __ret = (int16_t) __builtin_neon_vuqaddh_s16(__p0, __p1); return 
__ret; } #else -__ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) { +__ai int16_t vuqaddh_s16(int16_t __p0, uint16_t __p1) { int16_t __ret; __ret = (int16_t) __builtin_neon_vuqaddh_s16(__p0, __p1); return __ret; @@ -68975,15 +68975,15 @@ __ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) { +__ai int8x16_t vuqaddq_s8(int8x16_t __p0, uint8x16_t __p1) { int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32); return __ret; } #else -__ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) { +__ai int8x16_t vuqaddq_s8(int8x16_t __p0, uint8x16_t __p1) { int8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); int8x16_t __ret; __ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32); __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); @@ -68992,15 +68992,15 @@ __ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) { +__ai int32x4_t vuqaddq_s32(int32x4_t __p0, uint32x4_t __p1) { int32x4_t __ret; __ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34); return __ret; } #else -__ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) { +__ai int32x4_t vuqaddq_s32(int32x4_t __p0, uint32x4_t __p1) { int32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - int32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); int32x4_t __ret; __ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34); __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); @@ -69009,15 +69009,15 @@ __ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) { +__ai int64x2_t vuqaddq_s64(int64x2_t __p0, uint64x2_t __p1) { int64x2_t __ret; __ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35); return __ret; } #else -__ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) { +__ai int64x2_t vuqaddq_s64(int64x2_t __p0, uint64x2_t __p1) { int64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - int64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + uint64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); int64x2_t __ret; __ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35); __ret = __builtin_shufflevector(__ret, __ret, 1, 0); @@ -69026,15 +69026,15 @@ __ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) { +__ai int16x8_t vuqaddq_s16(int16x8_t __p0, uint16x8_t __p1) { int16x8_t __ret; __ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33); return __ret; } #else -__ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) { +__ai int16x8_t vuqaddq_s16(int16x8_t __p0, uint16x8_t __p1) { int16x8_t __rev0; __rev0 = 
__builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - int16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int16x8_t __ret; __ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -69043,15 +69043,15 @@ __ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) { +__ai int8x8_t vuqadd_s8(int8x8_t __p0, uint8x8_t __p1) { int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0); return __ret; } #else -__ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) { +__ai int8x8_t vuqadd_s8(int8x8_t __p0, uint8x8_t __p1) { int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); int8x8_t __ret; __ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); @@ -69060,15 +69060,15 @@ __ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) { +__ai int32x2_t vuqadd_s32(int32x2_t __p0, uint32x2_t __p1) { int32x2_t __ret; __ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2); return __ret; } #else -__ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) { +__ai int32x2_t vuqadd_s32(int32x2_t __p0, uint32x2_t __p1) { int32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - int32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + uint32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); int32x2_t __ret; __ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2); __ret = __builtin_shufflevector(__ret, __ret, 1, 0); @@ -69077,13 +69077,13 @@ __ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) { +__ai int64x1_t vuqadd_s64(int64x1_t __p0, uint64x1_t __p1) { int64x1_t __ret; __ret = (int64x1_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3); return __ret; } #else -__ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) { +__ai int64x1_t vuqadd_s64(int64x1_t __p0, uint64x1_t __p1) { int64x1_t __ret; __ret = (int64x1_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3); return __ret; @@ -69091,15 +69091,15 @@ __ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai int16x4_t vuqadd_s16(int16x4_t __p0, int16x4_t __p1) { +__ai int16x4_t vuqadd_s16(int16x4_t __p0, uint16x4_t __p1) { int16x4_t __ret; __ret = (int16x4_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1); return __ret; } #else -__ai int16x4_t vuqadd_s16(int16x4_t __p0, int16x4_t __p1) { +__ai int16x4_t vuqadd_s16(int16x4_t __p0, uint16x4_t __p1) { int16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - int16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); int16x4_t __ret; __ret = (int16x4_t) 
__builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1); __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); @@ -71912,16 +71912,16 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in #if defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_lane_high_u32(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \ +#define vfmlalq_lane_high_f16(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \ float32x4_t __s0_264 = __p0_264; \ float16x8_t __s1_264 = __p1_264; \ float16x4_t __s2_264 = __p2_264; \ float32x4_t __ret_264; \ - __ret_264 = vfmlalq_high_u32(__s0_264, __s1_264, (float16x8_t) {vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264)}); \ + __ret_264 = vfmlalq_high_f16(__s0_264, __s1_264, (float16x8_t) {vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264), vget_lane_f16(__s2_264, __p3_264)}); \ __ret_264; \ }) #else -#define vfmlalq_lane_high_u32(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \ +#define vfmlalq_lane_high_f16(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \ float32x4_t __s0_265 = __p0_265; \ float16x8_t __s1_265 = __p1_265; \ float16x4_t __s2_265 = __p2_265; \ @@ -71929,23 +71929,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_265; __rev1_265 = __builtin_shufflevector(__s1_265, __s1_265, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_265; __rev2_265 = __builtin_shufflevector(__s2_265, __s2_265, 3, 2, 1, 0); \ float32x4_t __ret_265; \ - __ret_265 = __noswap_vfmlalq_high_u32(__rev0_265, __rev1_265, (float16x8_t) {__noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265)}); \ + __ret_265 = __noswap_vfmlalq_high_f16(__rev0_265, __rev1_265, (float16x8_t) {__noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265), __noswap_vget_lane_f16(__rev2_265, __p3_265)}); \ __ret_265 = __builtin_shufflevector(__ret_265, __ret_265, 3, 2, 1, 0); \ __ret_265; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_lane_high_u32(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \ +#define vfmlal_lane_high_f16(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \ float32x2_t __s0_266 = __p0_266; \ float16x4_t __s1_266 = __p1_266; \ float16x4_t __s2_266 = __p2_266; \ float32x2_t __ret_266; \ - __ret_266 = vfmlal_high_u32(__s0_266, __s1_266, (float16x4_t) {vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266)}); \ + __ret_266 = vfmlal_high_f16(__s0_266, 
__s1_266, (float16x4_t) {vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266), vget_lane_f16(__s2_266, __p3_266)}); \ __ret_266; \ }) #else -#define vfmlal_lane_high_u32(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \ +#define vfmlal_lane_high_f16(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \ float32x2_t __s0_267 = __p0_267; \ float16x4_t __s1_267 = __p1_267; \ float16x4_t __s2_267 = __p2_267; \ @@ -71953,23 +71953,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_267; __rev1_267 = __builtin_shufflevector(__s1_267, __s1_267, 3, 2, 1, 0); \ float16x4_t __rev2_267; __rev2_267 = __builtin_shufflevector(__s2_267, __s2_267, 3, 2, 1, 0); \ float32x2_t __ret_267; \ - __ret_267 = __noswap_vfmlal_high_u32(__rev0_267, __rev1_267, (float16x4_t) {__noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267)}); \ + __ret_267 = __noswap_vfmlal_high_f16(__rev0_267, __rev1_267, (float16x4_t) {__noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267), __noswap_vget_lane_f16(__rev2_267, __p3_267)}); \ __ret_267 = __builtin_shufflevector(__ret_267, __ret_267, 1, 0); \ __ret_267; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_lane_low_u32(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \ +#define vfmlalq_lane_low_f16(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \ float32x4_t __s0_268 = __p0_268; \ float16x8_t __s1_268 = __p1_268; \ float16x4_t __s2_268 = __p2_268; \ float32x4_t __ret_268; \ - __ret_268 = vfmlalq_low_u32(__s0_268, __s1_268, (float16x8_t) {vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268)}); \ + __ret_268 = vfmlalq_low_f16(__s0_268, __s1_268, (float16x8_t) {vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268), vget_lane_f16(__s2_268, __p3_268)}); \ __ret_268; \ }) #else -#define vfmlalq_lane_low_u32(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \ +#define vfmlalq_lane_low_f16(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \ float32x4_t __s0_269 = __p0_269; \ float16x8_t __s1_269 = __p1_269; \ float16x4_t __s2_269 = __p2_269; \ @@ -71977,23 +71977,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_269; __rev1_269 = __builtin_shufflevector(__s1_269, __s1_269, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_269; __rev2_269 = __builtin_shufflevector(__s2_269, __s2_269, 3, 2, 1, 0); \ float32x4_t __ret_269; \ - __ret_269 = __noswap_vfmlalq_low_u32(__rev0_269, __rev1_269, (float16x8_t) {__noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269)}); \ + __ret_269 
= __noswap_vfmlalq_low_f16(__rev0_269, __rev1_269, (float16x8_t) {__noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269), __noswap_vget_lane_f16(__rev2_269, __p3_269)}); \ __ret_269 = __builtin_shufflevector(__ret_269, __ret_269, 3, 2, 1, 0); \ __ret_269; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_lane_low_u32(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \ +#define vfmlal_lane_low_f16(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \ float32x2_t __s0_270 = __p0_270; \ float16x4_t __s1_270 = __p1_270; \ float16x4_t __s2_270 = __p2_270; \ float32x2_t __ret_270; \ - __ret_270 = vfmlal_low_u32(__s0_270, __s1_270, (float16x4_t) {vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270)}); \ + __ret_270 = vfmlal_low_f16(__s0_270, __s1_270, (float16x4_t) {vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270), vget_lane_f16(__s2_270, __p3_270)}); \ __ret_270; \ }) #else -#define vfmlal_lane_low_u32(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \ +#define vfmlal_lane_low_f16(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \ float32x2_t __s0_271 = __p0_271; \ float16x4_t __s1_271 = __p1_271; \ float16x4_t __s2_271 = __p2_271; \ @@ -72001,23 +72001,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_271; __rev1_271 = __builtin_shufflevector(__s1_271, __s1_271, 3, 2, 1, 0); \ float16x4_t __rev2_271; __rev2_271 = __builtin_shufflevector(__s2_271, __s2_271, 3, 2, 1, 0); \ float32x2_t __ret_271; \ - __ret_271 = __noswap_vfmlal_low_u32(__rev0_271, __rev1_271, (float16x4_t) {__noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271)}); \ + __ret_271 = __noswap_vfmlal_low_f16(__rev0_271, __rev1_271, (float16x4_t) {__noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271), __noswap_vget_lane_f16(__rev2_271, __p3_271)}); \ __ret_271 = __builtin_shufflevector(__ret_271, __ret_271, 1, 0); \ __ret_271; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_laneq_high_u32(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \ +#define vfmlalq_laneq_high_f16(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \ float32x4_t __s0_272 = __p0_272; \ float16x8_t __s1_272 = __p1_272; \ float16x8_t __s2_272 = __p2_272; \ float32x4_t __ret_272; \ - __ret_272 = vfmlalq_high_u32(__s0_272, __s1_272, (float16x8_t) {vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272)}); \ + __ret_272 = vfmlalq_high_f16(__s0_272, __s1_272, (float16x8_t) {vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), vgetq_lane_f16(__s2_272, __p3_272), 
vgetq_lane_f16(__s2_272, __p3_272)}); \ __ret_272; \ }) #else -#define vfmlalq_laneq_high_u32(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \ +#define vfmlalq_laneq_high_f16(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \ float32x4_t __s0_273 = __p0_273; \ float16x8_t __s1_273 = __p1_273; \ float16x8_t __s2_273 = __p2_273; \ @@ -72025,23 +72025,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_273; __rev1_273 = __builtin_shufflevector(__s1_273, __s1_273, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_273; __rev2_273 = __builtin_shufflevector(__s2_273, __s2_273, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x4_t __ret_273; \ - __ret_273 = __noswap_vfmlalq_high_u32(__rev0_273, __rev1_273, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273)}); \ + __ret_273 = __noswap_vfmlalq_high_f16(__rev0_273, __rev1_273, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273), __noswap_vgetq_lane_f16(__rev2_273, __p3_273)}); \ __ret_273 = __builtin_shufflevector(__ret_273, __ret_273, 3, 2, 1, 0); \ __ret_273; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_laneq_high_u32(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \ +#define vfmlal_laneq_high_f16(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \ float32x2_t __s0_274 = __p0_274; \ float16x4_t __s1_274 = __p1_274; \ float16x8_t __s2_274 = __p2_274; \ float32x2_t __ret_274; \ - __ret_274 = vfmlal_high_u32(__s0_274, __s1_274, (float16x4_t) {vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274)}); \ + __ret_274 = vfmlal_high_f16(__s0_274, __s1_274, (float16x4_t) {vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274), vgetq_lane_f16(__s2_274, __p3_274)}); \ __ret_274; \ }) #else -#define vfmlal_laneq_high_u32(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \ +#define vfmlal_laneq_high_f16(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \ float32x2_t __s0_275 = __p0_275; \ float16x4_t __s1_275 = __p1_275; \ float16x8_t __s2_275 = __p2_275; \ @@ -72049,23 +72049,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_275; __rev1_275 = __builtin_shufflevector(__s1_275, __s1_275, 3, 2, 1, 0); \ float16x8_t __rev2_275; __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x2_t __ret_275; \ - __ret_275 = __noswap_vfmlal_high_u32(__rev0_275, __rev1_275, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275)}); \ + __ret_275 = __noswap_vfmlal_high_f16(__rev0_275, __rev1_275, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, 
__p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275), __noswap_vgetq_lane_f16(__rev2_275, __p3_275)}); \ __ret_275 = __builtin_shufflevector(__ret_275, __ret_275, 1, 0); \ __ret_275; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_laneq_low_u32(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \ +#define vfmlalq_laneq_low_f16(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \ float32x4_t __s0_276 = __p0_276; \ float16x8_t __s1_276 = __p1_276; \ float16x8_t __s2_276 = __p2_276; \ float32x4_t __ret_276; \ - __ret_276 = vfmlalq_low_u32(__s0_276, __s1_276, (float16x8_t) {vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276)}); \ + __ret_276 = vfmlalq_low_f16(__s0_276, __s1_276, (float16x8_t) {vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276), vgetq_lane_f16(__s2_276, __p3_276)}); \ __ret_276; \ }) #else -#define vfmlalq_laneq_low_u32(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \ +#define vfmlalq_laneq_low_f16(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \ float32x4_t __s0_277 = __p0_277; \ float16x8_t __s1_277 = __p1_277; \ float16x8_t __s2_277 = __p2_277; \ @@ -72073,23 +72073,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_277; __rev1_277 = __builtin_shufflevector(__s1_277, __s1_277, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_277; __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x4_t __ret_277; \ - __ret_277 = __noswap_vfmlalq_low_u32(__rev0_277, __rev1_277, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277)}); \ + __ret_277 = __noswap_vfmlalq_low_f16(__rev0_277, __rev1_277, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277), __noswap_vgetq_lane_f16(__rev2_277, __p3_277)}); \ __ret_277 = __builtin_shufflevector(__ret_277, __ret_277, 3, 2, 1, 0); \ __ret_277; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_laneq_low_u32(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \ +#define vfmlal_laneq_low_f16(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \ float32x2_t __s0_278 = __p0_278; \ float16x4_t __s1_278 = __p1_278; \ float16x8_t __s2_278 = __p2_278; \ float32x2_t __ret_278; \ - __ret_278 = vfmlal_low_u32(__s0_278, __s1_278, (float16x4_t) {vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278)}); \ + __ret_278 = vfmlal_low_f16(__s0_278, __s1_278, (float16x4_t) {vgetq_lane_f16(__s2_278, 
__p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278), vgetq_lane_f16(__s2_278, __p3_278)}); \ __ret_278; \ }) #else -#define vfmlal_laneq_low_u32(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \ +#define vfmlal_laneq_low_f16(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \ float32x2_t __s0_279 = __p0_279; \ float16x4_t __s1_279 = __p1_279; \ float16x8_t __s2_279 = __p2_279; \ @@ -72097,23 +72097,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_279; __rev1_279 = __builtin_shufflevector(__s1_279, __s1_279, 3, 2, 1, 0); \ float16x8_t __rev2_279; __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x2_t __ret_279; \ - __ret_279 = __noswap_vfmlal_low_u32(__rev0_279, __rev1_279, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279)}); \ + __ret_279 = __noswap_vfmlal_low_f16(__rev0_279, __rev1_279, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279), __noswap_vgetq_lane_f16(__rev2_279, __p3_279)}); \ __ret_279 = __builtin_shufflevector(__ret_279, __ret_279, 1, 0); \ __ret_279; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_lane_high_u32(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \ +#define vfmlslq_lane_high_f16(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \ float32x4_t __s0_280 = __p0_280; \ float16x8_t __s1_280 = __p1_280; \ float16x4_t __s2_280 = __p2_280; \ float32x4_t __ret_280; \ - __ret_280 = vfmlslq_high_u32(__s0_280, __s1_280, (float16x8_t) {vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280)}); \ + __ret_280 = vfmlslq_high_f16(__s0_280, __s1_280, (float16x8_t) {vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280), vget_lane_f16(__s2_280, __p3_280)}); \ __ret_280; \ }) #else -#define vfmlslq_lane_high_u32(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \ +#define vfmlslq_lane_high_f16(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \ float32x4_t __s0_281 = __p0_281; \ float16x8_t __s1_281 = __p1_281; \ float16x4_t __s2_281 = __p2_281; \ @@ -72121,23 +72121,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_281; __rev1_281 = __builtin_shufflevector(__s1_281, __s1_281, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_281; __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 3, 2, 1, 0); \ float32x4_t __ret_281; \ - __ret_281 = __noswap_vfmlslq_high_u32(__rev0_281, __rev1_281, (float16x8_t) {__noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281)}); \ + __ret_281 = 
__noswap_vfmlslq_high_f16(__rev0_281, __rev1_281, (float16x8_t) {__noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281), __noswap_vget_lane_f16(__rev2_281, __p3_281)}); \ __ret_281 = __builtin_shufflevector(__ret_281, __ret_281, 3, 2, 1, 0); \ __ret_281; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_lane_high_u32(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \ +#define vfmlsl_lane_high_f16(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \ float32x2_t __s0_282 = __p0_282; \ float16x4_t __s1_282 = __p1_282; \ float16x4_t __s2_282 = __p2_282; \ float32x2_t __ret_282; \ - __ret_282 = vfmlsl_high_u32(__s0_282, __s1_282, (float16x4_t) {vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282)}); \ + __ret_282 = vfmlsl_high_f16(__s0_282, __s1_282, (float16x4_t) {vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282), vget_lane_f16(__s2_282, __p3_282)}); \ __ret_282; \ }) #else -#define vfmlsl_lane_high_u32(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \ +#define vfmlsl_lane_high_f16(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \ float32x2_t __s0_283 = __p0_283; \ float16x4_t __s1_283 = __p1_283; \ float16x4_t __s2_283 = __p2_283; \ @@ -72145,23 +72145,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_283; __rev1_283 = __builtin_shufflevector(__s1_283, __s1_283, 3, 2, 1, 0); \ float16x4_t __rev2_283; __rev2_283 = __builtin_shufflevector(__s2_283, __s2_283, 3, 2, 1, 0); \ float32x2_t __ret_283; \ - __ret_283 = __noswap_vfmlsl_high_u32(__rev0_283, __rev1_283, (float16x4_t) {__noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283)}); \ + __ret_283 = __noswap_vfmlsl_high_f16(__rev0_283, __rev1_283, (float16x4_t) {__noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283), __noswap_vget_lane_f16(__rev2_283, __p3_283)}); \ __ret_283 = __builtin_shufflevector(__ret_283, __ret_283, 1, 0); \ __ret_283; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_lane_low_u32(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \ +#define vfmlslq_lane_low_f16(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \ float32x4_t __s0_284 = __p0_284; \ float16x8_t __s1_284 = __p1_284; \ float16x4_t __s2_284 = __p2_284; \ float32x4_t __ret_284; \ - __ret_284 = vfmlslq_low_u32(__s0_284, __s1_284, (float16x8_t) {vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284)}); \ + __ret_284 = vfmlslq_low_f16(__s0_284, __s1_284, (float16x8_t) {vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, __p3_284), vget_lane_f16(__s2_284, 
__p3_284)}); \ __ret_284; \ }) #else -#define vfmlslq_lane_low_u32(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \ +#define vfmlslq_lane_low_f16(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \ float32x4_t __s0_285 = __p0_285; \ float16x8_t __s1_285 = __p1_285; \ float16x4_t __s2_285 = __p2_285; \ @@ -72169,23 +72169,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_285; __rev1_285 = __builtin_shufflevector(__s1_285, __s1_285, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_285; __rev2_285 = __builtin_shufflevector(__s2_285, __s2_285, 3, 2, 1, 0); \ float32x4_t __ret_285; \ - __ret_285 = __noswap_vfmlslq_low_u32(__rev0_285, __rev1_285, (float16x8_t) {__noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285)}); \ + __ret_285 = __noswap_vfmlslq_low_f16(__rev0_285, __rev1_285, (float16x8_t) {__noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285), __noswap_vget_lane_f16(__rev2_285, __p3_285)}); \ __ret_285 = __builtin_shufflevector(__ret_285, __ret_285, 3, 2, 1, 0); \ __ret_285; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_lane_low_u32(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \ +#define vfmlsl_lane_low_f16(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \ float32x2_t __s0_286 = __p0_286; \ float16x4_t __s1_286 = __p1_286; \ float16x4_t __s2_286 = __p2_286; \ float32x2_t __ret_286; \ - __ret_286 = vfmlsl_low_u32(__s0_286, __s1_286, (float16x4_t) {vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286)}); \ + __ret_286 = vfmlsl_low_f16(__s0_286, __s1_286, (float16x4_t) {vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286), vget_lane_f16(__s2_286, __p3_286)}); \ __ret_286; \ }) #else -#define vfmlsl_lane_low_u32(__p0_287, __p1_287, __p2_287, __p3_287) __extension__ ({ \ +#define vfmlsl_lane_low_f16(__p0_287, __p1_287, __p2_287, __p3_287) __extension__ ({ \ float32x2_t __s0_287 = __p0_287; \ float16x4_t __s1_287 = __p1_287; \ float16x4_t __s2_287 = __p2_287; \ @@ -72193,23 +72193,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_287; __rev1_287 = __builtin_shufflevector(__s1_287, __s1_287, 3, 2, 1, 0); \ float16x4_t __rev2_287; __rev2_287 = __builtin_shufflevector(__s2_287, __s2_287, 3, 2, 1, 0); \ float32x2_t __ret_287; \ - __ret_287 = __noswap_vfmlsl_low_u32(__rev0_287, __rev1_287, (float16x4_t) {__noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287)}); \ + __ret_287 = __noswap_vfmlsl_low_f16(__rev0_287, __rev1_287, (float16x4_t) {__noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287), __noswap_vget_lane_f16(__rev2_287, __p3_287)}); 
\ __ret_287 = __builtin_shufflevector(__ret_287, __ret_287, 1, 0); \ __ret_287; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_laneq_high_u32(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \ +#define vfmlslq_laneq_high_f16(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \ float32x4_t __s0_288 = __p0_288; \ float16x8_t __s1_288 = __p1_288; \ float16x8_t __s2_288 = __p2_288; \ float32x4_t __ret_288; \ - __ret_288 = vfmlslq_high_u32(__s0_288, __s1_288, (float16x8_t) {vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288)}); \ + __ret_288 = vfmlslq_high_f16(__s0_288, __s1_288, (float16x8_t) {vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288), vgetq_lane_f16(__s2_288, __p3_288)}); \ __ret_288; \ }) #else -#define vfmlslq_laneq_high_u32(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \ +#define vfmlslq_laneq_high_f16(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \ float32x4_t __s0_289 = __p0_289; \ float16x8_t __s1_289 = __p1_289; \ float16x8_t __s2_289 = __p2_289; \ @@ -72217,23 +72217,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_289; __rev1_289 = __builtin_shufflevector(__s1_289, __s1_289, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_289; __rev2_289 = __builtin_shufflevector(__s2_289, __s2_289, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x4_t __ret_289; \ - __ret_289 = __noswap_vfmlslq_high_u32(__rev0_289, __rev1_289, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289)}); \ + __ret_289 = __noswap_vfmlslq_high_f16(__rev0_289, __rev1_289, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289), __noswap_vgetq_lane_f16(__rev2_289, __p3_289)}); \ __ret_289 = __builtin_shufflevector(__ret_289, __ret_289, 3, 2, 1, 0); \ __ret_289; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_laneq_high_u32(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \ +#define vfmlsl_laneq_high_f16(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \ float32x2_t __s0_290 = __p0_290; \ float16x4_t __s1_290 = __p1_290; \ float16x8_t __s2_290 = __p2_290; \ float32x2_t __ret_290; \ - __ret_290 = vfmlsl_high_u32(__s0_290, __s1_290, (float16x4_t) {vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290)}); \ + __ret_290 = vfmlsl_high_f16(__s0_290, __s1_290, (float16x4_t) {vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), vgetq_lane_f16(__s2_290, __p3_290), 
vgetq_lane_f16(__s2_290, __p3_290)}); \ __ret_290; \ }) #else -#define vfmlsl_laneq_high_u32(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \ +#define vfmlsl_laneq_high_f16(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \ float32x2_t __s0_291 = __p0_291; \ float16x4_t __s1_291 = __p1_291; \ float16x8_t __s2_291 = __p2_291; \ @@ -72241,23 +72241,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_291; __rev1_291 = __builtin_shufflevector(__s1_291, __s1_291, 3, 2, 1, 0); \ float16x8_t __rev2_291; __rev2_291 = __builtin_shufflevector(__s2_291, __s2_291, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x2_t __ret_291; \ - __ret_291 = __noswap_vfmlsl_high_u32(__rev0_291, __rev1_291, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291)}); \ + __ret_291 = __noswap_vfmlsl_high_f16(__rev0_291, __rev1_291, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291), __noswap_vgetq_lane_f16(__rev2_291, __p3_291)}); \ __ret_291 = __builtin_shufflevector(__ret_291, __ret_291, 1, 0); \ __ret_291; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlslq_laneq_low_u32(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \ +#define vfmlslq_laneq_low_f16(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \ float32x4_t __s0_292 = __p0_292; \ float16x8_t __s1_292 = __p1_292; \ float16x8_t __s2_292 = __p2_292; \ float32x4_t __ret_292; \ - __ret_292 = vfmlslq_low_u32(__s0_292, __s1_292, (float16x8_t) {vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292)}); \ + __ret_292 = vfmlslq_low_f16(__s0_292, __s1_292, (float16x8_t) {vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292), vgetq_lane_f16(__s2_292, __p3_292)}); \ __ret_292; \ }) #else -#define vfmlslq_laneq_low_u32(__p0_293, __p1_293, __p2_293, __p3_293) __extension__ ({ \ +#define vfmlslq_laneq_low_f16(__p0_293, __p1_293, __p2_293, __p3_293) __extension__ ({ \ float32x4_t __s0_293 = __p0_293; \ float16x8_t __s1_293 = __p1_293; \ float16x8_t __s2_293 = __p2_293; \ @@ -72265,23 +72265,23 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x8_t __rev1_293; __rev1_293 = __builtin_shufflevector(__s1_293, __s1_293, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_293; __rev2_293 = __builtin_shufflevector(__s2_293, __s2_293, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x4_t __ret_293; \ - __ret_293 = __noswap_vfmlslq_low_u32(__rev0_293, __rev1_293, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293)}); \ + __ret_293 = __noswap_vfmlslq_low_f16(__rev0_293, __rev1_293, (float16x8_t) 
{__noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293), __noswap_vgetq_lane_f16(__rev2_293, __p3_293)}); \ __ret_293 = __builtin_shufflevector(__ret_293, __ret_293, 3, 2, 1, 0); \ __ret_293; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlsl_laneq_low_u32(__p0_294, __p1_294, __p2_294, __p3_294) __extension__ ({ \ +#define vfmlsl_laneq_low_f16(__p0_294, __p1_294, __p2_294, __p3_294) __extension__ ({ \ float32x2_t __s0_294 = __p0_294; \ float16x4_t __s1_294 = __p1_294; \ float16x8_t __s2_294 = __p2_294; \ float32x2_t __ret_294; \ - __ret_294 = vfmlsl_low_u32(__s0_294, __s1_294, (float16x4_t) {vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294)}); \ + __ret_294 = vfmlsl_low_f16(__s0_294, __s1_294, (float16x4_t) {vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294), vgetq_lane_f16(__s2_294, __p3_294)}); \ __ret_294; \ }) #else -#define vfmlsl_laneq_low_u32(__p0_295, __p1_295, __p2_295, __p3_295) __extension__ ({ \ +#define vfmlsl_laneq_low_f16(__p0_295, __p1_295, __p2_295, __p3_295) __extension__ ({ \ float32x2_t __s0_295 = __p0_295; \ float16x4_t __s1_295 = __p1_295; \ float16x8_t __s2_295 = __p2_295; \ @@ -72289,7 +72289,7 @@ int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(in float16x4_t __rev1_295; __rev1_295 = __builtin_shufflevector(__s1_295, __s1_295, 3, 2, 1, 0); \ float16x8_t __rev2_295; __rev2_295 = __builtin_shufflevector(__s2_295, __s2_295, 7, 6, 5, 4, 3, 2, 1, 0); \ float32x2_t __ret_295; \ - __ret_295 = __noswap_vfmlsl_low_u32(__rev0_295, __rev1_295, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295)}); \ + __ret_295 = __noswap_vfmlsl_low_f16(__rev0_295, __rev1_295, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295), __noswap_vgetq_lane_f16(__rev2_295, __p3_295)}); \ __ret_295 = __builtin_shufflevector(__ret_295, __ret_295, 1, 0); \ __ret_295; \ }) diff --git a/lib/include/armintr.h b/lib/include/armintr.h index 933afcbb9..300ed4ee4 100644 --- a/lib/include/armintr.h +++ b/lib/include/armintr.h @@ -1,22 +1,8 @@ /*===---- armintr.h - ARM Windows intrinsics -------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
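[The arm_neon.h hunks above rename the FP16 multiply-subtract-long lane macros from the misspelled `_u32` suffixes to `_f16`, so the suffix matches the float16 element type the intrinsics actually take; the big-endian variants reverse the input lanes, broadcast the selected half-precision lane into a full vector, call the `__noswap_` base intrinsic, and reverse the result back. A minimal usage sketch, assuming a compiler targeting armv8.2-a with the fp16fml extension; the wrapper function name is illustrative, not part of the header:

#include <arm_neon.h>

/* acc[i] -= (float)a[i] * (float)b[1], for i = 0,1: multiply-subtract-long
   from the low half of a, with the second operand broadcast from lane 1. */
float32x2_t fmlsl_low_by_lane1(float32x2_t acc, float16x4_t a, float16x4_t b) {
  return vfmlsl_lane_low_f16(acc, a, b, 1);
}
]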
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx2intrin.h b/lib/include/avx2intrin.h index 9688a96fd..162e83ea2 100644 --- a/lib/include/avx2intrin.h +++ b/lib/include/avx2intrin.h @@ -1,22 +1,8 @@ /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -146,21 +132,13 @@ _mm256_andnot_si256(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8(__m256i __a, __m256i __b) { - typedef unsigned short __v32hu __attribute__((__vector_size__(64))); - return (__m256i)__builtin_convertvector( - ((__builtin_convertvector((__v32qu)__a, __v32hu) + - __builtin_convertvector((__v32qu)__b, __v32hu)) + 1) - >> 1, __v32qu); + return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16(__m256i __a, __m256i __b) { - typedef unsigned int __v16su __attribute__((__vector_size__(64))); - return (__m256i)__builtin_convertvector( - ((__builtin_convertvector((__v16hu)__a, __v16su) + - __builtin_convertvector((__v16hu)__b, __v16su)) + 1) - >> 1, __v16hu); + return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 diff --git a/lib/include/avx512bf16intrin.h b/lib/include/avx512bf16intrin.h new file mode 100644 index 000000000..d1d87e72f --- /dev/null +++ b/lib/include/avx512bf16intrin.h @@ -0,0 +1,279 @@ +/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512BF16INTRIN_H +#define __AVX512BF16INTRIN_H + +typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64))); +typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32))); +typedef unsigned short __bfloat16; + +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \ + __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"))) + +/// Convert One BF16 Data to One Single Float Data. +/// +/// \headerfile +/// +/// This intrinsic does not correspond to a specific instruction. +/// +/// \param __A +/// A bfloat data. +/// \returns A float data whose sign field and exponent field keep unchanged, +/// and fraction field is extended to 23 bits. +static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) { + return __builtin_ia32_cvtsbf162ss_32(__A); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __B +/// A 512-bit vector of [16 x float]. +/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from +/// conversion of __B, and higher 256 bits come from conversion of __A. +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) { + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A, + (__v16sf) __B); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __B +/// A 512-bit vector of [16 x float]. 
+/// \param __W +/// A 512-bit vector of [32 x bfloat]. +/// \param __U +/// A 32-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element from __W. +/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from +/// conversion of __B, and higher 256 bits come from conversion of __A. +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) { + return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtne2ps_pbh(__A, __B), + (__v32hi)__W); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __B +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 32-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element is zero. +/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from +/// conversion of __B, and higher 256 bits come from conversion of __A. +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) { + return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtne2ps_pbh(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_cvtneps_pbh(__m512 __A) { + return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, + (__v16hi)_mm256_undefined_si256(), + (__mmask16)-1); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __W +/// A 256-bit vector of [16 x bfloat]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element from __W. +/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) { + return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, + (__v16hi)__W, + (__mmask16)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element is zero. +/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) { + return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. 
+/// +/// \param __A +/// A 512-bit vector of [32 x bfloat]. +/// \param __B +/// A 512-bit vector of [32 x bfloat]. +/// \param __D +/// A 512-bit vector of [16 x float]. +/// \returns A 512-bit vector of [16 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) { + return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D, + (__v16si) __A, + (__v16si) __B); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 512-bit vector of [32 x bfloat]. +/// \param __B +/// A 512-bit vector of [32 x bfloat]. +/// \param __D +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. +/// \returns A 512-bit vector of [16 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), + (__v16sf)__D); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 512-bit vector of [32 x bfloat]. +/// \param __B +/// A 512-bit vector of [32 x bfloat]. +/// \param __D +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. +/// \returns A 512-bit vector of [16 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), + (__v16sf)_mm512_setzero_si512()); +} + +/// Convert Packed BF16 Data to Packed float Data. +/// +/// \headerfile +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \returns A 512-bit vector of [16 x float] come from convertion of __A +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { + return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( + (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using zeroing mask. +/// +/// \headerfile +/// +/// \param __U +/// A 16-bit mask. Elements are zeroed out when the corresponding mask +/// bit is not set. +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \returns A 512-bit vector of [16 x float] come from convertion of __A +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { + return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( + (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using merging mask. +/// +/// \headerfile +/// +/// \param __S +/// A 512-bit vector of [16 x float]. Elements are copied from __S when +/// the corresponding mask bit is not set. +/// \param __U +/// A 16-bit mask. +/// \param __A +/// A 256-bit vector of [16 x bfloat]. 
+/// \returns A 512-bit vector of [16 x float] come from convertion of __A +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) { + return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32( + (__m512i)__S, (__mmask16)__U, + (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); +} + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS512 + +#endif diff --git a/lib/include/avx512bitalgintrin.h b/lib/include/avx512bitalgintrin.h index 56046f8c4..d4411d156 100644 --- a/lib/include/avx512bitalgintrin.h +++ b/lib/include/avx512bitalgintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512bwintrin.h b/lib/include/avx512bwintrin.h index a90a25537..cb2e07619 100644 --- a/lib/include/avx512bwintrin.h +++ b/lib/include/avx512bwintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
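[The new avx512bf16intrin.h, which ends above, leans on the fact that bfloat16 is simply the high 16 bits of an IEEE-754 binary32: `_mm512_cvtpbh_ps` widens each 16-bit element to 32 bits and shifts it left by 16, so the extension bits fall away, the low mantissa bits are zero, and the result is a valid float bit pattern. A scalar model of the same widening, as a hedged sketch; the helper name is illustrative:

#include <stdint.h>
#include <string.h>

/* Widen one bfloat16 bit pattern to float: append 16 zero bits.
   This widening is exact, since bf16 shares fp32's 8-bit exponent. */
static float bf16_to_float(uint16_t b) {
  uint32_t bits = (uint32_t)b << 16;
  float f;
  memcpy(&f, &bits, sizeof f); /* bit-cast without violating aliasing rules */
  return f;
}
]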
+ * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -719,11 +705,7 @@ _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_avg_epu8 (__m512i __A, __m512i __B) { - typedef unsigned short __v64hu __attribute__((__vector_size__(128))); - return (__m512i)__builtin_convertvector( - ((__builtin_convertvector((__v64qu) __A, __v64hu) + - __builtin_convertvector((__v64qu) __B, __v64hu)) + 1) - >> 1, __v64qu); + return (__m512i)__builtin_ia32_pavgb512((__v64qi)__A, (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -746,11 +728,7 @@ _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_avg_epu16 (__m512i __A, __m512i __B) { - typedef unsigned int __v32su __attribute__((__vector_size__(128))); - return (__m512i)__builtin_convertvector( - ((__builtin_convertvector((__v32hu) __A, __v32su) + - __builtin_convertvector((__v32hu) __B, __v32su)) + 1) - >> 1, __v32hu); + return (__m512i)__builtin_ia32_pavgw512((__v32hi)__A, (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -1733,14 +1711,14 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A) (__v64qi) _mm512_setzero_si512()); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 +static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, (__mmask64) __B); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 +static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_kunpackw (__mmask32 __A, __mmask32 __B) { return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, @@ -1751,7 +1729,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_loadu_epi16 (void const *__P) { struct __loadu_epi16 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi16*)__P)->__v; } @@ -1777,7 +1755,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_loadu_epi8 (void const *__P) { struct __loadu_epi8 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi8*)__P)->__v; } @@ -1803,7 +1781,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_epi16 (void *__P, __m512i __A) { struct __storeu_epi16 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi16*)__P)->__v = __A; } @@ -1820,7 +1798,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_epi8 (void *__P, __m512i __A) { struct __storeu_epi8 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi8*)__P)->__v = __A; } diff --git a/lib/include/avx512cdintrin.h b/lib/include/avx512cdintrin.h index e63902743..bfdba84aa 100644 --- a/lib/include/avx512cdintrin.h +++ b/lib/include/avx512cdintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, 
subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -34,49 +20,45 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_conflict_epi64 (__m512i __A) { - return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_conflict_epi64(__A), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_conflict_epi64(__A), + (__v8di)_mm512_setzero_si512 ()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_conflict_epi32 (__m512i __A) { - return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_conflict_epi32(__A), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_conflict_epi32(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/lib/include/avx512dqintrin.h b/lib/include/avx512dqintrin.h index 6e6c293af..337256c50 100644 --- a/lib/include/avx512dqintrin.h +++ b/lib/include/avx512dqintrin.h @@ -1,22 +1,8 @@ /*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512erintrin.h b/lib/include/avx512erintrin.h index 6348275c8..857006169 100644 --- a/lib/include/avx512erintrin.h +++ b/lib/include/avx512erintrin.h @@ -1,22 +1,8 @@ /*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512fintrin.h b/lib/include/avx512fintrin.h index 1c19993ff..132761f9e 100644 --- a/lib/include/avx512fintrin.h +++ b/lib/include/avx512fintrin.h @@ -1,22 +1,8 @@ /*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -40,9 +26,13 @@ typedef unsigned short __v32hu __attribute__((__vector_size__(64))); typedef unsigned long long __v8du __attribute__((__vector_size__(64))); typedef unsigned int __v16su __attribute__((__vector_size__(64))); -typedef float __m512 __attribute__((__vector_size__(64))); -typedef double __m512d __attribute__((__vector_size__(64))); -typedef long long __m512i __attribute__((__vector_size__(64))); +typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64))); +typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64))); +typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64))); + +typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1))); +typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1))); +typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1))); typedef unsigned char __mmask8; typedef unsigned short __mmask16; @@ -1991,12 +1981,12 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_add_round_pd(W, U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_add_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)); + (__v8df)(__m512d)(W)) #define _mm512_maskz_add_round_pd(U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_add_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()); + (__v8df)_mm512_setzero_pd()) #define _mm512_add_round_ps(A, B, R) \ (__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ @@ -2005,12 +1995,12 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_add_round_ps(W, U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 
(__v16sf)_mm512_add_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)); + (__v16sf)(__m512)(W)) #define _mm512_maskz_add_round_ps(U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()); + (__v16sf)_mm512_setzero_ps()) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2106,12 +2096,12 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_sub_round_pd(W, U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)); + (__v8df)(__m512d)(W)) #define _mm512_maskz_sub_round_pd(U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()); + (__v8df)_mm512_setzero_pd()) #define _mm512_sub_round_ps(A, B, R) \ (__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ @@ -2120,12 +2110,12 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_sub_round_ps(W, U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)); + (__v16sf)(__m512)(W)) #define _mm512_maskz_sub_round_ps(U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()); + (__v16sf)_mm512_setzero_ps()) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2221,12 +2211,12 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_mul_round_pd(W, U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)); + (__v8df)(__m512d)(W)) #define _mm512_maskz_mul_round_pd(U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()); + (__v8df)_mm512_setzero_pd()) #define _mm512_mul_round_ps(A, B, R) \ (__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ @@ -2235,12 +2225,12 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_mul_round_ps(W, U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)); + (__v16sf)(__m512)(W)) #define _mm512_maskz_mul_round_ps(U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()); + (__v16sf)_mm512_setzero_ps()) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2349,12 +2339,12 @@ _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_div_round_pd(W, U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_div_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)); + (__v8df)(__m512d)(W)) #define _mm512_maskz_div_round_pd(U, A, B, R) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_div_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()); + (__v8df)_mm512_setzero_pd()) #define _mm512_div_round_ps(A, B, R) \ (__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ @@ -2363,12 +2353,12 @@ _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { #define _mm512_mask_div_round_ps(W, U, A, B, R) \ 
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)); + (__v16sf)(__m512)(W)) #define _mm512_maskz_div_round_ps(U, A, B, R) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()); + (__v16sf)_mm512_setzero_ps()) #define _mm512_roundscale_ps(A, B) \ (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ @@ -3789,20 +3779,9 @@ _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) (__v16hi)_mm256_setzero_si256(), \ (__mmask16)(W)) -#define _mm512_cvtps_ph(A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)-1) - -#define _mm512_mask_cvtps_ph(U, W, A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)(__m256i)(U), \ - (__mmask16)(W)) - -#define _mm512_maskz_cvtps_ph(W, A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(W)) +#define _mm512_cvtps_ph _mm512_cvt_roundps_ph +#define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph +#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph #define _mm512_cvt_roundph_ps(A, R) \ (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ @@ -4324,7 +4303,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_loadu_si512 (void const *__P) { struct __loadu_si512 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_si512*)__P)->__v; } @@ -4333,7 +4312,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_loadu_epi32 (void const *__P) { struct __loadu_epi32 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi32*)__P)->__v; } @@ -4360,7 +4339,7 @@ static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_loadu_epi64 (void const *__P) { struct __loadu_epi64 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi64*)__P)->__v; } @@ -4420,7 +4399,7 @@ static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_loadu_pd(void const *__p) { struct __loadu_pd { - __m512d __v; + __m512d_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_pd*)__p)->__v; } @@ -4429,7 +4408,7 @@ static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_loadu_ps(void const *__p) { struct __loadu_ps { - __m512 __v; + __m512_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_ps*)__p)->__v; } @@ -4504,7 +4483,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_epi64 (void *__P, __m512i __A) { struct __storeu_epi64 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi64*)__P)->__v = __A; } @@ -4520,7 +4499,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_si512 (void *__P, __m512i __A) { struct __storeu_si512 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_si512*)__P)->__v = __A; } @@ -4529,7 +4508,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_epi32 (void *__P, __m512i __A) { struct __storeu_epi32 { - __m512i __v; + __m512i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi32*)__P)->__v = __A; } @@ -4551,7 +4530,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_pd(void *__P, __m512d __A) { struct __storeu_pd { - __m512d __v; + __m512d_u 
__v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_pd*)__P)->__v = __A; } @@ -4567,7 +4546,7 @@ static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_ps(void *__P, __m512 __A) { struct __storeu_ps { - __m512 __v; + __m512_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_ps*)__P)->__v = __A; } @@ -9329,7 +9308,7 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) __v2du __t6 = __t4 op __t5; \ __v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ __v2du __t8 = __t6 op __t7; \ - return __t8[0]; + return __t8[0] static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) { _mm512_mask_reduce_operator(+); @@ -9381,7 +9360,7 @@ _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { __m128d __t6 = __t4 op __t5; \ __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ __m128d __t8 = __t6 op __t7; \ - return __t8[0]; + return __t8[0] static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) { _mm512_mask_reduce_operator(+); @@ -9415,7 +9394,7 @@ _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { __v4su __t8 = __t6 op __t7; \ __v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ __v4su __t10 = __t8 op __t9; \ - return __t10[0]; + return __t10[0] static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi32(__m512i __W) { @@ -9473,7 +9452,7 @@ _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { __m128 __t8 = __t6 op __t7; \ __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ __m128 __t10 = __t8 op __t9; \ - return __t10[0]; + return __t10[0] static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_add_ps(__m512 __W) { @@ -9505,7 +9484,7 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { __m512i __t4 = _mm512_##op(__t2, __t3); \ __m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \ __v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \ - return __t6[0]; + return __t6[0] static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epi64(__m512i __V) { @@ -9563,7 +9542,7 @@ _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { __m128i __t8 = _mm_##op(__t6, __t7); \ __m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \ __v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \ - return __t10[0]; + return __t10[0] static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epi32(__m512i __V) { @@ -9619,7 +9598,7 @@ _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { __m128d __t6 = _mm_##op(__t4, __t5); \ __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ __m128d __t8 = _mm_##op(__t6, __t7); \ - return __t8[0]; + return __t8[0] static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_max_pd(__m512d __V) { @@ -9655,7 +9634,7 @@ _mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { __m128 __t8 = _mm_##op(__t6, __t7); \ __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ __m128 __t10 = _mm_##op(__t8, __t9); \ - return __t10[0]; + return __t10[0] static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_max_ps(__m512 __V) { diff --git a/lib/include/avx512ifmaintrin.h b/lib/include/avx512ifmaintrin.h index 159713049..5f7da52f1 100644 --- a/lib/include/avx512ifmaintrin.h +++ b/lib/include/avx512ifmaintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation 
files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512ifmavlintrin.h b/lib/include/avx512ifmavlintrin.h index afdea888c..5889401d1 100644 --- a/lib/include/avx512ifmavlintrin.h +++ b/lib/include/avx512ifmavlintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512pfintrin.h b/lib/include/avx512pfintrin.h index 73b2234fb..b8bcf49c6 100644 --- a/lib/include/avx512pfintrin.h +++ b/lib/include/avx512pfintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512pfintrin.h - PF intrinsics ------------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vbmi2intrin.h b/lib/include/avx512vbmi2intrin.h index 532425242..a23144616 100644 --- a/lib/include/avx512vbmi2intrin.h +++ b/lib/include/avx512vbmi2intrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vbmiintrin.h b/lib/include/avx512vbmiintrin.h index 5463d9015..c0e0f94d4 100644 --- a/lib/include/avx512vbmiintrin.h +++ b/lib/include/avx512vbmiintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vbmivlintrin.h b/lib/include/avx512vbmivlintrin.h index b5d5aa9af..c5b96ae8a 100644 --- a/lib/include/avx512vbmivlintrin.h +++ b/lib/include/avx512vbmivlintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vlbf16intrin.h b/lib/include/avx512vlbf16intrin.h new file mode 100644 index 000000000..1b1a744bc --- /dev/null +++ b/lib/include/avx512vlbf16intrin.h @@ -0,0 +1,474 @@ +/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VLBF16INTRIN_H +#define __AVX512VLBF16INTRIN_H + +typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16))); + +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl, avx512bf16"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl, avx512bf16"), __min_vector_width__(256))) + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __B +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __B, and higher 64 bits come from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) { + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A, + (__v4sf) __B); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __B +/// A 128-bit vector of [4 x float]. +/// \param __W +/// A 128-bit vector of [8 x bfloat]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element from __W. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __B, and higher 64 bits come from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtne2ps_pbh(__A, __B), + (__v8hi)__W); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __B +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element is zero. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __B, and higher 64 bits come from conversion of __A. 
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtne2ps_pbh(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __B +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from +/// conversion of __B, and higher 128 bits come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) { + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A, + (__v8sf) __B); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __B +/// A 256-bit vector of [8 x float]. +/// \param __W +/// A 256-bit vector of [16 x bfloat]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element from __W. +/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from +/// conversion of __B, and higher 128 bits come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) { + return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), + (__v16hi)__W); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __B +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element is zero. +/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from +/// conversion of __B, and higher 128 bits come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) { + return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __A, and higher 64 bits are 0. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_cvtneps_pbh(__m128 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, + (__v8hi)_mm_undefined_si128(), + (__mmask8)-1); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __W +/// A 128-bit vector of [8 x bfloat]. +/// \param __U +/// A 4-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element from __W. 
+/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __A, and higher 64 bits are 0. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, + (__v8hi)__W, + (__mmask8)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 4-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element is zero. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __A, and higher 64 bits are 0. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, + (__v8hi)_mm_setzero_si128(), + (__mmask8)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_cvtneps_pbh(__m256 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, + (__v8hi)_mm_undefined_si128(), + (__mmask8)-1); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __W +/// A 256-bit vector of [8 x bfloat]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element from __W. +/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, + (__v8hi)__W, + (__mmask8)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element is zero. +/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, + (__v8hi)_mm_setzero_si128(), + (__mmask8)__U); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \param __B +/// A 128-bit vector of [8 x bfloat]. +/// \param __D +/// A 128-bit vector of [4 x float]. 
+/// \returns A 128-bit vector of [4 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) { + return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D, + (__v4si)__A, + (__v4si)__B); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \param __B +/// A 128-bit vector of [8 x bfloat]. +/// \param __D +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. +/// \returns A 128-bit vector of [4 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_dpbf16_ps(__D, __A, __B), + (__v4sf)__D); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \param __B +/// A 128-bit vector of [8 x bfloat]. +/// \param __D +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. +/// \returns A 128-bit vector of [4 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_dpbf16_ps(__D, __A, __B), + (__v4sf)_mm_setzero_si128()); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \param __B +/// A 256-bit vector of [16 x bfloat]. +/// \param __D +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) { + return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D, + (__v8si)__A, + (__v8si)__B); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \param __B +/// A 256-bit vector of [16 x bfloat]. +/// \param __D +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. +/// \returns A 256-bit vector of [8 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), + (__v8sf)__D); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \param __B +/// A 256-bit vector of [16 x bfloat]. +/// \param __D +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. +/// \returns A 256-bit vector of [8 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), + (__v8sf)_mm256_setzero_si256()); +} + +/// Convert One Single float Data to One BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A float data. +/// \returns A bf16 data whose sign field and exponent field keep unchanged, +/// and fraction field is truncated to 7 bits. +static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) { + __v4sf __V = {__A, 0, 0, 0}; + __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask( + (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); + return __R[0]; +} + +/// Convert Packed BF16 Data to Packed float Data. +/// +/// \headerfile +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \returns A 256-bit vector of [8 x float] come from convertion of __A +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { + return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( + (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using zeroing mask. +/// +/// \headerfile +/// +/// \param __U +/// A 8-bit mask. Elements are zeroed out when the corresponding mask +/// bit is not set. +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \returns A 256-bit vector of [8 x float] come from convertion of __A +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { + return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( + (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using merging mask. +/// +/// \headerfile +/// +/// \param __S +/// A 256-bit vector of [8 x float]. Elements are copied from __S when +/// the corresponding mask bit is not set. +/// \param __U +/// A 8-bit mask. Elements are zeroed out when the corresponding mask +/// bit is not set. +/// \param __A +/// A 128-bit vector of [8 x bfloat]. 
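/* Two portable reference points for the helpers above, with invented names.
   First: a bfloat16 is the upper half of an IEEE-754 single, so widening it
   back to float (what _mm256_cvtpbh_ps does with its 16-bit left shift) is
   exact and lossless. */

#include <stdint.h>
#include <string.h>

static float bf16_to_float(uint16_t h) {
  uint32_t bits = (uint32_t)h << 16; /* exact widening, no rounding */
  float f;
  memcpy(&f, &bits, sizeof f);
  return f;
}

/* Second: a scalar model of the documented VDPBF16PS behavior for one
   128-bit vector -- float lane i accumulates the products of the bf16 pair
   sitting in 32-bit lane i of each source. A sketch only; the hardware's
   intermediate precision and denormal handling may differ. */
static void dpbf16_ps_ref(float d[4], const uint16_t a[8],
                          const uint16_t b[8]) {
  for (int i = 0; i < 4; ++i)
    d[i] += bf16_to_float(a[2 * i])     * bf16_to_float(b[2 * i]) +
            bf16_to_float(a[2 * i + 1]) * bf16_to_float(b[2 * i + 1]);
}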
+/// \returns A 256-bit vector of [8 x float] come from convertion of __A +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) { + return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32( + (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), + 16)); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/lib/include/avx512vlbitalgintrin.h b/lib/include/avx512vlbitalgintrin.h index 64860b292..5154eae14 100644 --- a/lib/include/avx512vlbitalgintrin.h +++ b/lib/include/avx512vlbitalgintrin.h @@ -1,23 +1,9 @@ /*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vlbwintrin.h b/lib/include/avx512vlbwintrin.h index 87e0023e8..ead09466b 100644 --- a/lib/include/avx512vlbwintrin.h +++ b/lib/include/avx512vlbwintrin.h @@ -1,22 +1,8 @@ /*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -2301,7 +2287,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128 _mm_loadu_epi16 (void const *__P) { struct __loadu_epi16 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi16*)__P)->__v; } @@ -2327,7 +2313,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256 _mm256_loadu_epi16 (void const *__P) { struct __loadu_epi16 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi16*)__P)->__v; } @@ -2353,7 +2339,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128 _mm_loadu_epi8 (void const *__P) { struct __loadu_epi8 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi8*)__P)->__v; } @@ -2379,7 +2365,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256 _mm256_loadu_epi8 (void const *__P) { struct __loadu_epi8 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi8*)__P)->__v; } @@ -2405,7 +2391,7 @@ static __inline void __DEFAULT_FN_ATTRS128 _mm_storeu_epi16 (void *__P, __m128i __A) { struct __storeu_epi16 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi16*)__P)->__v = __A; } @@ -2422,7 +2408,7 @@ static __inline void __DEFAULT_FN_ATTRS256 _mm256_storeu_epi16 (void *__P, __m256i __A) { struct __storeu_epi16 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi16*)__P)->__v = __A; } @@ -2439,7 +2425,7 @@ static __inline void __DEFAULT_FN_ATTRS128 _mm_storeu_epi8 (void *__P, __m128i __A) { struct __storeu_epi8 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi8*)__P)->__v = __A; } @@ -2456,7 +2442,7 @@ static __inline void __DEFAULT_FN_ATTRS256 _mm256_storeu_epi8 (void *__P, __m256i __A) { struct __storeu_epi8 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi8*)__P)->__v = __A; } diff --git a/lib/include/avx512vlcdintrin.h b/lib/include/avx512vlcdintrin.h index 903a7c254..cc8b72528 100644 --- a/lib/include/avx512vlcdintrin.h +++ b/lib/include/avx512vlcdintrin.h @@ -1,22 +1,8 @@ /*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
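The __loadu_epi16/__loadu_epi8 hunks above switch the wrapper struct's member
from the 16-byte-aligned __m128i to its 1-byte-aligned counterpart __m128i_u,
so the member type itself carries the unaligned contract rather than relying
on the packed attribute alone. The idiom in isolation, as a sketch with
invented names (any compiler with GNU vector extensions):

#include <string.h>

typedef long long v2di   __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long v2di_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Same shape as the hunks above: a packed, may_alias struct holding the
   1-byte-aligned vector type makes the unaligned load well-defined. */
static v2di loadu_v2di(const void *p) {
  struct wrapper {
    v2di_u v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct wrapper *)p)->v;
}

/* Semantic equivalent: copy into a local and return it. */
static v2di loadu_v2di_memcpy(const void *p) {
  v2di r;
  memcpy(&r, p, sizeof r);
  return r;
}

Either form lets the compiler emit a plain unaligned vector load with no
alignment or aliasing undefined behavior from the pointer cast.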
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -60,99 +46,89 @@ _mm256_broadcastmw_epi32 (__mmask16 __A) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_conflict_epi64 (__m128i __A) { - return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, - (__v2di) _mm_undefined_si128 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_conflict_epi64(__A), + (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_conflict_epi64(__A), + (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_conflict_epi64 (__m256i __A) { - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, - (__v4di) _mm256_undefined_si256 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_conflict_epi64(__A), + (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, - (__v4di) _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_conflict_epi64(__A), + (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_conflict_epi32 (__m128i __A) { - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, - (__v4si) _mm_undefined_si128 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_conflict_epi32(__A), + (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, - (__v4si) _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_conflict_epi32(__A), + (__v4si)_mm_setzero_si128()); } static 
__inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_conflict_epi32 (__m256i __A) { - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, - (__v8si) _mm256_undefined_si256 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_conflict_epi32(__A), + (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_conflict_epi32(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 diff --git a/lib/include/avx512vldqintrin.h b/lib/include/avx512vldqintrin.h index 9d13846e8..95ba574ea 100644 --- a/lib/include/avx512vldqintrin.h +++ b/lib/include/avx512vldqintrin.h @@ -1,22 +1,8 @@ /*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
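The conflict rewrites above follow a pattern used throughout this update:
implement only the unmasked builtin, then derive the mask/maskz forms with a
generic per-element select. For background, a scalar model of what
VPCONFLICTD itself computes -- an assumption drawn from the instruction's
published semantics, not something spelled out in this patch:

#include <stdint.h>

/* Each element receives a bitmask of the lower-indexed elements that hold
   the same value. */
static void conflict_epi32_ref(uint32_t dst[4], const uint32_t a[4]) {
  for (int i = 0; i < 4; ++i) {
    uint32_t m = 0;
    for (int j = 0; j < i; ++j)
      if (a[j] == a[i])
        m |= 1u << j;
    dst[i] = m;
  }
}

/* The masked forms then reduce to:
     mask:  dst[i] = bit i of U ? result[i] : w[i]
     maskz: dst[i] = bit i of U ? result[i] : 0     */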
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -523,23 +509,21 @@ _mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) { static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_cvtepi64_ps (__m256i __A) { - return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) -1); + return (__m128)__builtin_convertvector((__v4di)__A, __v4sf); } static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) { - return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepi64_ps(__A), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) { - return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepi64_ps(__A), + (__v4sf)_mm_setzero_ps()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -771,23 +755,21 @@ _mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) { static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_cvtepu64_ps (__m256i __A) { - return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) -1); + return (__m128)__builtin_convertvector((__v4du)__A, __v4sf); } static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) { - return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepu64_ps(__A), + (__v4sf)__W); } static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) { - return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepu64_ps(__A), + (__v4sf)_mm_setzero_ps()); } #define _mm_range_pd(A, B, C) \ diff --git a/lib/include/avx512vlintrin.h b/lib/include/avx512vlintrin.h index a2cdc0a96..9494fc8a6 100644 --- a/lib/include/avx512vlintrin.h +++ b/lib/include/avx512vlintrin.h @@ -1,22 +1,8 @@ /*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
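_mm256_cvtepi64_ps and _mm256_cvtepu64_ps above drop their dedicated mask
builtins in favor of Clang's generic __builtin_convertvector (also available
in GCC 9+), which converts element-wise between two vector types with the
same number of elements. The builtin in isolation, with invented typedef
names:

typedef long long          v4di __attribute__((__vector_size__(32)));
typedef unsigned long long v4du __attribute__((__vector_size__(32)));
typedef float              v4sf __attribute__((__vector_size__(16)));

static v4sf cvt_i64_to_f32(v4di x) {
  /* Element-wise int64 -> float; the compiler picks the instructions. */
  return __builtin_convertvector(x, v4sf);
}

static v4sf cvt_u64_to_f32(v4du x) {
  /* Signedness comes from the source vector type, which is why the
     cvtepu64_ps hunk above casts through __v4du first. */
  return __builtin_convertvector(x, v4sf);
}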
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -5513,7 +5499,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128 _mm_loadu_epi64 (void const *__P) { struct __loadu_epi64 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi64*)__P)->__v; } @@ -5539,7 +5525,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256 _mm256_loadu_epi64 (void const *__P) { struct __loadu_epi64 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi64*)__P)->__v; } @@ -5565,7 +5551,7 @@ static __inline __m128i __DEFAULT_FN_ATTRS128 _mm_loadu_epi32 (void const *__P) { struct __loadu_epi32 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi32*)__P)->__v; } @@ -5591,7 +5577,7 @@ static __inline __m256i __DEFAULT_FN_ATTRS256 _mm256_loadu_epi32 (void const *__P) { struct __loadu_epi32 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_epi32*)__P)->__v; } @@ -5717,7 +5703,7 @@ static __inline void __DEFAULT_FN_ATTRS128 _mm_storeu_epi64 (void *__P, __m128i __A) { struct __storeu_epi64 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi64*)__P)->__v = __A; } @@ -5734,7 +5720,7 @@ static __inline void __DEFAULT_FN_ATTRS256 _mm256_storeu_epi64 (void *__P, __m256i __A) { struct __storeu_epi64 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi64*)__P)->__v = __A; } @@ -5751,7 +5737,7 @@ static __inline void __DEFAULT_FN_ATTRS128 _mm_storeu_epi32 (void *__P, __m128i __A) { struct __storeu_epi32 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi32*)__P)->__v = __A; } @@ -5768,7 +5754,7 @@ static __inline void __DEFAULT_FN_ATTRS256 _mm256_storeu_epi32 (void *__P, __m256i __A) { struct __storeu_epi32 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_epi32*)__P)->__v = __A; } @@ -7000,7 +6986,7 @@ _mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsepi32_epi8 (__m256i __A) { return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, @@ -7023,7 +7009,7 @@ _mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS128 +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); @@ -7581,7 +7567,7 @@ _mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS256 +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { 
__builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); @@ -8425,22 +8411,6 @@ _mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A) -{ - return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION, - (__v8hi) __W, - (__mmask8) __U); -} - -static __inline __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A) -{ - return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __U); -} - #define _mm_mask_cvt_roundps_ph(W, U, A, I) \ (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ (__v8hi)(__m128i)(W), \ @@ -8451,21 +8421,9 @@ _mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A) (__v8hi)_mm_setzero_si128(), \ (__mmask8)(U)) -static __inline __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A) -{ - return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION, - (__v8hi) __W, - (__mmask8) __U); -} +#define _mm_mask_cvtps_ph _mm_mask_cvt_roundps_ph +#define _mm_maskz_cvtps_ph _mm_maskz_cvt_roundps_ph -static __inline __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtps_ph ( __mmask8 __U, __m256 __A) -{ - return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION, - (__v8hi) _mm_setzero_si128(), - (__mmask8) __U); -} #define _mm256_mask_cvt_roundps_ph(W, U, A, I) \ (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ (__v8hi)(__m128i)(W), \ @@ -8476,6 +8434,9 @@ _mm256_maskz_cvtps_ph ( __mmask8 __U, __m256 __A) (__v8hi)_mm_setzero_si128(), \ (__mmask8)(U)) +#define _mm256_mask_cvtps_ph _mm256_mask_cvt_roundps_ph +#define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph + #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/lib/include/avx512vlvbmi2intrin.h b/lib/include/avx512vlvbmi2intrin.h index 632d14fb5..a40f926de 100644 --- a/lib/include/avx512vlvbmi2intrin.h +++ b/lib/include/avx512vlvbmi2intrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
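The deleted _mm_mask_cvtps_ph/_mm256_mask_cvtps_ph inline bodies hard-coded
_MM_FROUND_CUR_DIRECTION; aliasing them to the cvt_roundps_ph macros instead
means the masked forms now take the same rounding-control immediate as the
unmasked F16C _mm_cvtps_ph. A hedged usage sketch, assuming the
four-argument form shown by the macros above and an AVX512VL-enabled build:

#include <immintrin.h>

__attribute__((__target__("avx512vl")))
static __m128i f32x4_to_f16x4_masked(__m128i w, __mmask8 u, __m128 a) {
  /* The rounding immediate is now explicit; _MM_FROUND_CUR_DIRECTION
     reproduces the behavior of the removed inline functions. */
  return _mm_mask_cvtps_ph(w, u, a, _MM_FROUND_CUR_DIRECTION);
}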
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vlvnniintrin.h b/lib/include/avx512vlvnniintrin.h index 62382268e..b7c8fa08c 100644 --- a/lib/include/avx512vlvnniintrin.h +++ b/lib/include/avx512vlvnniintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vlvp2intersectintrin.h b/lib/include/avx512vlvp2intersectintrin.h new file mode 100644 index 000000000..3e0815e5d --- /dev/null +++ b/lib/include/avx512vlvp2intersectintrin.h @@ -0,0 +1,121 @@ +/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
+#endif + +#ifndef _AVX512VLVP2INTERSECT_H +#define _AVX512VLVP2INTERSECT_H + +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \ + __min_vector_width__(128))) + +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \ + __min_vector_width__(256))) +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between dwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \param __b +/// A 256-bit vector of [8 x i32] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between quadwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTQ instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x i64]. +/// \param __b +/// A 256-bit vector of [4 x i64] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between dwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. +/// \param __b +/// A 128-bit vector of [4 x i32] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between quadwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTQ instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x i64]. 
+/// \param __b +/// A 128-bit vector of [2 x i64] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/lib/include/avx512vnniintrin.h b/lib/include/avx512vnniintrin.h index 620ef5a78..9935a119a 100644 --- a/lib/include/avx512vnniintrin.h +++ b/lib/include/avx512vnniintrin.h @@ -1,23 +1,9 @@ /*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vp2intersectintrin.h b/lib/include/avx512vp2intersectintrin.h new file mode 100644 index 000000000..5d3cb48cf --- /dev/null +++ b/lib/include/avx512vp2intersectintrin.h @@ -0,0 +1,77 @@ +/*===------- avx512vpintersectintrin.h - VP2INTERSECT intrinsics ------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
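The VP2INTERSECT intrinsics above return their result through two mask
pointers. A scalar model of the documented "value matches" behavior, stated
as an assumption (it reflects the instruction set's published semantics, not
anything spelled out in this header): bit i of *m0 marks a[i] matching some
element of b, and bit j of *m1 marks b[j] matching some element of a.

#include <stdint.h>

static void two_intersect_epi32_ref(const uint32_t a[4], const uint32_t b[4],
                                    uint8_t *m0, uint8_t *m1) {
  *m0 = *m1 = 0;
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j)
      if (a[i] == b[j]) {
        *m0 |= (uint8_t)(1u << i);
        *m1 |= (uint8_t)(1u << j);
      }
}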
+ * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512VP2INTERSECT_H +#define _AVX512VP2INTERSECT_H + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vp2intersect"), \ + __min_vector_width__(512))) + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between dwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTD instruction. +/// +/// \param __a +/// A 512-bit vector of [16 x i32]. +/// \param __b +/// A 512-bit vector of [16 x i32] +/// \param __m0 +/// A pointer point to 16-bit mask +/// \param __m1 +/// A pointer point to 16-bit mask +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_2intersect_epi32(__m512i __a, __m512i __b, __mmask16 *__m0, __mmask16 *__m1) { + __builtin_ia32_vp2intersect_d_512((__v16si)__a, (__v16si)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between quadwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTQ instruction. +/// +/// \param __a +/// A 512-bit vector of [8 x i64]. +/// \param __b +/// A 512-bit vector of [8 x i64] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_2intersect_epi64(__m512i __a, __m512i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_q_512((__v8di)__a, (__v8di)__b, __m0, __m1); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/lib/include/avx512vpopcntdqintrin.h b/lib/include/avx512vpopcntdqintrin.h index c99f59456..bb435e623 100644 --- a/lib/include/avx512vpopcntdqintrin.h +++ b/lib/include/avx512vpopcntdqintrin.h @@ -1,23 +1,9 @@ /*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
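A hedged usage sketch for the 512-bit form above -- it counts how many
dwords of a occur anywhere in b, and assumes a toolchain and CPU with the
avx512vp2intersect feature; only intrinsics declared in these headers are
used:

#include <immintrin.h>
#include <stdint.h>

__attribute__((__target__("avx512vp2intersect,avx512f")))
static int count_matches(const int32_t a[16], const int32_t b[16]) {
  __mmask16 m0, m1;
  _mm512_2intersect_epi32(_mm512_loadu_si512(a), _mm512_loadu_si512(b),
                          &m0, &m1);
  /* m0 has one bit per element of a that matched somewhere in b. */
  return __builtin_popcount((unsigned)m0);
}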
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avx512vpopcntdqvlintrin.h b/lib/include/avx512vpopcntdqvlintrin.h index 681a75fa0..a3cb9b6bc 100644 --- a/lib/include/avx512vpopcntdqvlintrin.h +++ b/lib/include/avx512vpopcntdqvlintrin.h @@ -1,23 +1,9 @@ /*===---- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics -------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/avxintrin.h b/lib/include/avxintrin.h index cb15396b3..a01240b9d 100644 --- a/lib/include/avxintrin.h +++ b/lib/include/avxintrin.h @@ -1,22 +1,8 @@ /*===---- avxintrin.h - AVX intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -45,9 +31,13 @@ typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); * appear in the interface though. */ typedef signed char __v32qs __attribute__((__vector_size__(32))); -typedef float __m256 __attribute__ ((__vector_size__ (32))); -typedef double __m256d __attribute__((__vector_size__(32))); -typedef long long __m256i __attribute__((__vector_size__(32))); +typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32))); +typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32))); +typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32))); + +typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1))); +typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1))); +typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1))); /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256))) @@ -3113,7 +3103,7 @@ static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p) { struct __loadu_pd { - __m256d __v; + __m256d_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_pd*)__p)->__v; } @@ -3133,7 +3123,7 @@ static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p) { struct __loadu_ps { - __m256 __v; + __m256_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_ps*)__p)->__v; } @@ -3166,10 +3156,10 @@ _mm256_load_si256(__m256i const *__p) /// A pointer to a 256-bit integer vector containing integer values. /// \returns A 256-bit integer vector containing the moved values. static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_loadu_si256(__m256i const *__p) +_mm256_loadu_si256(__m256i_u const *__p) { struct __loadu_si256 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_si256*)__p)->__v; } @@ -3246,7 +3236,7 @@ static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a) { struct __storeu_pd { - __m256d __v; + __m256d_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_pd*)__p)->__v = __a; } @@ -3266,7 +3256,7 @@ static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a) { struct __storeu_ps { - __m256 __v; + __m256_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_ps*)__p)->__v = __a; } @@ -3301,10 +3291,10 @@ _mm256_store_si256(__m256i *__p, __m256i __a) /// \param __a /// A 256-bit integer vector containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS -_mm256_storeu_si256(__m256i *__p, __m256i __a) +_mm256_storeu_si256(__m256i_u *__p, __m256i __a) { struct __storeu_si256 { - __m256i __v; + __m256i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_si256*)__p)->__v = __a; } @@ -4834,7 +4824,7 @@ _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) /// address of the memory location does not have to be aligned. /// \returns A 256-bit integer vector containing the concatenated result. 
static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) +_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo) { __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); @@ -4918,7 +4908,7 @@ _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) /// \param __a /// A 256-bit integer vector. static __inline void __DEFAULT_FN_ATTRS -_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) +_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a) { __m128i __v128; diff --git a/lib/include/bmi2intrin.h b/lib/include/bmi2intrin.h index fdae82cf2..0b56aed5f 100644 --- a/lib/include/bmi2intrin.h +++ b/lib/include/bmi2intrin.h @@ -1,22 +1,8 @@ /*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/bmiintrin.h b/lib/include/bmiintrin.h index 56c20b78d..b7af62f60 100644 --- a/lib/include/bmiintrin.h +++ b/lib/include/bmiintrin.h @@ -1,22 +1,8 @@ /*===---- bmiintrin.h - BMI intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
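The avxintrin.h hunks above introduce the split that the rest of this update
builds on: each 256-bit vector type now has a 32-byte-aligned flavor and a
1-byte-aligned _u flavor, and the unaligned entry points
(_mm256_loadu_si256, _mm256_storeu_si256, _mm256_loadu2_m128i, ...) take the
_u pointer, so handing them an arbitrary byte address no longer manufactures
a misaligned __m256i *. The effect of the attribute, with stand-in typedef
names:

typedef long long m256i_like   __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long m256i_u_like __attribute__((__vector_size__(32), __aligned__(1)));

/* Mirrors the typedefs introduced above: the _u flavor may legally point
   anywhere, which is what lets the loadu/storeu functions accept an
   arbitrary byte address without undefined behavior. */
_Static_assert(_Alignof(m256i_like) == 32, "aligned flavor");
_Static_assert(_Alignof(m256i_u_like) == 1, "unaligned flavor");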
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/cetintrin.h b/lib/include/cetintrin.h index 120c95424..4290e9d73 100644 --- a/lib/include/cetintrin.h +++ b/lib/include/cetintrin.h @@ -1,22 +1,8 @@ /*===---- cetintrin.h - CET intrinsic --------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/cldemoteintrin.h b/lib/include/cldemoteintrin.h index fa78148eb..2413e7dea 100644 --- a/lib/include/cldemoteintrin.h +++ b/lib/include/cldemoteintrin.h @@ -1,22 +1,8 @@ /*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/clflushoptintrin.h b/lib/include/clflushoptintrin.h index 79bb4589f..060eb36f3 100644 --- a/lib/include/clflushoptintrin.h +++ b/lib/include/clflushoptintrin.h @@ -1,22 +1,8 @@ /*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/clwbintrin.h b/lib/include/clwbintrin.h index c09286ba6..3360d203f 100644 --- a/lib/include/clwbintrin.h +++ b/lib/include/clwbintrin.h @@ -1,22 +1,8 @@ /*===---- clwbintrin.h - CLWB intrinsic ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/clzerointrin.h b/lib/include/clzerointrin.h index f4e920839..a180984a3 100644 --- a/lib/include/clzerointrin.h +++ b/lib/include/clzerointrin.h @@ -1,22 +1,8 @@ /*===----------------------- clzerointrin.h - CLZERO ----------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/cpuid.h b/lib/include/cpuid.h index fce6af52d..02ffac26c 100644 --- a/lib/include/cpuid.h +++ b/lib/include/cpuid.h @@ -1,22 +1,8 @@ /*===---- cpuid.h - X86 cpu model detection --------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -191,6 +177,7 @@ #define bit_CLDEMOTE 0x02000000 #define bit_MOVDIRI 0x08000000 #define bit_MOVDIR64B 0x10000000 +#define bit_ENQCMD 0x20000000 /* Features in %edx for leaf 7 sub-leaf 0 */ #define bit_AVX5124VNNIW 0x00000004 @@ -198,6 +185,9 @@ #define bit_PCONFIG 0x00040000 #define bit_IBT 0x00100000 +/* Features in %eax for leaf 7 sub-leaf 1 */ +#define bit_AVX512BF16 0x00000020 + /* Features in %eax for leaf 13 sub-leaf 1 */ #define bit_XSAVEOPT 0x00000001 #define bit_XSAVEC 0x00000002 diff --git a/lib/include/emmintrin.h b/lib/include/emmintrin.h index 6d61f9719..3d55f5f27 100644 --- a/lib/include/emmintrin.h +++ b/lib/include/emmintrin.h @@ -1,22 +1,8 @@ /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -26,8 +12,11 @@ #include <xmmintrin.h> -typedef double __m128d __attribute__((__vector_size__(16))); -typedef long long __m128i __attribute__((__vector_size__(16))); +typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); + +typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1))); +typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1))); /* Type defines.
*/ typedef double __v2df __attribute__ ((__vector_size__ (16))); @@ -1652,7 +1641,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) { struct __loadu_pd { - __m128d __v; + __m128d_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_pd*)__dp)->__v; } @@ -2042,7 +2031,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a) { struct __storeu_pd { - __m128d __v; + __m128d_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_pd*)__dp)->__v = __a; } @@ -2316,11 +2305,7 @@ _mm_adds_epu16(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b) { - typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); - return (__m128i)__builtin_convertvector( - ((__builtin_convertvector((__v16qu)__a, __v16hu) + - __builtin_convertvector((__v16qu)__b, __v16hu)) + 1) - >> 1, __v16qu); + return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); } /// Computes the rounded avarages of corresponding elements of two @@ -2340,11 +2325,7 @@ _mm_avg_epu8(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b) { - typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); - return (__m128i)__builtin_convertvector( - ((__builtin_convertvector((__v8hu)__a, __v8su) + - __builtin_convertvector((__v8hu)__b, __v8su)) + 1) - >> 1, __v8hu); + return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); } /// Multiplies the corresponding elements of two 128-bit signed [8 x i16] @@ -3564,10 +3545,10 @@ _mm_load_si128(__m128i const *__p) /// A pointer to a memory location containing integer values. /// \returns A 128-bit integer vector containing the moved values. static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_loadu_si128(__m128i const *__p) +_mm_loadu_si128(__m128i_u const *__p) { struct __loadu_si128 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_si128*)__p)->__v; } @@ -3585,7 +3566,7 @@ _mm_loadu_si128(__m128i const *__p) /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the /// moved value. The higher order bits are cleared. static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_loadl_epi64(__m128i const *__p) +_mm_loadl_epi64(__m128i_u const *__p) { struct __mm_loadl_epi64_struct { long long __u; @@ -4027,10 +4008,10 @@ _mm_store_si128(__m128i *__p, __m128i __b) /// \param __b /// A 128-bit integer vector containing the values to be moved. static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeu_si128(__m128i *__p, __m128i __b) +_mm_storeu_si128(__m128i_u *__p, __m128i __b) { struct __storeu_si128 { - __m128i __v; + __m128i_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_si128*)__p)->__v = __b; } @@ -4139,7 +4120,7 @@ _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the /// value to be stored. 
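The _mm_avg_epu8/_mm_avg_epu16 hunks above drop the widen-add-shift emulation in favor of the pavgb/pavgw builtins; both forms compute the rounded average (a + b + 1) >> 1 in widened arithmetic, so the intermediate sum cannot wrap. A small self-contained check of that semantics (assumes SSE2, which is on by default for x86-64):

    #include <emmintrin.h>
    #include <string.h>
    #include <assert.h>

    int main(void)
    {
        /* avg(254, 255) = (254 + 255 + 1) >> 1 = 255; a plain 8-bit
         * addition would have wrapped before the shift. */
        __m128i r = _mm_avg_epu8(_mm_set1_epi8((char)254),
                                 _mm_set1_epi8((char)255));
        unsigned char out[16];
        memcpy(out, &r, sizeof out);
        assert(out[0] == 255);
        return 0;
    }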
static __inline__ void __DEFAULT_FN_ATTRS -_mm_storel_epi64(__m128i *__p, __m128i __a) +_mm_storel_epi64(__m128i_u *__p, __m128i __a) { struct __mm_storel_epi64_struct { long long __u; diff --git a/lib/include/enqcmdintrin.h b/lib/include/enqcmdintrin.h new file mode 100644 index 000000000..30af67f6b --- /dev/null +++ b/lib/include/enqcmdintrin.h @@ -0,0 +1,63 @@ +/*===------------------ enqcmdintrin.h - enqcmd intrinsics -----------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __ENQCMDINTRIN_H +#define __ENQCMDINTRIN_H + +/* Define the default attributes for the functions in this file */ +#define _DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("enqcmd"))) + +/// Reads 64-byte command pointed by \a __src, formats 64-byte enqueue store +/// data, and performs 64-byte enqueue store to memory pointed by \a __dst. +/// This intrinsics may only be used in User mode. +/// +/// \headerfile +/// +/// This intrinsics corresponds to the ENQCMD instruction. +/// +/// \param __dst +/// Pointer to the destination of the enqueue store. +/// \param __src +/// Pointer to 64-byte command data. +/// \returns If the command data is successfully written to \a __dst then 0 is +/// returned. Otherwise 1 is returned. +static __inline__ int _DEFAULT_FN_ATTRS +_enqcmd (void *__dst, const void *__src) +{ + return __builtin_ia32_enqcmd(__dst, __src); +} + +/// Reads 64-byte command pointed by \a __src, formats 64-byte enqueue store +/// data, and performs 64-byte enqueue store to memory pointed by \a __dst +/// This intrinsic may only be used in Privileged mode. +/// +/// \headerfile +/// +/// This intrinsics corresponds to the ENQCMDS instruction. +/// +/// \param __dst +/// Pointer to the destination of the enqueue store. +/// \param __src +/// Pointer to 64-byte command data. +/// \returns If the command data is successfully written to \a __dst then 0 is +/// returned. Otherwise 1 is returned. +static __inline__ int _DEFAULT_FN_ATTRS +_enqcmds (void *__dst, const void *__src) +{ + return __builtin_ia32_enqcmds(__dst, __src); +} + +#undef _DEFAULT_FN_ATTRS + +#endif /* __ENQCMDINTRIN_H */ diff --git a/lib/include/f16cintrin.h b/lib/include/f16cintrin.h index 3d35f28eb..109b604ad 100644 --- a/lib/include/f16cintrin.h +++ b/lib/include/f16cintrin.h @@ -1,22 +1,8 @@ /*===---- f16cintrin.h - F16C intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
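Together with the new bit_ENQCMD CPUID bit added to cpuid.h above, the new enqcmdintrin.h supports the usual detect-then-use pattern. A hypothetical sketch (the helper names and the portal/cmd64 parameters are ours; calling _enqcmd requires building with the enqcmd feature, e.g. -menqcmd):

    #include <cpuid.h>
    #include <immintrin.h>

    /* Nonzero if CPUID.(EAX=7,ECX=0):ECX advertises ENQCMD (bit 29). */
    static int have_enqcmd(void)
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
            return 0;
        return (ecx & bit_ENQCMD) != 0;
    }

    /* Submits a 64-byte command to a device portal; 0 means the device
     * accepted the command, 1 means it did not (see _enqcmd above). */
    static int submit(void *portal, const void *cmd64)
    {
        return _enqcmd(portal, cmd64);
    }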
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -52,9 +38,9 @@ static __inline float __DEFAULT_FN_ATTRS128 _cvtsh_ss(unsigned short __a) { - __v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0}; - __v4sf r = __builtin_ia32_vcvtph2ps(v); - return r[0]; + __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0}; + __v4sf __r = __builtin_ia32_vcvtph2ps(__v); + return __r[0]; } /// Converts a 32-bit single-precision float value to a 16-bit diff --git a/lib/include/float.h b/lib/include/float.h index 56215cd62..ed610b24a 100644 --- a/lib/include/float.h +++ b/lib/include/float.h @@ -1,22 +1,8 @@ /*===---- float.h - Characteristics of floating point types ----------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
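The _cvtsh_ss rewrite above only moves the locals into the reserved __-prefixed namespace (so a user macro named v or r cannot break the header); behavior is unchanged. For reference, a one-line use, assuming an F16C-enabled build (-mf16c):

    #include <immintrin.h>

    float one(void)
    {
        return _cvtsh_ss(0x3C00);   /* binary16 0x3C00 widens to 1.0f */
    }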
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -51,7 +37,7 @@ # undef FLT_MANT_DIG # undef DBL_MANT_DIG # undef LDBL_MANT_DIG -# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) +# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L # undef DECIMAL_DIG # endif # undef FLT_DIG @@ -78,7 +64,7 @@ # undef FLT_MIN # undef DBL_MIN # undef LDBL_MIN -# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) +# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L # undef FLT_TRUE_MIN # undef DBL_TRUE_MIN # undef LDBL_TRUE_MIN @@ -101,7 +87,7 @@ #define DBL_MANT_DIG __DBL_MANT_DIG__ #define LDBL_MANT_DIG __LDBL_MANT_DIG__ -#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) +#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L # define DECIMAL_DIG __DECIMAL_DIG__ #endif @@ -137,7 +123,7 @@ #define DBL_MIN __DBL_MIN__ #define LDBL_MIN __LDBL_MIN__ -#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) +#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L # define FLT_TRUE_MIN __FLT_DENORM_MIN__ # define DBL_TRUE_MIN __DBL_DENORM_MIN__ # define LDBL_TRUE_MIN __LDBL_DENORM_MIN__ diff --git a/lib/include/fma4intrin.h b/lib/include/fma4intrin.h index 7bae2f4a3..694801b3e 100644 --- a/lib/include/fma4intrin.h +++ b/lib/include/fma4intrin.h @@ -1,22 +1,8 @@ /*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
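The float.h hunks extend the existing C99/C11 gates so that C++11 also exposes DECIMAL_DIG and C++17 also exposes FLT_TRUE_MIN and friends, matching what those C++ standards inherit from the C library headers. A trivial probe, assuming a C11 (or C++17) translation unit:

    #include <float.h>

    /* Both now expand under -std=c++11 / -std=c++17 as well, not just
     * under C99/C11 or non-strict-ANSI modes. */
    double decimal_digits = DECIMAL_DIG;
    float smallest_subnormal = FLT_TRUE_MIN;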
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/fmaintrin.h b/lib/include/fmaintrin.h index 094d13afe..d889b7c5e 100644 --- a/lib/include/fmaintrin.h +++ b/lib/include/fmaintrin.h @@ -1,22 +1,8 @@ /*===---- fmaintrin.h - FMA intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/fxsrintrin.h b/lib/include/fxsrintrin.h index 704b5ad60..afee6aa97 100644 --- a/lib/include/fxsrintrin.h +++ b/lib/include/fxsrintrin.h @@ -1,22 +1,8 @@ /*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/gfniintrin.h b/lib/include/gfniintrin.h index 804d4f3d0..9bff0fcb6 100644 --- a/lib/include/gfniintrin.h +++ b/lib/include/gfniintrin.h @@ -1,23 +1,9 @@ /*===----------------- gfniintrin.h - GFNI intrinsics ----------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/htmintrin.h b/lib/include/htmintrin.h index 69c8d7bb5..49c2b9860 100644 --- a/lib/include/htmintrin.h +++ b/lib/include/htmintrin.h @@ -1,22 +1,8 @@ /*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\ * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ diff --git a/lib/include/htmxlintrin.h b/lib/include/htmxlintrin.h index 049dbd61d..6ef6f4b34 100644 --- a/lib/include/htmxlintrin.h +++ b/lib/include/htmxlintrin.h @@ -1,22 +1,8 @@ /*===---- htmxlintrin.h - XL compiler HTM execution intrinsics-------------===*\ * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ diff --git a/lib/include/ia32intrin.h b/lib/include/ia32intrin.h index f8972e305..8e38df731 100644 --- a/lib/include/ia32intrin.h +++ b/lib/include/ia32intrin.h @@ -1,22 +1,8 @@ /* ===-------- ia32intrin.h ---------------------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -28,6 +14,160 @@ #ifndef __IA32INTRIN_H #define __IA32INTRIN_H +/** Find the first set bit starting from the lsb. 
Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSF instruction or the + * TZCNT instruction. + * + * \param __A + * A 32-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__bsfd(int __A) { + return __builtin_ctz(__A); +} + +/** Find the first set bit starting from the msb. Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSR instruction or the + * LZCNT instruction and an XOR . + * + * \param __A + * A 32-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__bsrd(int __A) { + return 31 - __builtin_clz(__A); +} + +/** Swaps the bytes in the input. Converting little endian to big endian or + * vice versa. + * + * \headerfile + * + * This intrinsic corresponds to the BSWAP instruction. + * + * \param __A + * A 32-bit integer operand. + * \returns A 32-bit integer containing the swapped bytes. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__bswapd(int __A) { + return __builtin_bswap32(__A); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +_bswap(int __A) { + return __builtin_bswap32(__A); +} + +#define _bit_scan_forward(A) __bsfd((A)) +#define _bit_scan_reverse(A) __bsrd((A)) + +#ifdef __x86_64__ +/** Find the first set bit starting from the lsb. Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSF instruction or the + * TZCNT instruction. + * + * \param __A + * A 64-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__bsfq(long long __A) { + return __builtin_ctzll(__A); +} + +/** Find the first set bit starting from the msb. Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSR instruction or the + * LZCNT instruction and an XOR . + * + * \param __A + * A 64-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__bsrq(long long __A) { + return 63 - __builtin_clzll(__A); +} + +/** Swaps the bytes in the input. Converting little endian to big endian or + * vice versa. + * + * \headerfile + * + * This intrinsic corresponds to the BSWAP instruction. + * + * \param __A + * A 64-bit integer operand. + * \returns A 64-bit integer containing the swapped bytes. + */ +static __inline__ long long __attribute__((__always_inline__, __nodebug__)) +__bswapq(long long __A) { + return __builtin_bswap64(__A); +} + +#define _bswap64(A) __bswapq((A)) +#endif + +/** Counts the number of bits in the source operand having a value of 1. + * + * \headerfile + * + * This intrinsic corresponds to the POPCNT instruction or a + * a sequence of arithmetic and logic ops to calculate it. + * + * \param __A + * An unsigned 32-bit integer operand. + * \returns A 32-bit integer containing the number of bits with value 1 in the + * source operand. + */ +static __inline__ int __attribute__((__always_inline__, __nodebug__)) +__popcntd(unsigned int __A) +{ + return __builtin_popcount(__A); +} + +#define _popcnt32(A) __popcntd((A)) + +#ifdef __x86_64__ +/** Counts the number of bits in the source operand having a value of 1. 
+ * + * \headerfile + * + * This intrinsic corresponds to the POPCNT instruction or a + * a sequence of arithmetic and logic ops to calculate it. + * + * \param __A + * An unsigned 64-bit integer operand. + * \returns A 64-bit integer containing the number of bits with value 1 in the + * source operand. + */ +static __inline__ long long __attribute__((__always_inline__, __nodebug__)) +__popcntq(unsigned long long __A) +{ + return __builtin_popcountll(__A); +} + +#define _popcnt64(A) __popcntq((A)) +#endif /* __x86_64__ */ + #ifdef __x86_64__ static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) __readeflags(void) @@ -55,6 +195,92 @@ __writeeflags(unsigned int __f) } #endif /* !__x86_64__ */ +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * unsigned char operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32B instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 8-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. + */ +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +__crc32b(unsigned int __C, unsigned char __D) +{ + return __builtin_ia32_crc32qi(__C, __D); +} + +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * unsigned short operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32W instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 16-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. + */ +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +__crc32w(unsigned int __C, unsigned short __D) +{ + return __builtin_ia32_crc32hi(__C, __D); +} + +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * second unsigned integer operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32D instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 32-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. + */ +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +__crc32d(unsigned int __C, unsigned int __D) +{ + return __builtin_ia32_crc32si(__C, __D); +} + +#ifdef __x86_64__ +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * unsigned 64-bit integer operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32Q instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 64-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. 
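The popcount helpers above lower to POPCNT or to plain arithmetic, while the CRC-32C helpers carry an sse4.2 target attribute, so callers must build with SSE4.2 enabled. A short illustrative use (the crc32c wrapper name is ours, not the header's; needs -msse4.2):

    #include <x86intrin.h>

    /* Accumulates a CRC-32C over a byte buffer with the new __crc32b. */
    unsigned int crc32c(unsigned int crc, const unsigned char *p,
                        unsigned long n)
    {
        while (n--)
            crc = __crc32b(crc, *p++);
        return crc;
    }

    int popcount_demo(void)
    {
        return __popcntd(0xF0F0u);   /* 8; _popcnt32 is the macro alias */
    }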
+ */ +static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +__crc32q(unsigned long long __C, unsigned long long __D) +{ + return __builtin_ia32_crc32di(__C, __D); +} +#endif /* __x86_64__ */ + static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) __rdpmc(int __A) { return __builtin_ia32_rdpmc(__A); @@ -75,4 +301,64 @@ _wbinvd(void) { __builtin_ia32_wbinvd(); } +static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__)) +__rolb(unsigned char __X, int __C) { + return __builtin_rotateleft8(__X, __C); +} + +static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__)) +__rorb(unsigned char __X, int __C) { + return __builtin_rotateright8(__X, __C); +} + +static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__)) +__rolw(unsigned short __X, int __C) { + return __builtin_rotateleft16(__X, __C); +} + +static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__)) +__rorw(unsigned short __X, int __C) { + return __builtin_rotateright16(__X, __C); +} + +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) +__rold(unsigned int __X, int __C) { + return __builtin_rotateleft32(__X, __C); +} + +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) +__rord(unsigned int __X, int __C) { + return __builtin_rotateright32(__X, __C); +} + +#ifdef __x86_64__ +static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) +__rolq(unsigned long long __X, int __C) { + return __builtin_rotateleft64(__X, __C); +} + +static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) +__rorq(unsigned long long __X, int __C) { + return __builtin_rotateright64(__X, __C); +} +#endif /* __x86_64__ */ + +#ifndef _MSC_VER +/* These are already provided as builtins for MSVC. */ +/* Select the correct function based on the size of long. */ +#ifdef __LP64__ +#define _lrotl(a,b) __rolq((a), (b)) +#define _lrotr(a,b) __rorq((a), (b)) +#else +#define _lrotl(a,b) __rold((a), (b)) +#define _lrotr(a,b) __rord((a), (b)) +#endif +#define _rotl(a,b) __rold((a), (b)) +#define _rotr(a,b) __rord((a), (b)) +#endif // _MSC_VER + +/* These are not builtins so need to be provided in all modes. */ +#define _rotwl(a,b) __rolw((a), (b)) +#define _rotwr(a,b) __rorw((a), (b)) + #endif /* __IA32INTRIN_H */ diff --git a/lib/include/immintrin.h b/lib/include/immintrin.h index 7d0722ec7..7555ad82f 100644 --- a/lib/include/immintrin.h +++ b/lib/include/immintrin.h @@ -1,22 +1,8 @@ /*===---- immintrin.h - Intel intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
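The rotate helpers map one-to-one onto Clang's __builtin_rotate* builtins, and the _lrotl/_lrotr/_rotl/_rotr macros at the end select a width from sizeof(long) so the MSVC spellings keep working outside MSVC mode. A brief check of the semantics:

    #include <x86intrin.h>
    #include <assert.h>

    void rotate_demo(void)
    {
        assert(__rolb(0x81, 1) == 0x03);               /* 8-bit rotate left   */
        assert(__rord(0x00000018u, 4) == 0x80000001u); /* 32-bit rotate right */
        assert(_lrotl(1UL, 1) == 2UL);   /* __rolq on LP64, __rold otherwise */
    }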
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -195,6 +181,15 @@ #include #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BF16__) +#include +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512BF16__)) +#include +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__) #include #endif @@ -241,18 +236,6 @@ _rdrand64_step(unsigned long long *__p) #endif #endif /* __RDRND__ */ -/* __bit_scan_forward */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_bit_scan_forward(int __A) { - return __builtin_ctz(__A); -} - -/* __bit_scan_reverse */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) -_bit_scan_reverse(int __A) { - return 31 - __builtin_clz(__A); -} - #if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__) #ifdef __x86_64__ static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) @@ -378,9 +361,8 @@ _storebe_i64(void * __P, long long __D) { #include #endif -#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVE__) +/* No feature check desired due to internal MSC_VER checks */ #include -#endif #if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__) #include @@ -439,7 +421,21 @@ _storebe_i64(void * __P, long long __D) { #include #endif -#ifdef _MSC_VER +#if !defined(_MSC_VER) || __has_feature(modules) || \ + defined(__AVX512VP2INTERSECT__) +#include +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__)) +#include +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__ENQCMD__) +#include +#endif + +#if defined(_MSC_VER) && __has_extension(gnu_asm) /* Define the default attributes for these intrinsics */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) #ifdef __cplusplus @@ -521,6 +517,6 @@ _InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination, #undef __DEFAULT_FN_ATTRS -#endif /* _MSC_VER */ +#endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */ #endif /* __IMMINTRIN_H */ diff --git a/lib/include/intrin.h b/lib/include/intrin.h index 966258bab..9786ba147 100644 --- a/lib/include/intrin.h +++ b/lib/include/intrin.h @@ -1,22 +1,8 @@ /* ===-------- intrin.h ---------------------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
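The immintrin.h hunks keep the established pattern: each sub-header is pulled in when its feature macro is defined, or unconditionally for MSVC/modules builds where everything must be visible. User code can key off the same macros; a minimal, illustrative probe:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
    #ifdef __AVX512BF16__
        puts("avx512bf16intrin.h available");   /* e.g. -mavx512bf16 */
    #endif
    #ifdef __ENQCMD__
        puts("enqcmdintrin.h available");       /* e.g. -menqcmd */
    #endif
        return 0;
    }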
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -200,10 +186,6 @@ __attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void); unsigned __int32 xbegin(void); void _xend(void); -static __inline__ -#define _XCR_XFEATURE_ENABLED_MASK 0 -unsigned __int64 __cdecl _xgetbv(unsigned int); -void __cdecl _xsetbv(unsigned int, unsigned __int64); /* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */ #ifdef __x86_64__ @@ -539,12 +521,6 @@ __cpuidex(int __info[4], int __level, int __ecx) { __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3]) : "a"(__level), "c"(__ecx)); } -static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS -_xgetbv(unsigned int __xcr_no) { - unsigned int __eax, __edx; - __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no)); - return ((unsigned __int64)__edx << 32) | __eax; -} static __inline__ void __DEFAULT_FN_ATTRS __halt(void) { __asm__ volatile ("hlt"); @@ -567,15 +543,9 @@ long _InterlockedAdd(long volatile *Addend, long Value); __int64 _ReadStatusReg(int); void _WriteStatusReg(int, __int64); -static inline unsigned short _byteswap_ushort (unsigned short val) { - return __builtin_bswap16(val); -} -static inline unsigned long _byteswap_ulong (unsigned long val) { - return __builtin_bswap32(val); -} -static inline unsigned __int64 _byteswap_uint64 (unsigned __int64 val) { - return __builtin_bswap64(val); -} +unsigned short __cdecl _byteswap_ushort(unsigned short val); +unsigned long __cdecl _byteswap_ulong (unsigned long val); +unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64 val); #endif /*----------------------------------------------------------------------------*\ diff --git a/lib/include/inttypes.h b/lib/include/inttypes.h index 1d8eabab0..1c894c4ac 100644 --- a/lib/include/inttypes.h +++ b/lib/include/inttypes.h @@ -1,27 +1,18 @@ /*===---- inttypes.h - Standard header for integer printf macros ----------===*\ * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
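With the duplicate inline definition removed from intrin.h, _xgetbv is now provided in one place (xsaveintrin.h, which immintrin.h includes without a feature check, as noted above). A sketch of the common XCR0 query; the helper name is ours, and a non-MSVC build needs -mxsave:

    #include <immintrin.h>

    /* Reads XCR0 (register 0, the old _XCR_XFEATURE_ENABLED_MASK) and
     * checks that the OS enabled XMM (bit 1) and YMM (bit 2) state. */
    int os_supports_avx_state(void)
    {
        unsigned long long xcr0 = _xgetbv(0);
        return (xcr0 & 0x6) == 0x6;
    }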
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ #ifndef __CLANG_INTTYPES_H +// AIX system headers need inttypes.h to be re-enterable while _STD_TYPES_T +// is defined until an inclusion of it without _STD_TYPES_T occurs, in which +// case the header guard macro is defined. +#if !defined(_AIX) || !defined(_STD_TYPES_T) #define __CLANG_INTTYPES_H +#endif #if defined(_MSC_VER) && _MSC_VER < 1800 #error MSVC does not have inttypes.h prior to Visual Studio 2013 diff --git a/lib/include/invpcidintrin.h b/lib/include/invpcidintrin.h index c30a19fa3..48dae0a86 100644 --- a/lib/include/invpcidintrin.h +++ b/lib/include/invpcidintrin.h @@ -1,22 +1,8 @@ /*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/iso646.h b/lib/include/iso646.h index dca13c5ba..e0a20c6f1 100644 --- a/lib/include/iso646.h +++ b/lib/include/iso646.h @@ -1,24 +1,8 @@ /*===---- iso646.h - Standard header for alternate spellings of operators---=== * - * Copyright (c) 2008 Eli Friedman - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/limits.h b/lib/include/limits.h index f04187ced..c653580ba 100644 --- a/lib/include/limits.h +++ b/lib/include/limits.h @@ -1,24 +1,8 @@ /*===---- limits.h - Standard header for integer sizes --------------------===*\ * - * Copyright (c) 2009 Chris Lattner - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ diff --git a/lib/include/lwpintrin.h b/lib/include/lwpintrin.h index 0b28d7358..d8ab0db03 100644 --- a/lib/include/lwpintrin.h +++ b/lib/include/lwpintrin.h @@ -1,22 +1,8 @@ /*===---- lwpintrin.h - LWP intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/lzcntintrin.h b/lib/include/lzcntintrin.h index 35c1651cc..f4ddce9d0 100644 --- a/lib/include/lzcntintrin.h +++ b/lib/include/lzcntintrin.h @@ -1,22 +1,8 @@ /*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/mm3dnow.h b/lib/include/mm3dnow.h index b0288757a..22ab13aa3 100644 --- a/lib/include/mm3dnow.h +++ b/lib/include/mm3dnow.h @@ -1,22 +1,8 @@ /*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
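// For orientation, the opencl_c module after the module.modulemap hunk
// a few files below would read roughly as follows (reconstructed from
// the diff context, not authoritative): the new opencl-c-base.h becomes
// visible through the same module as opencl-c.h.
module opencl_c {
  requires opencl
  header "opencl-c.h"
  header "opencl-c-base.h"
}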
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/mm_malloc.h b/lib/include/mm_malloc.h index 305afd31a..0ea32517a 100644 --- a/lib/include/mm_malloc.h +++ b/lib/include/mm_malloc.h @@ -1,22 +1,8 @@ /*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/mmintrin.h b/lib/include/mmintrin.h index a73539942..79a8b5501 100644 --- a/lib/include/mmintrin.h +++ b/lib/include/mmintrin.h @@ -1,22 +1,8 @@ /*===---- mmintrin.h - MMX intrinsics --------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -24,7 +10,7 @@ #ifndef __MMINTRIN_H #define __MMINTRIN_H -typedef long long __m64 __attribute__((__vector_size__(8))); +typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8))); typedef long long __v1di __attribute__((__vector_size__(8))); typedef int __v2si __attribute__((__vector_size__(8))); diff --git a/lib/include/module.modulemap b/lib/include/module.modulemap index 1d1af57fd..7954a77a4 100644 --- a/lib/include/module.modulemap +++ b/lib/include/module.modulemap @@ -1,22 +1,8 @@ /*===---- module.modulemap - intrinsics module map -------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -168,4 +154,5 @@ module _Builtin_stddef_max_align_t [system] [extern_c] { module opencl_c { requires opencl header "opencl-c.h" + header "opencl-c-base.h" } diff --git a/lib/include/movdirintrin.h b/lib/include/movdirintrin.h index ec20c5370..30c4d02c8 100644 --- a/lib/include/movdirintrin.h +++ b/lib/include/movdirintrin.h @@ -1,22 +1,8 @@ /*===------------------------- movdirintrin.h ------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/msa.h b/lib/include/msa.h index da680f5ca..19ea6071a 100644 --- a/lib/include/msa.h +++ b/lib/include/msa.h @@ -1,22 +1,8 @@ /*===---- msa.h - MIPS MSA intrinsics --------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/mwaitxintrin.h b/lib/include/mwaitxintrin.h index 2921eadfa..bca395b0e 100644 --- a/lib/include/mwaitxintrin.h +++ b/lib/include/mwaitxintrin.h @@ -1,22 +1,8 @@ /*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
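/* A quick compile-time check (illustrative, not part of the patch) of
   the mmintrin.h change above: __m64 now carries an explicit
   __aligned__(8), pinning its alignment to 8 bytes rather than leaving
   it implied by __vector_size__ alone. The _sketch name is hypothetical. */
typedef long long __m64_sketch __attribute__((__vector_size__(8), __aligned__(8)));
_Static_assert(sizeof(__m64_sketch) == 8, "__m64 stays 8 bytes wide");
_Static_assert(_Alignof(__m64_sketch) == 8, "__m64 is explicitly 8-byte aligned");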
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/nmmintrin.h b/lib/include/nmmintrin.h index 348fb8c7c..672aea496 100644 --- a/lib/include/nmmintrin.h +++ b/lib/include/nmmintrin.h @@ -1,22 +1,8 @@ /*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/opencl-c-base.h b/lib/include/opencl-c-base.h new file mode 100644 index 000000000..a82954ddd --- /dev/null +++ b/lib/include/opencl-c-base.h @@ -0,0 +1,578 @@ +//===----- opencl-c-base.h - OpenCL C language base definitions -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _OPENCL_BASE_H_ +#define _OPENCL_BASE_H_ + +// built-in scalar data types: + +/** + * An unsigned 8-bit integer. + */ +typedef unsigned char uchar; + +/** + * An unsigned 16-bit integer. + */ +typedef unsigned short ushort; + +/** + * An unsigned 32-bit integer. + */ +typedef unsigned int uint; + +/** + * An unsigned 64-bit integer. + */ +typedef unsigned long ulong; + +/** + * The unsigned integer type of the result of the sizeof operator. This + * is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS + * defined in table 4.3 is 32-bits and is a 64-bit unsigned integer if + * CL_DEVICE_ADDRESS_BITS is 64-bits. + */ +typedef __SIZE_TYPE__ size_t; + +/** + * A signed integer type that is the result of subtracting two pointers. + * This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS + * defined in table 4.3 is 32-bits and is a 64-bit signed integer if + * CL_DEVICE_ADDRESS_BITS is 64-bits. 
+ */ +typedef __PTRDIFF_TYPE__ ptrdiff_t; + +/** + * A signed integer type with the property that any valid pointer to + * void can be converted to this type, then converted back to pointer + * to void, and the result will compare equal to the original pointer. + */ +typedef __INTPTR_TYPE__ intptr_t; + +/** + * An unsigned integer type with the property that any valid pointer to + * void can be converted to this type, then converted back to pointer + * to void, and the result will compare equal to the original pointer. + */ +typedef __UINTPTR_TYPE__ uintptr_t; + +// built-in vector data types: +typedef char char2 __attribute__((ext_vector_type(2))); +typedef char char3 __attribute__((ext_vector_type(3))); +typedef char char4 __attribute__((ext_vector_type(4))); +typedef char char8 __attribute__((ext_vector_type(8))); +typedef char char16 __attribute__((ext_vector_type(16))); +typedef uchar uchar2 __attribute__((ext_vector_type(2))); +typedef uchar uchar3 __attribute__((ext_vector_type(3))); +typedef uchar uchar4 __attribute__((ext_vector_type(4))); +typedef uchar uchar8 __attribute__((ext_vector_type(8))); +typedef uchar uchar16 __attribute__((ext_vector_type(16))); +typedef short short2 __attribute__((ext_vector_type(2))); +typedef short short3 __attribute__((ext_vector_type(3))); +typedef short short4 __attribute__((ext_vector_type(4))); +typedef short short8 __attribute__((ext_vector_type(8))); +typedef short short16 __attribute__((ext_vector_type(16))); +typedef ushort ushort2 __attribute__((ext_vector_type(2))); +typedef ushort ushort3 __attribute__((ext_vector_type(3))); +typedef ushort ushort4 __attribute__((ext_vector_type(4))); +typedef ushort ushort8 __attribute__((ext_vector_type(8))); +typedef ushort ushort16 __attribute__((ext_vector_type(16))); +typedef int int2 __attribute__((ext_vector_type(2))); +typedef int int3 __attribute__((ext_vector_type(3))); +typedef int int4 __attribute__((ext_vector_type(4))); +typedef int int8 __attribute__((ext_vector_type(8))); +typedef int int16 __attribute__((ext_vector_type(16))); +typedef uint uint2 __attribute__((ext_vector_type(2))); +typedef uint uint3 __attribute__((ext_vector_type(3))); +typedef uint uint4 __attribute__((ext_vector_type(4))); +typedef uint uint8 __attribute__((ext_vector_type(8))); +typedef uint uint16 __attribute__((ext_vector_type(16))); +typedef long long2 __attribute__((ext_vector_type(2))); +typedef long long3 __attribute__((ext_vector_type(3))); +typedef long long4 __attribute__((ext_vector_type(4))); +typedef long long8 __attribute__((ext_vector_type(8))); +typedef long long16 __attribute__((ext_vector_type(16))); +typedef ulong ulong2 __attribute__((ext_vector_type(2))); +typedef ulong ulong3 __attribute__((ext_vector_type(3))); +typedef ulong ulong4 __attribute__((ext_vector_type(4))); +typedef ulong ulong8 __attribute__((ext_vector_type(8))); +typedef ulong ulong16 __attribute__((ext_vector_type(16))); +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); +typedef float float8 __attribute__((ext_vector_type(8))); +typedef float float16 __attribute__((ext_vector_type(16))); +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +typedef half half2 __attribute__((ext_vector_type(2))); +typedef half half3 __attribute__((ext_vector_type(3))); +typedef half half4 __attribute__((ext_vector_type(4))); +typedef half half8 __attribute__((ext_vector_type(8))); +typedef half half16 
__attribute__((ext_vector_type(16))); +#endif +#ifdef cl_khr_fp64 +#if __OPENCL_C_VERSION__ < CL_VERSION_1_2 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif +typedef double double2 __attribute__((ext_vector_type(2))); +typedef double double3 __attribute__((ext_vector_type(3))); +typedef double double4 __attribute__((ext_vector_type(4))); +typedef double double8 __attribute__((ext_vector_type(8))); +typedef double double16 __attribute__((ext_vector_type(16))); +#endif + +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#define NULL ((void*)0) +#endif + +/** + * Value of maximum non-infinite single-precision floating-point + * number. + */ +#define MAXFLOAT 0x1.fffffep127f + +/** + * A positive float constant expression. HUGE_VALF evaluates + * to +infinity. Used as an error value returned by the built-in + * math functions. + */ +#define HUGE_VALF (__builtin_huge_valf()) + +/** + * A positive double constant expression. HUGE_VAL evaluates + * to +infinity. Used as an error value returned by the built-in + * math functions. + */ +#define HUGE_VAL (__builtin_huge_val()) + +/** + * A constant expression of type float representing positive or + * unsigned infinity. + */ +#define INFINITY (__builtin_inff()) + +/** + * A constant expression of type float representing a quiet NaN. + */ +#define NAN as_float(INT_MAX) + +#define FP_ILOGB0 INT_MIN +#define FP_ILOGBNAN INT_MAX + +#define FLT_DIG 6 +#define FLT_MANT_DIG 24 +#define FLT_MAX_10_EXP +38 +#define FLT_MAX_EXP +128 +#define FLT_MIN_10_EXP -37 +#define FLT_MIN_EXP -125 +#define FLT_RADIX 2 +#define FLT_MAX 0x1.fffffep127f +#define FLT_MIN 0x1.0p-126f +#define FLT_EPSILON 0x1.0p-23f + +#define M_E_F 2.71828182845904523536028747135266250f +#define M_LOG2E_F 1.44269504088896340735992468100189214f +#define M_LOG10E_F 0.434294481903251827651128918916605082f +#define M_LN2_F 0.693147180559945309417232121458176568f +#define M_LN10_F 2.30258509299404568401799145468436421f +#define M_PI_F 3.14159265358979323846264338327950288f +#define M_PI_2_F 1.57079632679489661923132169163975144f +#define M_PI_4_F 0.785398163397448309615660845819875721f +#define M_1_PI_F 0.318309886183790671537767526745028724f +#define M_2_PI_F 0.636619772367581343075535053490057448f +#define M_2_SQRTPI_F 1.12837916709551257389615890312154517f +#define M_SQRT2_F 1.41421356237309504880168872420969808f +#define M_SQRT1_2_F 0.707106781186547524400844362104849039f + +#define DBL_DIG 15 +#define DBL_MANT_DIG 53 +#define DBL_MAX_10_EXP +308 +#define DBL_MAX_EXP +1024 +#define DBL_MIN_10_EXP -307 +#define DBL_MIN_EXP -1021 +#define DBL_RADIX 2 +#define DBL_MAX 0x1.fffffffffffffp1023 +#define DBL_MIN 0x1.0p-1022 +#define DBL_EPSILON 0x1.0p-52 + +#define M_E 0x1.5bf0a8b145769p+1 +#define M_LOG2E 0x1.71547652b82fep+0 +#define M_LOG10E 0x1.bcb7b1526e50ep-2 +#define M_LN2 0x1.62e42fefa39efp-1 +#define M_LN10 0x1.26bb1bbb55516p+1 +#define M_PI 0x1.921fb54442d18p+1 +#define M_PI_2 0x1.921fb54442d18p+0 +#define M_PI_4 0x1.921fb54442d18p-1 +#define M_1_PI 0x1.45f306dc9c883p-2 +#define M_2_PI 0x1.45f306dc9c883p-1 +#define M_2_SQRTPI 0x1.20dd750429b6dp+0 +#define M_SQRT2 0x1.6a09e667f3bcdp+0 +#define M_SQRT1_2 0x1.6a09e667f3bcdp-1 + +#ifdef cl_khr_fp16 + +#define HALF_DIG 3 +#define HALF_MANT_DIG 11 +#define HALF_MAX_10_EXP +4 +#define HALF_MAX_EXP +16 +#define HALF_MIN_10_EXP -4 +#define HALF_MIN_EXP -13 +#define HALF_RADIX 2 +#define HALF_MAX ((0x1.ffcp15h)) +#define HALF_MIN ((0x1.0p-14h)) +#define HALF_EPSILON ((0x1.0p-10h)) + +#define M_E_H 2.71828182845904523536028747135266250h +#define 
M_LOG2E_H 1.44269504088896340735992468100189214h +#define M_LOG10E_H 0.434294481903251827651128918916605082h +#define M_LN2_H 0.693147180559945309417232121458176568h +#define M_LN10_H 2.30258509299404568401799145468436421h +#define M_PI_H 3.14159265358979323846264338327950288h +#define M_PI_2_H 1.57079632679489661923132169163975144h +#define M_PI_4_H 0.785398163397448309615660845819875721h +#define M_1_PI_H 0.318309886183790671537767526745028724h +#define M_2_PI_H 0.636619772367581343075535053490057448h +#define M_2_SQRTPI_H 1.12837916709551257389615890312154517h +#define M_SQRT2_H 1.41421356237309504880168872420969808h +#define M_SQRT1_2_H 0.707106781186547524400844362104849039h + +#endif //cl_khr_fp16 + +#define CHAR_BIT 8 +#define SCHAR_MAX 127 +#define SCHAR_MIN (-128) +#define UCHAR_MAX 255 +#define CHAR_MAX SCHAR_MAX +#define CHAR_MIN SCHAR_MIN +#define USHRT_MAX 65535 +#define SHRT_MAX 32767 +#define SHRT_MIN (-32768) +#define UINT_MAX 0xffffffff +#define INT_MAX 2147483647 +#define INT_MIN (-2147483647-1) +#define ULONG_MAX 0xffffffffffffffffUL +#define LONG_MAX 0x7fffffffffffffffL +#define LONG_MIN (-0x7fffffffffffffffL-1) + +// OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions + +// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence +typedef uint cl_mem_fence_flags; + +/** + * Queue a memory fence to ensure correct + * ordering of memory operations to local memory + */ +#define CLK_LOCAL_MEM_FENCE 0x01 + +/** + * Queue a memory fence to ensure correct + * ordering of memory operations to global memory + */ +#define CLK_GLOBAL_MEM_FENCE 0x02 + +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +typedef enum memory_scope { + memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, + memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, + memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, + memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, +#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) + memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP +#endif +} memory_scope; + +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +/** + * Queue a memory fence to ensure correct ordering of memory + * operations between work-items of a work-group to + * image memory. + */ +#define CLK_IMAGE_MEM_FENCE 0x04 + +#ifndef ATOMIC_VAR_INIT +#define ATOMIC_VAR_INIT(x) (x) +#endif //ATOMIC_VAR_INIT +#define ATOMIC_FLAG_INIT 0 + +// enum values aligned with what clang uses in EmitAtomicExpr() +typedef enum memory_order +{ + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +} memory_order; + +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions + +// These values need to match the runtime equivalent +// +// Addressing Mode. +// +#define CLK_ADDRESS_NONE 0 +#define CLK_ADDRESS_CLAMP_TO_EDGE 2 +#define CLK_ADDRESS_CLAMP 4 +#define CLK_ADDRESS_REPEAT 6 +#define CLK_ADDRESS_MIRRORED_REPEAT 8 + +// +// Coordination Normalization +// +#define CLK_NORMALIZED_COORDS_FALSE 0 +#define CLK_NORMALIZED_COORDS_TRUE 1 + +// +// Filtering Mode. 
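/* A short usage sketch (illustrative, not part of the header) tying
   together the synchronization pieces defined above: CLK_LOCAL_MEM_FENCE
   with barrier(), and the memory_order/memory_scope enums with the
   explicit atomics declared in opencl-c.h. Kernel and variable names
   are hypothetical; requires OpenCL C 2.0. */
kernel void count_nonzero(global const int *in,
                          volatile global atomic_uint *total) {
  local atomic_uint tile_count;
  if (get_local_id(0) == 0)
    atomic_store_explicit(&tile_count, 0u,
                          memory_order_relaxed, memory_scope_work_group);
  /* Every work-item must see the zeroed counter before updating it. */
  barrier(CLK_LOCAL_MEM_FENCE);
  if (in[get_global_id(0)] != 0)
    atomic_fetch_add_explicit(&tile_count, 1u,
                              memory_order_relaxed, memory_scope_work_group);
  barrier(CLK_LOCAL_MEM_FENCE);
  /* One representative per work-group publishes the tile total. */
  if (get_local_id(0) == 0)
    atomic_fetch_add_explicit(total,
                              atomic_load_explicit(&tile_count,
                                                   memory_order_relaxed,
                                                   memory_scope_work_group),
                              memory_order_relaxed, memory_scope_device);
}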
+// +#define CLK_FILTER_NEAREST 0x10 +#define CLK_FILTER_LINEAR 0x20 + +#ifdef cl_khr_gl_msaa_sharing +#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable +#endif //cl_khr_gl_msaa_sharing + +// +// Channel Datatype. +// +#define CLK_SNORM_INT8 0x10D0 +#define CLK_SNORM_INT16 0x10D1 +#define CLK_UNORM_INT8 0x10D2 +#define CLK_UNORM_INT16 0x10D3 +#define CLK_UNORM_SHORT_565 0x10D4 +#define CLK_UNORM_SHORT_555 0x10D5 +#define CLK_UNORM_INT_101010 0x10D6 +#define CLK_SIGNED_INT8 0x10D7 +#define CLK_SIGNED_INT16 0x10D8 +#define CLK_SIGNED_INT32 0x10D9 +#define CLK_UNSIGNED_INT8 0x10DA +#define CLK_UNSIGNED_INT16 0x10DB +#define CLK_UNSIGNED_INT32 0x10DC +#define CLK_HALF_FLOAT 0x10DD +#define CLK_FLOAT 0x10DE +#define CLK_UNORM_INT24 0x10DF + +// Channel order, numbering must be aligned with cl_channel_order in cl.h +// +#define CLK_R 0x10B0 +#define CLK_A 0x10B1 +#define CLK_RG 0x10B2 +#define CLK_RA 0x10B3 +#define CLK_RGB 0x10B4 +#define CLK_RGBA 0x10B5 +#define CLK_BGRA 0x10B6 +#define CLK_ARGB 0x10B7 +#define CLK_INTENSITY 0x10B8 +#define CLK_LUMINANCE 0x10B9 +#define CLK_Rx 0x10BA +#define CLK_RGx 0x10BB +#define CLK_RGBx 0x10BC +#define CLK_DEPTH 0x10BD +#define CLK_DEPTH_STENCIL 0x10BE +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#define CLK_sRGB 0x10BF +#define CLK_sRGBx 0x10C0 +#define CLK_sRGBA 0x10C1 +#define CLK_sBGRA 0x10C2 +#define CLK_ABGR 0x10C3 +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +// OpenCL v2.0 s6.13.16 - Pipe Functions +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t)) +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 + + +// OpenCL v2.0 s6.13.17 - Enqueue Kernels +#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +#define CLK_SUCCESS 0 +#define CLK_ENQUEUE_FAILURE -101 +#define CLK_INVALID_QUEUE -102 +#define CLK_INVALID_NDRANGE -160 +#define CLK_INVALID_EVENT_WAIT_LIST -57 +#define CLK_DEVICE_QUEUE_FULL -161 +#define CLK_INVALID_ARG_SIZE -51 +#define CLK_EVENT_ALLOCATION_FAILURE -100 +#define CLK_OUT_OF_RESOURCES -5 + +#define CLK_NULL_QUEUE 0 +#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t)) + +// execution model related definitions +#define CLK_ENQUEUE_FLAGS_NO_WAIT 0x0 +#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0x1 +#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 0x2 + +typedef int kernel_enqueue_flags_t; +typedef int clk_profiling_info; + +// Profiling info name (see capture_event_profiling_info) +#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1 + +#define MAX_WORK_DIM 3 + +typedef struct { + unsigned int workDimension; + size_t globalWorkOffset[MAX_WORK_DIM]; + size_t globalWorkSize[MAX_WORK_DIM]; + size_t localWorkSize[MAX_WORK_DIM]; +} ndrange_t; + +#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 + +#ifdef cl_intel_device_side_avc_motion_estimation +#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin + +#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0 +#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1 +#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2 +#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3 + +#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0 +#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1 +#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2 +#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3 + +#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0 +#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 +#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 + +#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 +#define 
CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E +#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D +#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B +#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 +#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F +#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F +#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F + +#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 +#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 +#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 + +#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 +#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 +#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 +#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 +#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 +#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 +#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 +#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 +#define CLK_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 + +#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 + +#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 + +#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 +#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 +#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 +#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 + +#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 +#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 +#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 +#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B +#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 + +#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 +#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 +#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 +#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 + +#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0 +#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1 +#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2 + +#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 +#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 + +#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2 << 24) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1 << 26) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2 << 26) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1 << 28) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2 << 28) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1 << 30) +#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2 << 30) + +#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 +#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 + +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0 +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 +#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 + +#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 +#define 
CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 +#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 +#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 + +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1 +#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2 +#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3 + +#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 +#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 + +#define CLK_AVC_ME_INITIALIZE_INTEL 0x0 + +#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL 0x0 +#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL 0x0 +#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL 0x0 + +#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL 0x0 + +#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 +#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 +#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 + +#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end +#endif // cl_intel_device_side_avc_motion_estimation + +#endif //_OPENCL_BASE_H_ diff --git a/lib/include/opencl-c.h b/lib/include/opencl-c.h index 3d3dfb749..4207c53cc 100644 --- a/lib/include/opencl-c.h +++ b/lib/include/opencl-c.h @@ -1,15 +1,16 @@ //===--- opencl-c.h - OpenCL C language builtin function header -----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef _OPENCL_H_ #define _OPENCL_H_ +#include "opencl-c-base.h" + #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 #ifndef cl_khr_depth_images #define cl_khr_depth_images @@ -23,9 +24,6 @@ #endif //__OPENCL_C_VERSION__ < CL_VERSION_2_0 #if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 -#ifndef cl_intel_planar_yuv -#define cl_intel_planar_yuv -#endif // cl_intel_planar_yuv #pragma OPENCL EXTENSION cl_intel_planar_yuv : begin #pragma OPENCL EXTENSION cl_intel_planar_yuv : end #endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 @@ -37,255 +35,6 @@ #define __purefn __attribute__((pure)) #define __cnfn __attribute__((const)) -// built-in scalar data types: - -/** - * An unsigned 8-bit integer. 
- */ -typedef unsigned char uchar; - -/** - * An unsigned 16-bit integer. - */ -typedef unsigned short ushort; - -/** - * An unsigned 32-bit integer. - */ -typedef unsigned int uint; - -/** - * An unsigned 64-bit integer. - */ -typedef unsigned long ulong; - -/** - * The unsigned integer type of the result of the sizeof operator. This - * is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS - * defined in table 4.3 is 32-bits and is a 64-bit unsigned integer if - * CL_DEVICE_ADDRESS_BITS is 64-bits. - */ -typedef __SIZE_TYPE__ size_t; - -/** - * A signed integer type that is the result of subtracting two pointers. - * This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS - * defined in table 4.3 is 32-bits and is a 64-bit signed integer if - * CL_DEVICE_ADDRESS_BITS is 64-bits. - */ -typedef __PTRDIFF_TYPE__ ptrdiff_t; - -/** -* A signed integer type with the property that any valid pointer to -* void can be converted to this type, then converted back to pointer -* to void, and the result will compare equal to the original pointer. -*/ -typedef __INTPTR_TYPE__ intptr_t; - -/** -* An unsigned integer type with the property that any valid pointer to -* void can be converted to this type, then converted back to pointer -* to void, and the result will compare equal to the original pointer. -*/ -typedef __UINTPTR_TYPE__ uintptr_t; - -// built-in vector data types: -typedef char char2 __attribute__((ext_vector_type(2))); -typedef char char3 __attribute__((ext_vector_type(3))); -typedef char char4 __attribute__((ext_vector_type(4))); -typedef char char8 __attribute__((ext_vector_type(8))); -typedef char char16 __attribute__((ext_vector_type(16))); -typedef uchar uchar2 __attribute__((ext_vector_type(2))); -typedef uchar uchar3 __attribute__((ext_vector_type(3))); -typedef uchar uchar4 __attribute__((ext_vector_type(4))); -typedef uchar uchar8 __attribute__((ext_vector_type(8))); -typedef uchar uchar16 __attribute__((ext_vector_type(16))); -typedef short short2 __attribute__((ext_vector_type(2))); -typedef short short3 __attribute__((ext_vector_type(3))); -typedef short short4 __attribute__((ext_vector_type(4))); -typedef short short8 __attribute__((ext_vector_type(8))); -typedef short short16 __attribute__((ext_vector_type(16))); -typedef ushort ushort2 __attribute__((ext_vector_type(2))); -typedef ushort ushort3 __attribute__((ext_vector_type(3))); -typedef ushort ushort4 __attribute__((ext_vector_type(4))); -typedef ushort ushort8 __attribute__((ext_vector_type(8))); -typedef ushort ushort16 __attribute__((ext_vector_type(16))); -typedef int int2 __attribute__((ext_vector_type(2))); -typedef int int3 __attribute__((ext_vector_type(3))); -typedef int int4 __attribute__((ext_vector_type(4))); -typedef int int8 __attribute__((ext_vector_type(8))); -typedef int int16 __attribute__((ext_vector_type(16))); -typedef uint uint2 __attribute__((ext_vector_type(2))); -typedef uint uint3 __attribute__((ext_vector_type(3))); -typedef uint uint4 __attribute__((ext_vector_type(4))); -typedef uint uint8 __attribute__((ext_vector_type(8))); -typedef uint uint16 __attribute__((ext_vector_type(16))); -typedef long long2 __attribute__((ext_vector_type(2))); -typedef long long3 __attribute__((ext_vector_type(3))); -typedef long long4 __attribute__((ext_vector_type(4))); -typedef long long8 __attribute__((ext_vector_type(8))); -typedef long long16 __attribute__((ext_vector_type(16))); -typedef ulong ulong2 __attribute__((ext_vector_type(2))); -typedef ulong ulong3 __attribute__((ext_vector_type(3))); -typedef 
ulong ulong4 __attribute__((ext_vector_type(4))); -typedef ulong ulong8 __attribute__((ext_vector_type(8))); -typedef ulong ulong16 __attribute__((ext_vector_type(16))); -typedef float float2 __attribute__((ext_vector_type(2))); -typedef float float3 __attribute__((ext_vector_type(3))); -typedef float float4 __attribute__((ext_vector_type(4))); -typedef float float8 __attribute__((ext_vector_type(8))); -typedef float float16 __attribute__((ext_vector_type(16))); -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -typedef half half2 __attribute__((ext_vector_type(2))); -typedef half half3 __attribute__((ext_vector_type(3))); -typedef half half4 __attribute__((ext_vector_type(4))); -typedef half half8 __attribute__((ext_vector_type(8))); -typedef half half16 __attribute__((ext_vector_type(16))); -#endif -#ifdef cl_khr_fp64 -#if __OPENCL_C_VERSION__ < CL_VERSION_1_2 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -#endif -typedef double double2 __attribute__((ext_vector_type(2))); -typedef double double3 __attribute__((ext_vector_type(3))); -typedef double double4 __attribute__((ext_vector_type(4))); -typedef double double8 __attribute__((ext_vector_type(8))); -typedef double double16 __attribute__((ext_vector_type(16))); -#endif - -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -#define NULL ((void*)0) -#endif - -/** - * Value of maximum non-infinite single-precision floating-point - * number. - */ -#define MAXFLOAT 0x1.fffffep127f - -/** - * A positive float constant expression. HUGE_VALF evaluates - * to +infinity. Used as an error value returned by the built-in - * math functions. - */ -#define HUGE_VALF (__builtin_huge_valf()) - -/** - * A positive double constant expression. HUGE_VAL evaluates - * to +infinity. Used as an error value returned by the built-in - * math functions. - */ -#define HUGE_VAL (__builtin_huge_val()) - -/** - * A constant expression of type float representing positive or - * unsigned infinity. - */ -#define INFINITY (__builtin_inff()) - -/** - * A constant expression of type float representing a quiet NaN. 
- */ -#define NAN as_float(INT_MAX) - -#define FP_ILOGB0 INT_MIN -#define FP_ILOGBNAN INT_MAX - -#define FLT_DIG 6 -#define FLT_MANT_DIG 24 -#define FLT_MAX_10_EXP +38 -#define FLT_MAX_EXP +128 -#define FLT_MIN_10_EXP -37 -#define FLT_MIN_EXP -125 -#define FLT_RADIX 2 -#define FLT_MAX 0x1.fffffep127f -#define FLT_MIN 0x1.0p-126f -#define FLT_EPSILON 0x1.0p-23f - -#define M_E_F 2.71828182845904523536028747135266250f -#define M_LOG2E_F 1.44269504088896340735992468100189214f -#define M_LOG10E_F 0.434294481903251827651128918916605082f -#define M_LN2_F 0.693147180559945309417232121458176568f -#define M_LN10_F 2.30258509299404568401799145468436421f -#define M_PI_F 3.14159265358979323846264338327950288f -#define M_PI_2_F 1.57079632679489661923132169163975144f -#define M_PI_4_F 0.785398163397448309615660845819875721f -#define M_1_PI_F 0.318309886183790671537767526745028724f -#define M_2_PI_F 0.636619772367581343075535053490057448f -#define M_2_SQRTPI_F 1.12837916709551257389615890312154517f -#define M_SQRT2_F 1.41421356237309504880168872420969808f -#define M_SQRT1_2_F 0.707106781186547524400844362104849039f - -#define DBL_DIG 15 -#define DBL_MANT_DIG 53 -#define DBL_MAX_10_EXP +308 -#define DBL_MAX_EXP +1024 -#define DBL_MIN_10_EXP -307 -#define DBL_MIN_EXP -1021 -#define DBL_RADIX 2 -#define DBL_MAX 0x1.fffffffffffffp1023 -#define DBL_MIN 0x1.0p-1022 -#define DBL_EPSILON 0x1.0p-52 - -#define M_E 0x1.5bf0a8b145769p+1 -#define M_LOG2E 0x1.71547652b82fep+0 -#define M_LOG10E 0x1.bcb7b1526e50ep-2 -#define M_LN2 0x1.62e42fefa39efp-1 -#define M_LN10 0x1.26bb1bbb55516p+1 -#define M_PI 0x1.921fb54442d18p+1 -#define M_PI_2 0x1.921fb54442d18p+0 -#define M_PI_4 0x1.921fb54442d18p-1 -#define M_1_PI 0x1.45f306dc9c883p-2 -#define M_2_PI 0x1.45f306dc9c883p-1 -#define M_2_SQRTPI 0x1.20dd750429b6dp+0 -#define M_SQRT2 0x1.6a09e667f3bcdp+0 -#define M_SQRT1_2 0x1.6a09e667f3bcdp-1 - -#ifdef cl_khr_fp16 - -#define HALF_DIG 3 -#define HALF_MANT_DIG 11 -#define HALF_MAX_10_EXP +4 -#define HALF_MAX_EXP +16 -#define HALF_MIN_10_EXP -4 -#define HALF_MIN_EXP -13 -#define HALF_RADIX 2 -#define HALF_MAX ((0x1.ffcp15h)) -#define HALF_MIN ((0x1.0p-14h)) -#define HALF_EPSILON ((0x1.0p-10h)) - -#define M_E_H 2.71828182845904523536028747135266250h -#define M_LOG2E_H 1.44269504088896340735992468100189214h -#define M_LOG10E_H 0.434294481903251827651128918916605082h -#define M_LN2_H 0.693147180559945309417232121458176568h -#define M_LN10_H 2.30258509299404568401799145468436421h -#define M_PI_H 3.14159265358979323846264338327950288h -#define M_PI_2_H 1.57079632679489661923132169163975144h -#define M_PI_4_H 0.785398163397448309615660845819875721h -#define M_1_PI_H 0.318309886183790671537767526745028724h -#define M_2_PI_H 0.636619772367581343075535053490057448h -#define M_2_SQRTPI_H 1.12837916709551257389615890312154517h -#define M_SQRT2_H 1.41421356237309504880168872420969808h -#define M_SQRT1_2_H 0.707106781186547524400844362104849039h - -#endif //cl_khr_fp16 - -#define CHAR_BIT 8 -#define SCHAR_MAX 127 -#define SCHAR_MIN (-128) -#define UCHAR_MAX 255 -#define CHAR_MAX SCHAR_MAX -#define CHAR_MIN SCHAR_MIN -#define USHRT_MAX 65535 -#define SHRT_MAX 32767 -#define SHRT_MIN (-32768) -#define UINT_MAX 0xffffffff -#define INT_MAX 2147483647 -#define INT_MIN (-2147483647-1) -#define ULONG_MAX 0xffffffffffffffffUL -#define LONG_MAX 0x7fffffffffffffffL -#define LONG_MIN (-0x7fffffffffffffffL-1) // OpenCL v1.1/1.2/2.0 s6.2.3 - Explicit conversions @@ -9598,8 +9347,6 @@ long8 __ovld __cnfn clamp(long8 x, long8 minval, long8 maxval); ulong8 
__ovld __cnfn clamp(ulong8 x, ulong8 minval, ulong8 maxval); long16 __ovld __cnfn clamp(long16 x, long16 minval, long16 maxval); ulong16 __ovld __cnfn clamp(ulong16 x, ulong16 minval, ulong16 maxval); -char __ovld __cnfn clamp(char x, char minval, char maxval); -uchar __ovld __cnfn clamp(uchar x, uchar minval, uchar maxval); char2 __ovld __cnfn clamp(char2 x, char minval, char maxval); uchar2 __ovld __cnfn clamp(uchar2 x, uchar minval, uchar maxval); char3 __ovld __cnfn clamp(char3 x, char minval, char maxval); @@ -9610,8 +9357,6 @@ char8 __ovld __cnfn clamp(char8 x, char minval, char maxval); uchar8 __ovld __cnfn clamp(uchar8 x, uchar minval, uchar maxval); char16 __ovld __cnfn clamp(char16 x, char minval, char maxval); uchar16 __ovld __cnfn clamp(uchar16 x, uchar minval, uchar maxval); -short __ovld __cnfn clamp(short x, short minval, short maxval); -ushort __ovld __cnfn clamp(ushort x, ushort minval, ushort maxval); short2 __ovld __cnfn clamp(short2 x, short minval, short maxval); ushort2 __ovld __cnfn clamp(ushort2 x, ushort minval, ushort maxval); short3 __ovld __cnfn clamp(short3 x, short minval, short maxval); @@ -9622,8 +9367,6 @@ short8 __ovld __cnfn clamp(short8 x, short minval, short maxval); ushort8 __ovld __cnfn clamp(ushort8 x, ushort minval, ushort maxval); short16 __ovld __cnfn clamp(short16 x, short minval, short maxval); ushort16 __ovld __cnfn clamp(ushort16 x, ushort minval, ushort maxval); -int __ovld __cnfn clamp(int x, int minval, int maxval); -uint __ovld __cnfn clamp(uint x, uint minval, uint maxval); int2 __ovld __cnfn clamp(int2 x, int minval, int maxval); uint2 __ovld __cnfn clamp(uint2 x, uint minval, uint maxval); int3 __ovld __cnfn clamp(int3 x, int minval, int maxval); @@ -9634,8 +9377,6 @@ int8 __ovld __cnfn clamp(int8 x, int minval, int maxval); uint8 __ovld __cnfn clamp(uint8 x, uint minval, uint maxval); int16 __ovld __cnfn clamp(int16 x, int minval, int maxval); uint16 __ovld __cnfn clamp(uint16 x, uint minval, uint maxval); -long __ovld __cnfn clamp(long x, long minval, long maxval); -ulong __ovld __cnfn clamp(ulong x, ulong minval, ulong maxval); long2 __ovld __cnfn clamp(long2 x, long minval, long maxval); ulong2 __ovld __cnfn clamp(ulong2 x, ulong minval, ulong maxval); long3 __ovld __cnfn clamp(long3 x, long minval, long maxval); @@ -9911,8 +9652,6 @@ long8 __ovld __cnfn max(long8 x, long8 y); ulong8 __ovld __cnfn max(ulong8 x, ulong8 y); long16 __ovld __cnfn max(long16 x, long16 y); ulong16 __ovld __cnfn max(ulong16 x, ulong16 y); -char __ovld __cnfn max(char x, char y); -uchar __ovld __cnfn max(uchar x, uchar y); char2 __ovld __cnfn max(char2 x, char y); uchar2 __ovld __cnfn max(uchar2 x, uchar y); char3 __ovld __cnfn max(char3 x, char y); @@ -9923,8 +9662,6 @@ char8 __ovld __cnfn max(char8 x, char y); uchar8 __ovld __cnfn max(uchar8 x, uchar y); char16 __ovld __cnfn max(char16 x, char y); uchar16 __ovld __cnfn max(uchar16 x, uchar y); -short __ovld __cnfn max(short x, short y); -ushort __ovld __cnfn max(ushort x, ushort y); short2 __ovld __cnfn max(short2 x, short y); ushort2 __ovld __cnfn max(ushort2 x, ushort y); short3 __ovld __cnfn max(short3 x, short y); @@ -9935,8 +9672,6 @@ short8 __ovld __cnfn max(short8 x, short y); ushort8 __ovld __cnfn max(ushort8 x, ushort y); short16 __ovld __cnfn max(short16 x, short y); ushort16 __ovld __cnfn max(ushort16 x, ushort y); -int __ovld __cnfn max(int x, int y); -uint __ovld __cnfn max(uint x, uint y); int2 __ovld __cnfn max(int2 x, int y); uint2 __ovld __cnfn max(uint2 x, uint y); int3 __ovld __cnfn 
max(int3 x, int y); @@ -9947,8 +9682,6 @@ int8 __ovld __cnfn max(int8 x, int y); uint8 __ovld __cnfn max(uint8 x, uint y); int16 __ovld __cnfn max(int16 x, int y); uint16 __ovld __cnfn max(uint16 x, uint y); -long __ovld __cnfn max(long x, long y); -ulong __ovld __cnfn max(ulong x, ulong y); long2 __ovld __cnfn max(long2 x, long y); ulong2 __ovld __cnfn max(ulong2 x, ulong y); long3 __ovld __cnfn max(long3 x, long y); @@ -10011,8 +9744,6 @@ long8 __ovld __cnfn min(long8 x, long8 y); ulong8 __ovld __cnfn min(ulong8 x, ulong8 y); long16 __ovld __cnfn min(long16 x, long16 y); ulong16 __ovld __cnfn min(ulong16 x, ulong16 y); -char __ovld __cnfn min(char x, char y); -uchar __ovld __cnfn min(uchar x, uchar y); char2 __ovld __cnfn min(char2 x, char y); uchar2 __ovld __cnfn min(uchar2 x, uchar y); char3 __ovld __cnfn min(char3 x, char y); @@ -10023,8 +9754,6 @@ char8 __ovld __cnfn min(char8 x, char y); uchar8 __ovld __cnfn min(uchar8 x, uchar y); char16 __ovld __cnfn min(char16 x, char y); uchar16 __ovld __cnfn min(uchar16 x, uchar y); -short __ovld __cnfn min(short x, short y); -ushort __ovld __cnfn min(ushort x, ushort y); short2 __ovld __cnfn min(short2 x, short y); ushort2 __ovld __cnfn min(ushort2 x, ushort y); short3 __ovld __cnfn min(short3 x, short y); @@ -10035,8 +9764,6 @@ short8 __ovld __cnfn min(short8 x, short y); ushort8 __ovld __cnfn min(ushort8 x, ushort y); short16 __ovld __cnfn min(short16 x, short y); ushort16 __ovld __cnfn min(ushort16 x, ushort y); -int __ovld __cnfn min(int x, int y); -uint __ovld __cnfn min(uint x, uint y); int2 __ovld __cnfn min(int2 x, int y); uint2 __ovld __cnfn min(uint2 x, uint y); int3 __ovld __cnfn min(int3 x, int y); @@ -10047,8 +9774,6 @@ int8 __ovld __cnfn min(int8 x, int y); uint8 __ovld __cnfn min(uint8 x, uint y); int16 __ovld __cnfn min(int16 x, int y); uint16 __ovld __cnfn min(uint16 x, uint y); -long __ovld __cnfn min(long x, long y); -ulong __ovld __cnfn min(ulong x, ulong y); long2 __ovld __cnfn min(long2 x, long y); ulong2 __ovld __cnfn min(ulong2 x, ulong y); long3 __ovld __cnfn min(long3 x, long y); @@ -10627,7 +10352,6 @@ half3 __ovld __cnfn step(half3 edge, half3 x); half4 __ovld __cnfn step(half4 edge, half4 x); half8 __ovld __cnfn step(half8 edge, half8 x); half16 __ovld __cnfn step(half16 edge, half16 x); -half __ovld __cnfn step(half edge, half x); half2 __ovld __cnfn step(half edge, half2 x); half3 __ovld __cnfn step(half edge, half3 x); half4 __ovld __cnfn step(half edge, half4 x); @@ -10679,7 +10403,6 @@ half3 __ovld __cnfn smoothstep(half3 edge0, half3 edge1, half3 x); half4 __ovld __cnfn smoothstep(half4 edge0, half4 edge1, half4 x); half8 __ovld __cnfn smoothstep(half8 edge0, half8 edge1, half8 x); half16 __ovld __cnfn smoothstep(half16 edge0, half16 edge1, half16 x); -half __ovld __cnfn smoothstep(half edge0, half edge1, half x); half2 __ovld __cnfn smoothstep(half edge0, half edge1, half2 x); half3 __ovld __cnfn smoothstep(half edge0, half edge1, half3 x); half4 __ovld __cnfn smoothstep(half edge0, half edge1, half4 x); @@ -12777,30 +12500,6 @@ void __ovld vstorea_half16_rtn(double16 data,size_t offset, __private half *p); // OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions -// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence -typedef uint cl_mem_fence_flags; - -/** - * Queue a memory fence to ensure correct - * ordering of memory operations to local memory - */ -#define CLK_LOCAL_MEM_FENCE 0x01 - -/** - * Queue a memory fence to ensure correct - * ordering of memory 
operations to global memory - */ -#define CLK_GLOBAL_MEM_FENCE 0x02 - -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -/** - * Queue a memory fence to ensure correct ordering of memory - * operations between work-items of a work-group to - * image memory. - */ -#define CLK_IMAGE_MEM_FENCE 0x04 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 - /** * All work-items in a work-group executing the kernel * on a processor must execute this function before any @@ -12834,17 +12533,6 @@ typedef uint cl_mem_fence_flags; void __ovld __conv barrier(cl_mem_fence_flags flags); #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 - -typedef enum memory_scope { - memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, - memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, - memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, - memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, -#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) - memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP -#endif -} memory_scope; - void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope); void __ovld __conv work_group_barrier(cl_mem_fence_flags flags); #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -13341,6 +13029,10 @@ int __ovld atomic_add(volatile __global int *p, int val); unsigned int __ovld atomic_add(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_add(volatile __local int *p, int val); unsigned int __ovld atomic_add(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_add(volatile int *p, int val); +unsigned int __ovld atomic_add(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_add(volatile __global int *p, int val); @@ -13367,6 +13059,10 @@ int __ovld atomic_sub(volatile __global int *p, int val); unsigned int __ovld atomic_sub(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_sub(volatile __local int *p, int val); unsigned int __ovld atomic_sub(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_sub(volatile int *p, int val); +unsigned int __ovld atomic_sub(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_sub(volatile __global int *p, int val); @@ -13395,6 +13091,11 @@ int __ovld atomic_xchg(volatile __local int *p, int val); unsigned int __ovld atomic_xchg(volatile __local unsigned int *p, unsigned int val); float __ovld atomic_xchg(volatile __global float *p, float val); float __ovld atomic_xchg(volatile __local float *p, float val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_xchg(volatile int *p, int val); +unsigned int __ovld atomic_xchg(volatile unsigned int *p, unsigned int val); +float __ovld atomic_xchg(volatile float *p, float val); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_xchg(volatile __global int *p, int val); @@ -13422,6 +13123,10 @@ int __ovld atomic_inc(volatile __global int *p); unsigned int __ovld atomic_inc(volatile __global unsigned int *p); int __ovld atomic_inc(volatile __local int *p); unsigned int __ovld atomic_inc(volatile __local unsigned int *p); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_inc(volatile int *p); +unsigned int __ovld atomic_inc(volatile unsigned int *p); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_inc(volatile __global int *p); @@ -13449,6 +13154,10 @@ int __ovld 
atomic_dec(volatile __global int *p); unsigned int __ovld atomic_dec(volatile __global unsigned int *p); int __ovld atomic_dec(volatile __local int *p); unsigned int __ovld atomic_dec(volatile __local unsigned int *p); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_dec(volatile int *p); +unsigned int __ovld atomic_dec(volatile unsigned int *p); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_dec(volatile __global int *p); @@ -13477,6 +13186,10 @@ int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val); unsigned int __ovld atomic_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val); int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val); unsigned int __ovld atomic_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_cmpxchg(volatile int *p, int cmp, int val); +unsigned int __ovld atomic_cmpxchg(volatile unsigned int *p, unsigned int cmp, unsigned int val); +#endif #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val); @@ -13505,6 +13218,10 @@ int __ovld atomic_min(volatile __global int *p, int val); unsigned int __ovld atomic_min(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_min(volatile __local int *p, int val); unsigned int __ovld atomic_min(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_min(volatile int *p, int val); +unsigned int __ovld atomic_min(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_min(volatile __global int *p, int val); @@ -13533,6 +13250,10 @@ int __ovld atomic_max(volatile __global int *p, int val); unsigned int __ovld atomic_max(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_max(volatile __local int *p, int val); unsigned int __ovld atomic_max(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_max(volatile int *p, int val); +unsigned int __ovld atomic_max(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_max(volatile __global int *p, int val); @@ -13560,6 +13281,10 @@ int __ovld atomic_and(volatile __global int *p, int val); unsigned int __ovld atomic_and(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_and(volatile __local int *p, int val); unsigned int __ovld atomic_and(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_and(volatile int *p, int val); +unsigned int __ovld atomic_and(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_and(volatile __global int *p, int val); @@ -13587,6 +13312,10 @@ int __ovld atomic_or(volatile __global int *p, int val); unsigned int __ovld atomic_or(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_or(volatile __local int *p, int val); unsigned int __ovld atomic_or(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_or(volatile int *p, int val); +unsigned int __ovld atomic_or(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_or(volatile __global int *p, int val); @@ -13614,6 +13343,10 @@ int __ovld atomic_xor(volatile __global int *p, int val); unsigned 
int __ovld atomic_xor(volatile __global unsigned int *p, unsigned int val); int __ovld atomic_xor(volatile __local int *p, int val); unsigned int __ovld atomic_xor(volatile __local unsigned int *p, unsigned int val); +#ifdef __OPENCL_CPP_VERSION__ +int __ovld atomic_xor(volatile int *p, int val); +unsigned int __ovld atomic_xor(volatile unsigned int *p, unsigned int val); +#endif #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_xor(volatile __global int *p, int val); @@ -13639,20 +13372,6 @@ unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long v // OpenCL v2.0 s6.13.11 - Atomics Functions #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -#ifndef ATOMIC_VAR_INIT -#define ATOMIC_VAR_INIT(x) (x) -#endif //ATOMIC_VAR_INIT -#define ATOMIC_FLAG_INIT 0 - -// enum values aligned with what clang uses in EmitAtomicExpr() -typedef enum memory_order -{ - memory_order_relaxed = __ATOMIC_RELAXED, - memory_order_acquire = __ATOMIC_ACQUIRE, - memory_order_release = __ATOMIC_RELEASE, - memory_order_acq_rel = __ATOMIC_ACQ_REL, - memory_order_seq_cst = __ATOMIC_SEQ_CST -} memory_order; // double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) @@ -14470,33 +14189,11 @@ half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask); #if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 // OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf -int printf(__constant const char* st, ...); +int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); #endif // OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions -// These values need to match the runtime equivalent -// -// Addressing Mode. -// -#define CLK_ADDRESS_NONE 0 -#define CLK_ADDRESS_CLAMP_TO_EDGE 2 -#define CLK_ADDRESS_CLAMP 4 -#define CLK_ADDRESS_REPEAT 6 -#define CLK_ADDRESS_MIRRORED_REPEAT 8 - -// -// Coordination Normalization -// -#define CLK_NORMALIZED_COORDS_FALSE 0 -#define CLK_NORMALIZED_COORDS_TRUE 1 - -// -// Filtering Mode. 
-// -#define CLK_FILTER_NEAREST 0x10 -#define CLK_FILTER_LINEAR 0x20 - #ifdef cl_khr_gl_msaa_sharing #pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable #endif //cl_khr_gl_msaa_sharing @@ -14712,30 +14409,6 @@ float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY); uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY); -float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod); -int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float lod); -uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float lod); - -float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); -int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); -uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); - -float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float lod); -int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float lod); -uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float lod); - -float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float lod); - -float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); -int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); -uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); - -float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod); - -float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float lod); -int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float lod); -uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod); - #endif //cl_khr_mipmap_image #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -14895,29 +14568,6 @@ float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY); uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY); -float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod); -int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod); -uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float lod); - -float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); -int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); -uint4 __purefn __ovld read_imageui(read_write 
image1d_array_t image_array, sampler_t sampler, float2 coord, float lod); - -float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float lod); -int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float lod); -uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float lod); - -float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float lod); - -float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); -int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); -uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod); - -float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod); - -float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float lod); -int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod); -uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod); #endif //cl_khr_mipmap_image #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -15332,26 +14982,6 @@ int __ovld get_image_num_mip_levels(read_write image2d_depth_t image); * CLK_FLOAT */ -// -// Channel Datatype. -// -#define CLK_SNORM_INT8 0x10D0 -#define CLK_SNORM_INT16 0x10D1 -#define CLK_UNORM_INT8 0x10D2 -#define CLK_UNORM_INT16 0x10D3 -#define CLK_UNORM_SHORT_565 0x10D4 -#define CLK_UNORM_SHORT_555 0x10D5 -#define CLK_UNORM_INT_101010 0x10D6 -#define CLK_SIGNED_INT8 0x10D7 -#define CLK_SIGNED_INT16 0x10D8 -#define CLK_SIGNED_INT32 0x10D9 -#define CLK_UNSIGNED_INT8 0x10DA -#define CLK_UNSIGNED_INT16 0x10DB -#define CLK_UNSIGNED_INT32 0x10DC -#define CLK_HALF_FLOAT 0x10DD -#define CLK_FLOAT 0x10DE -#define CLK_UNORM_INT24 0x10DF - int __ovld __cnfn get_image_channel_data_type(read_only image1d_t image); int __ovld __cnfn get_image_channel_data_type(read_only image1d_buffer_t image); int __ovld __cnfn get_image_channel_data_type(read_only image2d_t image); @@ -15423,30 +15053,6 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_dept * CLK_INTENSITY * CLK_LUMINANCE */ -// Channel order, numbering must be aligned with cl_channel_order in cl.h -// -#define CLK_R 0x10B0 -#define CLK_A 0x10B1 -#define CLK_RG 0x10B2 -#define CLK_RA 0x10B3 -#define CLK_RGB 0x10B4 -#define CLK_RGBA 0x10B5 -#define CLK_BGRA 0x10B6 -#define CLK_ARGB 0x10B7 -#define CLK_INTENSITY 0x10B8 -#define CLK_LUMINANCE 0x10B9 -#define CLK_Rx 0x10BA -#define CLK_RGx 0x10BB -#define CLK_RGBx 0x10BC -#define CLK_DEPTH 0x10BD -#define CLK_DEPTH_STENCIL 0x10BE -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -#define CLK_sRGB 0x10BF -#define CLK_sRGBx 0x10C0 -#define CLK_sRGBA 0x10C1 -#define CLK_sBGRA 0x10C2 -#define CLK_ABGR 0x10C3 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 int __ovld __cnfn get_image_channel_order(read_only image1d_t image); int __ovld __cnfn get_image_channel_order(read_only image1d_buffer_t image); @@ -15605,20 +15211,17 @@ size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_depth_t #if defined(cl_khr_gl_msaa_sharing) int __ovld get_image_num_samples(read_only image2d_msaa_t image); int __ovld get_image_num_samples(read_only image2d_msaa_depth_t image); 
-int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image); int __ovld get_image_num_samples(read_only image2d_array_msaa_t image); int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image); int __ovld get_image_num_samples(write_only image2d_msaa_t image); int __ovld get_image_num_samples(write_only image2d_msaa_depth_t image); -int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image); int __ovld get_image_num_samples(write_only image2d_array_msaa_t image); int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image); #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 int __ovld get_image_num_samples(read_write image2d_msaa_t image); int __ovld get_image_num_samples(read_write image2d_msaa_depth_t image); -int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image); int __ovld get_image_num_samples(read_write image2d_array_msaa_t image); int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image); #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -15728,7 +15331,6 @@ double __ovld __conv work_group_scan_inclusive_max(double x); // OpenCL v2.0 s6.13.16 - Pipe Functions #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t)) bool __ovld is_valid_reserve_id(reserve_id_t reserve_id); #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 @@ -15736,44 +15338,6 @@ bool __ovld is_valid_reserve_id(reserve_id_t reserve_id); // OpenCL v2.0 s6.13.17 - Enqueue Kernels #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -#define CL_COMPLETE 0x0 -#define CL_RUNNING 0x1 -#define CL_SUBMITTED 0x2 -#define CL_QUEUED 0x3 - -#define CLK_SUCCESS 0 -#define CLK_ENQUEUE_FAILURE -101 -#define CLK_INVALID_QUEUE -102 -#define CLK_INVALID_NDRANGE -160 -#define CLK_INVALID_EVENT_WAIT_LIST -57 -#define CLK_DEVICE_QUEUE_FULL -161 -#define CLK_INVALID_ARG_SIZE -51 -#define CLK_EVENT_ALLOCATION_FAILURE -100 -#define CLK_OUT_OF_RESOURCES -5 - -#define CLK_NULL_QUEUE 0 -#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t)) - -// execution model related definitions -#define CLK_ENQUEUE_FLAGS_NO_WAIT 0x0 -#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0x1 -#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 0x2 - -typedef int kernel_enqueue_flags_t; -typedef int clk_profiling_info; - -// Profiling info name (see capture_event_profiling_info) -#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1 - -#define MAX_WORK_DIM 3 - -typedef struct { - unsigned int workDimension; - size_t globalWorkOffset[MAX_WORK_DIM]; - size_t globalWorkSize[MAX_WORK_DIM]; - size_t localWorkSize[MAX_WORK_DIM]; -} ndrange_t; - ndrange_t __ovld ndrange_1D(size_t); ndrange_t __ovld ndrange_1D(size_t, size_t); ndrange_t __ovld ndrange_1D(size_t, size_t, size_t); @@ -16216,138 +15780,6 @@ void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, u #ifdef cl_intel_device_side_avc_motion_estimation #pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin -#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0 -#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1 -#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2 -#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3 - -#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0 -#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1 -#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2 -#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3 - -#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0 -#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 -#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 - -#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 
-#define CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E -#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D -#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B -#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 -#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F -#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F -#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F - -#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 -#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 -#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 - -#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 -#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 -#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 -#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 -#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 -#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 -#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 -#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 -#define CLK_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 - -#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 -#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 - -#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 -#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 -#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 - -#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 -#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 -#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 -#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 - -#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 -#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 -#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 -#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B -#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 - -#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 -#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 -#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 -#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 - -#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0 -#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1 -#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2 - -#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 -#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 - -#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2 << 24) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1 << 26) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2 << 26) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1 << 28) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2 << 28) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1 << 30) -#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2 << 30) - -#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 -#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 - -#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0 -#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 -#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 -#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 - -#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 -#define 
CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 -#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 -#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 - -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 -#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 -#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 -#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 -#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 - -#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1 -#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2 -#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3 - -#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 -#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 - -#define CLK_AVC_ME_INITIALIZE_INTEL 0x0 - -#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL 0x0 -#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL 0x0 -#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL 0x0 - -#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL 0x0 -#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL 0x0 -#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL 0x0 - -#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 -#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 -#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 -#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 - // MCE built-in functions uchar __ovld intel_sub_group_avc_mce_get_default_inter_base_multi_reference_penalty( @@ -17034,6 +16466,34 @@ uint8 __ovld amd_sadw(uint8 src0, uint8 src1, uint8 src2); uint16 __ovld amd_sadw(uint16 src0, uint16 src1, uint16 src2); #endif // cl_amd_media_ops2 +#if defined(cl_arm_integer_dot_product_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : begin +uint __ovld arm_dot(uchar4 a, uchar4 b); +int __ovld arm_dot(char4 a, char4 b); +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : end +#endif // defined(cl_arm_integer_dot_product_int8) + +#if defined(cl_arm_integer_dot_product_accumulate_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : begin +uint __ovld arm_dot_acc(uchar4 a, uchar4 b, uint c); +int __ovld arm_dot_acc(char4 a, char4 b, int c); +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : end +#endif // defined(cl_arm_integer_dot_product_accumulate_int8) + +#if defined(cl_arm_integer_dot_product_accumulate_int16) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int16 : begin +uint __ovld arm_dot_acc(ushort2 a, ushort2 b, uint c); +int __ovld arm_dot_acc(short2 a, short2 b, int c); +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int16 : end +#endif // defined(cl_arm_integer_dot_product_accumulate_int16) + +#if defined(cl_arm_integer_dot_product_accumulate_saturate_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_saturate_int8 : begin +uint __ovld arm_dot_acc_sat(uchar4 a, uchar4 b, uint c); +int __ovld 
arm_dot_acc_sat(char4 a, char4 b, int c); +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_saturate_int8 : end +#endif // defined(cl_arm_integer_dot_product_accumulate_saturate_int8) + // Disable any extensions we may have enabled previously. #pragma OPENCL EXTENSION all : disable diff --git a/lib/include/openmp_wrappers/__clang_openmp_math.h b/lib/include/openmp_wrappers/__clang_openmp_math.h new file mode 100644 index 000000000..5d7ce9a96 --- /dev/null +++ b/lib/include/openmp_wrappers/__clang_openmp_math.h @@ -0,0 +1,35 @@ +/*===---- __clang_openmp_math.h - OpenMP target math support ---------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if defined(__NVPTX__) && defined(_OPENMP) +/// TODO: +/// We are currently reusing the functionality of the Clang-CUDA code path +/// as an alternative to the host declarations provided by math.h and cmath. +/// This is suboptimal. +/// +/// We should instead declare the device functions in a similar way, e.g., +/// through OpenMP 5.0 variants, and afterwards populate the module with the +/// host declarations by unconditionally including the host math.h or cmath, +/// respectively. This is actually what the Clang-CUDA code path does, using +/// __device__ instead of variants to avoid redeclarations and get the desired +/// overload resolution. + +#define __CUDA__ + +#if defined(__cplusplus) + #include <__clang_cuda_cmath.h> +#endif + +#undef __CUDA__ + +/// Magic macro for stopping the math.h/cmath host header from being included. +#define __CLANG_NO_HOST_MATH__ + +#endif + diff --git a/lib/include/openmp_wrappers/__clang_openmp_math_declares.h b/lib/include/openmp_wrappers/__clang_openmp_math_declares.h new file mode 100644 index 000000000..a422c98bf --- /dev/null +++ b/lib/include/openmp_wrappers/__clang_openmp_math_declares.h @@ -0,0 +1,33 @@ +/*===---- __clang_openmp_math_declares.h - OpenMP math declares ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_OPENMP_MATH_DECLARES_H__ +#define __CLANG_OPENMP_MATH_DECLARES_H__ + +#ifndef _OPENMP +#error "This file is for OpenMP compilation only." +#endif + +#if defined(__NVPTX__) && defined(_OPENMP) + +#define __CUDA__ + +#if defined(__cplusplus) + #include <__clang_cuda_math_forward_declares.h> +#endif + +/// Include declarations for libdevice functions. +#include <__clang_cuda_libdevice_declares.h> +/// Provide definitions for these functions. +#include <__clang_cuda_device_functions.h> + +#undef __CUDA__ + +#endif +#endif diff --git a/lib/include/openmp_wrappers/cmath b/lib/include/openmp_wrappers/cmath new file mode 100644 index 000000000..a5183a1d8 --- /dev/null +++ b/lib/include/openmp_wrappers/cmath @@ -0,0 +1,16 @@ +/*===-------------- cmath - Alternative cmath header -----------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include <__clang_openmp_math.h> + +#ifndef __CLANG_NO_HOST_MATH__ +#include_next <cmath> +#else +#undef __CLANG_NO_HOST_MATH__ +#endif diff --git a/lib/include/openmp_wrappers/math.h b/lib/include/openmp_wrappers/math.h new file mode 100644 index 000000000..d2786ecb2 --- /dev/null +++ b/lib/include/openmp_wrappers/math.h @@ -0,0 +1,17 @@ +/*===------------- math.h - Alternative math.h header ----------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#include <__clang_openmp_math.h> + +#ifndef __CLANG_NO_HOST_MATH__ +#include_next <math.h> +#else +#undef __CLANG_NO_HOST_MATH__ +#endif + diff --git a/lib/include/pconfigintrin.h b/lib/include/pconfigintrin.h index fee3cad38..d2014b026 100644 --- a/lib/include/pconfigintrin.h +++ b/lib/include/pconfigintrin.h @@ -1,22 +1,8 @@ /*===---- pconfigintrin.h - X86 platform configuration ---------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -30,6 +16,8 @@ #define __PCONFIG_KEY_PROGRAM 0x00000001 +#if __has_extension(gnu_asm) + /* Define the default attributes for the functions in this file.
*/ #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("pconfig"))) @@ -47,4 +35,6 @@ _pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) #undef __DEFAULT_FN_ATTRS +#endif /* __has_extension(gnu_asm) */ + #endif diff --git a/lib/include/pkuintrin.h b/lib/include/pkuintrin.h index 6976924d8..c62080bec 100644 --- a/lib/include/pkuintrin.h +++ b/lib/include/pkuintrin.h @@ -1,23 +1,9 @@ /*===---- pkuintrin.h - PKU intrinsics -------------------------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/pmmintrin.h b/lib/include/pmmintrin.h index 7e1a9eae5..c376f298c 100644 --- a/lib/include/pmmintrin.h +++ b/lib/include/pmmintrin.h @@ -1,22 +1,8 @@ /*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/popcntintrin.h b/lib/include/popcntintrin.h index 75ceab9e1..312901014 100644 --- a/lib/include/popcntintrin.h +++ b/lib/include/popcntintrin.h @@ -1,22 +1,8 @@ /*===---- popcntintrin.h - POPCNT intrinsics -------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -43,22 +29,6 @@ _mm_popcnt_u32(unsigned int __A) return __builtin_popcount(__A); } -/// Counts the number of bits in the source operand having a value of 1. -/// -/// \headerfile <x86intrin.h> -/// -/// This intrinsic corresponds to the POPCNT instruction. -/// -/// \param __A -/// A signed 32-bit integer operand. -/// \returns A 32-bit integer containing the number of bits with value 1 in the -/// source operand. -static __inline__ int __DEFAULT_FN_ATTRS -_popcnt32(int __A) -{ - return __builtin_popcount(__A); -} - #ifdef __x86_64__ /// Counts the number of bits in the source operand having a value of 1. /// @@ -75,22 +45,6 @@ _mm_popcnt_u64(unsigned long long __A) { return __builtin_popcountll(__A); } - -/// Counts the number of bits in the source operand having a value of 1. -/// -/// \headerfile <x86intrin.h> -/// -/// This intrinsic corresponds to the POPCNT instruction. -/// -/// \param __A -/// A signed 64-bit integer operand. -/// \returns A 64-bit integer containing the number of bits with value 1 in the -/// source operand. -static __inline__ long long __DEFAULT_FN_ATTRS -_popcnt64(long long __A) -{ - return __builtin_popcountll(__A); -} #endif /* __x86_64__ */ #undef __DEFAULT_FN_ATTRS diff --git a/lib/include/ppc_wrappers/emmintrin.h b/lib/include/ppc_wrappers/emmintrin.h new file mode 100644 index 000000000..617ce24ac --- /dev/null +++ b/lib/include/ppc_wrappers/emmintrin.h @@ -0,0 +1,2318 @@ +/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header file is to help porting code using Intel intrinsics + explicitly from x86_64 to powerpc64/powerpc64le. + + Since X86 SSE2 intrinsics mainly handles __m128i and __m128d type, + PowerPC VMX/VSX ISA is a good match for vector float SIMD operations. + However scalar float operations in vector (XMM) registers require + the POWER8 VSX ISA (2.07) level. There are differences for data + format and placement of float scalars in the vector register, which + require extra steps to match SSE2 scalar float semantics on POWER. + + It should be noted that there's much difference between X86_64's + MXSCR and PowerISA's FPSCR/VSCR registers. It's recommended to use + portable <fenv.h> instead of access MXSCR directly. + + Most SSE2 scalar float intrinsic operations can be performed more + efficiently as C language float scalar operations or optimized to + use vector SIMD operations. We recommend this for new applications. +*/ +#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." +#endif + +#ifndef EMMINTRIN_H_ +#define EMMINTRIN_H_ + +#include <altivec.h> + +/* We need definitions from the SSE header files. */ +#include <xmmintrin.h> + +/* SSE2 */ +typedef __vector double __v2df; +typedef __vector long long __v2di; +typedef __vector unsigned long long __v2du; +typedef __vector int __v4si; +typedef __vector unsigned int __v4su; +typedef __vector short __v8hi; +typedef __vector unsigned short __v8hu; +typedef __vector signed char __v16qi; +typedef __vector unsigned char __v16qu; + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); + +/* Unaligned version of the same types. */ +typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); +typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); + +/* Define two value permute mask. */ +#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y)) + +/* Create a vector with element 0 as F and the rest zero. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_sd (double __F) +{ + return __extension__ (__m128d){ __F, 0.0 }; +} + +/* Create a vector with both elements equal to F. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pd (double __F) +{ + return __extension__ (__m128d){ __F, __F }; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pd1 (double __F) +{ + return _mm_set1_pd (__F); +} + +/* Create a vector with the lower value X and upper value W. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pd (double __W, double __X) +{ + return __extension__ (__m128d){ __X, __W }; +} + +/* Create a vector with the lower value W and upper value X.
*/ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pd (double __W, double __X) +{ + return __extension__ (__m128d){ __W, __X }; +} + +/* Create an undefined vector. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_pd (void) +{ + __m128d __Y = __Y; + return __Y; +} + +/* Create a vector of zeros. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_pd (void) +{ + return (__m128d) vec_splats (0); +} + +/* Sets the low DPFP value of A from the low value of B. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_sd (__m128d __A, __m128d __B) +{ + __v2df result = (__v2df) __A; + result [0] = ((__v2df) __B)[0]; + return (__m128d) result; +} + +/* Load two DPFP values from P. The address must be 16-byte aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_pd (double const *__P) +{ + return ((__m128d)vec_ld(0, (__v16qu*)__P)); +} + +/* Load two DPFP values from P. The address need not be 16-byte aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_pd (double const *__P) +{ + return (vec_vsx_ld(0, __P)); +} + +/* Create a vector with all two elements equal to *P. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load1_pd (double const *__P) +{ + return (vec_splats (*__P)); +} + +/* Create a vector with element 0 as *P and the rest zero. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_sd (double const *__P) +{ + return _mm_set_sd (*__P); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_pd1 (double const *__P) +{ + return _mm_load1_pd (__P); +} + +/* Load two DPFP values in reverse order. The address must be aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadr_pd (double const *__P) +{ + __v2df __tmp = _mm_load_pd (__P); + return (__m128d)vec_xxpermdi (__tmp, __tmp, 2); +} + +/* Store two DPFP values. The address must be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_pd (double *__P, __m128d __A) +{ + vec_st((__v16qu)__A, 0, (__v16qu*)__P); +} + +/* Store two DPFP values. The address need not be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_pd (double *__P, __m128d __A) +{ + *(__m128d_u *)__P = __A; +} + +/* Stores the lower DPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_sd (double *__P, __m128d __A) +{ + *__P = ((__v2df)__A)[0]; +} + +extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_f64 (__m128d __A) +{ + return ((__v2df)__A)[0]; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_pd (double *__P, __m128d __A) +{ + _mm_store_sd (__P, __A); +} + +/* Stores the upper DPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeh_pd (double *__P, __m128d __A) +{ + *__P = ((__v2df)__A)[1]; +} +/* Store the lower DPFP value across two words. 
+ The address must be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store1_pd (double *__P, __m128d __A) +{ + _mm_store_pd (__P, vec_splat (__A, 0)); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_pd1 (double *__P, __m128d __A) +{ + _mm_store1_pd (__P, __A); +} + +/* Store two DPFP values in reverse order. The address must be aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storer_pd (double *__P, __m128d __A) +{ + _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2)); +} + +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si64 (__m128i __A) +{ + return ((__v2di)__A)[0]; +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si64x (__m128i __A) +{ + return ((__v2di)__A)[0]; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A + (__v2df)__B); +} + +/* Add the lower double-precision (64-bit) floating-point element in + a and b, store the result in the lower element of dst, and copy + the upper element from a to the upper element of dst. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_sd (__m128d __A, __m128d __B) +{ + __A[0] = __A[0] + __B[0]; + return (__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A - (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_sd (__m128d __A, __m128d __B) +{ + __A[0] = __A[0] - __B[0]; + return (__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A * (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_sd (__m128d __A, __m128d __B) +{ + __A[0] = __A[0] * __B[0]; + return (__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A / (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_sd (__m128d __A, __m128d __B) +{ + __A[0] = __A[0] / __B[0]; + return (__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_pd (__m128d __A) +{ + return (vec_sqrt (__A)); +} + +/* Return pair {sqrt (B[0]), A[1]}. 
*/ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_sd (__m128d __A, __m128d __B) +{ + __v2df c; + c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0])); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pd (__m128d __A, __m128d __B) +{ + return (vec_min (__A, __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = vec_min (a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_pd (__m128d __A, __m128d __B) +{ + return (vec_max (__A, __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = vec_max (a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_pd (__m128d __A, __m128d __B) +{ + __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B); + return ((__m128d)vec_nor (temp, temp)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_pd (__m128d __A, __m128d __B) +{ + return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_pd (__m128d __A, __m128d __B) +{ +#if _ARCH_PWR8 + __v2du c, d; + /* Compare against self will return false (0's) if NAN. 
*/ + c = (__v2du)vec_cmpeq (__A, __A); + d = (__v2du)vec_cmpeq (__B, __B); +#else + __v2du a, b; + __v2du c, d; + const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000}; + a = (__v2du)vec_abs ((__v2df)__A); + b = (__v2du)vec_abs ((__v2df)__B); + c = (__v2du)vec_cmpgt (double_exp_mask, a); + d = (__v2du)vec_cmpgt (double_exp_mask, b); +#endif + /* A != NAN and B != NAN. */ + return ((__m128d)vec_and(c, d)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_pd (__m128d __A, __m128d __B) +{ +#if _ARCH_PWR8 + __v2du c, d; + /* Compare against self will return false (0's) if NAN. */ + c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); + d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); + /* A == NAN OR B == NAN converts too: + NOT(A != NAN) OR NOT(B != NAN). */ + c = vec_nor (c, c); + return ((__m128d)vec_orc(c, d)); +#else + __v2du c, d; + /* Compare against self will return false (0's) if NAN. */ + c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A); + d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B); + /* Convert the true ('1's) is NAN. */ + c = vec_nor (c, c); + d = vec_nor (d, d); + return ((__m128d)vec_or(c, d)); +#endif +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_sd(__m128d __A, __m128d __B) +{ + __v2df a, b, c; + /* PowerISA VSX does not allow partial (for just lower double) + results. So to insure we don't generate spurious exceptions + (from the upper double values) we splat the lower double + before we do the operation. */ + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmpeq(a, b); + /* Then we merge the lower double result with the original upper + double from __A. */ + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmplt(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmple(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmpgt(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmpge(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + c = (__v2df) vec_cmpeq(a, b); + c = vec_nor (c, c); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + /* Not less than is just greater than or 
equal. */ + c = (__v2df) vec_cmpge(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + /* Not less than or equal is just greater than. */ + c = (__v2df) vec_cmpge(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + /* Not greater than is just less than or equal. */ + c = (__v2df) vec_cmple(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_sd (__m128d __A, __m128d __B) +{ + __v2df a, b, c; + a = vec_splats (__A[0]); + b = vec_splats (__B[0]); + /* Not greater than or equal is just less than. */ + c = (__v2df) vec_cmplt(a, b); + return (__m128d) _mm_setr_pd (c[0], __A[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_sd (__m128d __A, __m128d __B) +{ + __v2df r; + r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0])); + return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_sd (__m128d __A, __m128d __B) +{ + __v2df r; + r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0])); + return (__m128d) _mm_setr_pd (r[0], __A[1]); +} + +/* FIXME + The __mm_comi??_sd and __mm_ucomi??_sd implementations below are + exactly the same because GCC for PowerPC only generates unordered + compares (scalar and vector). + Technically __mm_comieq_sp et all should be using the ordered + compare and signal for QNaNs. The __mm_ucomieq_sd et all should + be OK. 
*/ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comieq_sd (__m128d __A, __m128d __B) +{ + return (__A[0] == __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comilt_sd (__m128d __A, __m128d __B) +{ + return (__A[0] < __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comile_sd (__m128d __A, __m128d __B) +{ + return (__A[0] <= __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comigt_sd (__m128d __A, __m128d __B) +{ + return (__A[0] > __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comige_sd (__m128d __A, __m128d __B) +{ + return (__A[0] >= __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comineq_sd (__m128d __A, __m128d __B) +{ + return (__A[0] != __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomieq_sd (__m128d __A, __m128d __B) +{ + return (__A[0] == __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomilt_sd (__m128d __A, __m128d __B) +{ + return (__A[0] < __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomile_sd (__m128d __A, __m128d __B) +{ + return (__A[0] <= __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomigt_sd (__m128d __A, __m128d __B) +{ + return (__A[0] > __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomige_sd (__m128d __A, __m128d __B) +{ + return (__A[0] >= __B[0]); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomineq_sd (__m128d __A, __m128d __B) +{ + return (__A[0] != __B[0]); +} + +/* Create a vector of Qi, where i is the element number. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi64x (long long __q1, long long __q0) +{ + return __extension__ (__m128i)(__v2di){ __q0, __q1 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi64 (__m64 __q1, __m64 __q0) +{ + return _mm_set_epi64x ((long long)__q1, (long long)__q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) +{ + return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, + short __q3, short __q2, short __q1, short __q0) +{ + return __extension__ (__m128i)(__v8hi){ + __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return __extension__ (__m128i)(__v16qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 + }; +} + +/* Set all of the elements of the vector to A. 
*/ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi64x (long long __A) +{ + return _mm_set_epi64x (__A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi64 (__m64 __A) +{ + return _mm_set_epi64 (__A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi32 (int __A) +{ + return _mm_set_epi32 (__A, __A, __A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi16 (short __A) +{ + return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi8 (char __A) +{ + return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +/* Create a vector of Qi, where i is the element number. + The parameter order is reversed from the _mm_set_epi* functions. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi64 (__m64 __q0, __m64 __q1) +{ + return _mm_set_epi64 (__q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3) +{ + return _mm_set_epi32 (__q3, __q2, __q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, + short __q4, short __q5, short __q6, short __q7) +{ + return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, + char __q04, char __q05, char __q06, char __q07, + char __q08, char __q09, char __q10, char __q11, + char __q12, char __q13, char __q14, char __q15) +{ + return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, + __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); +} + +/* Create a vector with element 0 as *P and the rest zero. 
*/ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_si128 (__m128i const *__P) +{ + return *__P; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si128 (__m128i_u const *__P) +{ + return (__m128i) (vec_vsx_ld(0, (signed int const *)__P)); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_epi64 (__m128i_u const *__P) +{ + return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_si128 (__m128i *__P, __m128i __B) +{ + vec_st ((__v16qu) __B, 0, (__v16qu*)__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si128 (__m128i_u *__P, __m128i __B) +{ + *__P = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_epi64 (__m128i_u *__P, __m128i __B) +{ + *(long long *)__P = ((__v2di)__B)[0]; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movepi64_pi64 (__m128i_u __B) +{ + return (__m64) ((__v2di)__B)[0]; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movpi64_epi64 (__m64 __A) +{ + return _mm_set_epi64 ((__m64)0LL, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_epi64 (__m128i __A) +{ + return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]); +} + +/* Create an undefined vector. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_si128 (void) +{ + __m128i __Y = __Y; + return __Y; +} + +/* Create a vector of zeros. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_si128 (void) +{ + return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_pd (__m128i __A) +{ + __v2di val; + /* For LE need to generate Vector Unpack Low Signed Word. + Which is generated from unpackh. */ + val = (__v2di)vec_unpackh ((__v4si)__A); + + return (__m128d)vec_ctf (val, 0); +} +#endif + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_ps (__m128i __A) +{ + return ((__m128)vec_ctf((__v4si)__A, 0)); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_epi32 (__m128d __A) +{ + __v2df rounded = vec_rint (__A); + __v4si result, temp; + const __v4si vzero = + { 0, 0, 0, 0 }; + + /* VSX Vector truncate Double-Precision to integer and Convert to + Signed Integer Word format with Saturate. 
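+     Note: any rounding is done by the vec_rint above, which rounds to
+     integral according to the current rounding mode (mirroring the x86
+     cvtpd2dq behavior); by the time xvcvdpsxws truncates, the values
+     are already integral, so the truncation is exact.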
*/
+  __asm__(
+      "xvcvdpsxws %x0,%x1"
+      : "=wa" (temp)
+      : "wa" (rounded)
+      : );
+
+#ifdef _ARCH_PWR8
+  temp = vec_mergeo (temp, temp);
+  result = (__v4si) vec_vpkudum ((__vector long long) temp,
+				 (__vector long long) vzero);
+#else
+  {
+    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+	  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
+    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
+  }
+#endif
+  return (__m128i) result;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_pi32 (__m128d __A)
+{
+  __m128i result = _mm_cvtpd_epi32(__A);
+
+  return (__m64) result[0];
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_ps (__m128d __A)
+{
+  __v4sf result;
+  __v4si temp;
+  const __v4si vzero = { 0, 0, 0, 0 };
+
+  __asm__(
+      "xvcvdpsp %x0,%x1"
+      : "=wa" (temp)
+      : "wa" (__A)
+      : );
+
+#ifdef _ARCH_PWR8
+  temp = vec_mergeo (temp, temp);
+  result = (__v4sf) vec_vpkudum ((__vector long long) temp,
+				 (__vector long long) vzero);
+#else
+  {
+    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+	  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
+    result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
+  }
+#endif
+  return ((__m128)result);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epi32 (__m128d __A)
+{
+  __v4si result;
+  __v4si temp;
+  const __v4si vzero = { 0, 0, 0, 0 };
+
+  /* VSX Vector truncate Double-Precision to integer and Convert to
+     Signed Integer Word format with Saturate. */
+  __asm__(
+      "xvcvdpsxws %x0,%x1"
+      : "=wa" (temp)
+      : "wa" (__A)
+      : );
+
+#ifdef _ARCH_PWR8
+  temp = vec_mergeo (temp, temp);
+  result = (__v4si) vec_vpkudum ((__vector long long) temp,
+				 (__vector long long) vzero);
+#else
+  {
+    const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+	  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
+    result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
+  }
+#endif
+
+  return ((__m128i) result);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_pi32 (__m128d __A)
+{
+  __m128i result = _mm_cvttpd_epi32 (__A);
+
+  return (__m64) result[0];
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si32 (__m128i __A)
+{
+  return ((__v4si)__A)[0];
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32_pd (__m64 __A)
+{
+  __v4si temp;
+  __v2di tmp2;
+  __v2df result;
+
+  temp = (__v4si)vec_splats (__A);
+  tmp2 = (__v2di)vec_unpackl (temp);
+  result = vec_ctf ((__vector signed long long) tmp2, 0);
+  return (__m128d)result;
+}
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epi32 (__m128 __A)
+{
+  __v4sf rounded;
+  __v4si result;
+
+  rounded = vec_rint((__v4sf) __A);
+  result = vec_cts (rounded, 0);
+  return (__m128i) result;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epi32 (__m128 __A)
+{
+  __v4si result;
+
+  result = vec_cts ((__v4sf) __A, 0);
+  return (__m128i) result;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pd (__m128 __A)
+{
+  /* Check if vec_doubleh is defined by <altivec.h>. If so use that.
*/ +#ifdef vec_doubleh + return (__m128d) vec_doubleh ((__v4sf)__A); +#else + /* Otherwise the compiler is not current and so need to generate the + equivalent code. */ + __v4sf a = (__v4sf)__A; + __v4sf temp; + __v2df result; +#ifdef __LITTLE_ENDIAN__ + /* The input float values are in elements {[0], [1]} but the convert + instruction needs them in elements {[1], [3]}, So we use two + shift left double vector word immediates to get the elements + lined up. */ + temp = __builtin_vsx_xxsldwi (a, a, 3); + temp = __builtin_vsx_xxsldwi (a, temp, 2); +#else + /* The input float values are in elements {[0], [1]} but the convert + instruction needs them in elements {[0], [2]}, So we use two + shift left double vector word immediates to get the elements + lined up. */ + temp = vec_vmrghw (a, a); +#endif + __asm__( + " xvcvspdp %x0,%x1" + : "=wa" (result) + : "wa" (temp) + : ); + return (__m128d) result; +#endif +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si32 (__m128d __A) +{ + __v2df rounded = vec_rint((__v2df) __A); + int result = ((__v2df)rounded)[0]; + + return result; +} +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si64 (__m128d __A) +{ + __v2df rounded = vec_rint ((__v2df) __A ); + long long result = ((__v2df) rounded)[0]; + + return result; +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si64x (__m128d __A) +{ + return _mm_cvtsd_si64 ((__v2df)__A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si32 (__m128d __A) +{ + int result = ((__v2df)__A)[0]; + + return result; +} + +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si64 (__m128d __A) +{ + long long result = ((__v2df)__A)[0]; + + return result; +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si64x (__m128d __A) +{ + return _mm_cvttsd_si64 (__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_ss (__m128 __A, __m128d __B) +{ + __v4sf result = (__v4sf)__A; + +#ifdef __LITTLE_ENDIAN__ + __v4sf temp_s; + /* Copy double element[0] to element [1] for conversion. */ + __v2df temp_b = vec_splat((__v2df)__B, 0); + + /* Pre-rotate __A left 3 (logically right 1) elements. */ + result = __builtin_vsx_xxsldwi (result, result, 3); + /* Convert double to single float scalar in a vector. */ + __asm__( + "xscvdpsp %x0,%x1" + : "=wa" (temp_s) + : "wa" (temp_b) + : ); + /* Shift the resulting scalar into vector element [0]. */ + result = __builtin_vsx_xxsldwi (result, temp_s, 1); +#else + result [0] = ((__v2df)__B)[0]; +#endif + return (__m128) result; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_sd (__m128d __A, int __B) +{ + __v2df result = (__v2df)__A; + double db = __B; + result [0] = db; + return (__m128d)result; +} + +/* Intel intrinsic. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_sd (__m128d __A, long long __B) +{ + __v2df result = (__v2df)__A; + double db = __B; + result [0] = db; + return (__m128d)result; +} + +/* Microsoft intrinsic. 
*/ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_sd (__m128d __A, long long __B) +{ + return _mm_cvtsi64_sd (__A, __B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_sd (__m128d __A, __m128 __B) +{ +#ifdef __LITTLE_ENDIAN__ + /* Use splat to move element [0] into position for the convert. */ + __v4sf temp = vec_splat ((__v4sf)__B, 0); + __v2df res; + /* Convert single float scalar to double in a vector. */ + __asm__( + "xscvspdp %x0,%x1" + : "=wa" (res) + : "wa" (temp) + : ); + return (__m128d) vec_mergel (res, (__v2df)__A); +#else + __v2df res = (__v2df)__A; + res [0] = ((__v4sf)__B) [0]; + return (__m128d) res; +#endif +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) +{ + __vector double result; + const int litmsk = __mask & 0x3; + + if (litmsk == 0) + result = vec_mergeh (__A, __B); +#if __GNUC__ < 6 + else if (litmsk == 1) + result = vec_xxpermdi (__B, __A, 2); + else if (litmsk == 2) + result = vec_xxpermdi (__B, __A, 1); +#else + else if (litmsk == 1) + result = vec_xxpermdi (__A, __B, 2); + else if (litmsk == 2) + result = vec_xxpermdi (__A, __B, 1); +#endif + else + result = vec_mergel (__A, __B); + + return result; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pd (__m128d __A, __m128d __B) +{ + return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pd (__m128d __A, __m128d __B) +{ + return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadh_pd (__m128d __A, double const *__B) +{ + __v2df result = (__v2df)__A; + result [1] = *__B; + return (__m128d)result; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_pd (__m128d __A, double const *__B) +{ + __v2df result = (__v2df)__A; + result [0] = *__B; + return (__m128d)result; +} + +#ifdef _ARCH_PWR8 +/* Intrinsic functions that require PowerISA 2.07 minimum. */ + +/* Creates a 2-bit mask from the most significant bits of the DPFP values. 
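+   For illustration (hypothetical values): with
+   __A = _mm_setr_pd (-1.0, 2.0), bit 0 of the result comes from the
+   sign of element 0 and bit 1 from element 1, so the return value
+   is 0x1.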
*/ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_pd (__m128d __A) +{ + __vector unsigned long long result; + static const __vector unsigned int perm_mask = + { +#ifdef __LITTLE_ENDIAN__ + 0x80800040, 0x80808080, 0x80808080, 0x80808080 +#else + 0x80808080, 0x80808080, 0x80808080, 0x80804000 +#endif + }; + + result = ((__vector unsigned long long) + vec_vbpermq ((__vector unsigned char) __A, + (__vector unsigned char) perm_mask)); + +#ifdef __LITTLE_ENDIAN__ + return result[1]; +#else + return result[0]; +#endif +} +#endif /* _ARCH_PWR8 */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packus_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergel ((__vector long long) __A, + (__vector long long) __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_mergeh ((__vector long long) __A, + (__vector long long) __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qu)__A + (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hu)__A + (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4su)__A + (__v4su)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_add_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A + (__v2du)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qu)__A - (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hu)__A - (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4su)__A - (__v4su)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A - (__v2du)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd_epi16 (__m128i __A, __m128i __B) +{ + __vector signed int zero = {0, 0, 0, 0}; + + return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_epi16 (__m128i __A, __m128i __B) +{ + __vector signed int w0, w1; + + __vector unsigned char xform1 = { +#ifdef __LITTLE_ENDIAN__ + 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, + 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F +#else + 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, + 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D +#endif + }; + + w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B); + w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B); + return (__m128i) vec_perm (w0, w1, xform1); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hi)__A * (__v8hi)__B); +} + +extern __inline __m64 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_su32 (__m64 __A, __m64 __B) +{ + unsigned int a = __A; + unsigned int b = __B; + + return ((__m64)a * (__m64)b); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epu32 (__m128i __A, __m128i __B) +{ +#if __GNUC__ < 8 + __v2du result; + +#ifdef __LITTLE_ENDIAN__ + /* VMX Vector Multiply Odd Unsigned Word. */ + __asm__( + "vmulouw %0,%1,%2" + : "=v" (result) + : "v" (__A), "v" (__B) + : ); +#else + /* VMX Vector Multiply Even Unsigned Word. */ + __asm__( + "vmuleuw %0,%1,%2" + : "=v" (result) + : "v" (__A), "v" (__B) + : ); +#endif + return (__m128i) result; +#else + return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); +#endif +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi16 (__m128i __A, int __B) +{ + __v8hu lshift; + __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (__B >= 0 && __B < 16) + { + if (__builtin_constant_p(__B)) + lshift = (__v8hu) vec_splat_s16(__B); + else + lshift = vec_splats ((unsigned short) __B); + + result = vec_sl ((__v8hi) __A, lshift); + } + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi32 (__m128i __A, int __B) +{ + __v4su lshift; + __v4si result = { 0, 0, 0, 0 }; + + if (__B >= 0 && __B < 32) + { + if (__builtin_constant_p(__B) && __B < 16) + lshift = (__v4su) vec_splat_s32(__B); + else + lshift = vec_splats ((unsigned int) __B); + + result = vec_sl ((__v4si) __A, lshift); + } + + return (__m128i) result; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi64 (__m128i __A, int __B) +{ + __v2du lshift; + __v2di result = { 0, 0 }; + + if (__B >= 0 && __B < 64) + { + if (__builtin_constant_p(__B) && __B < 16) + lshift = (__v2du) vec_splat_s32(__B); + else + lshift = (__v2du) vec_splats ((unsigned int) __B); + + result = vec_sl ((__v2di) __A, lshift); + } + + return (__m128i) result; +} +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi16 (__m128i __A, int __B) +{ + __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hi result; + + if (__B < 16) + { + if (__builtin_constant_p(__B)) + rshift = (__v8hu) vec_splat_s16(__B); + else + rshift = vec_splats ((unsigned short) __B); + } + result = vec_sra ((__v8hi) __A, rshift); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi32 (__m128i __A, int __B) +{ + __v4su rshift = { 31, 31, 31, 31 }; + __v4si result; + + if (__B < 32) + { + if (__builtin_constant_p(__B)) + { + if (__B < 16) + rshift = (__v4su) vec_splat_s32(__B); + else + rshift = (__v4su) vec_splats((unsigned int)__B); + } + else + rshift = vec_splats ((unsigned int) __B); + } + result = vec_sra ((__v4si) __A, rshift); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bslli_si128 (__m128i __A, const int __N) +{ + __v16qu result; + const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (__N < 16) + result = vec_sld ((__v16qu) __A, zeros, __N); + else + result = zeros; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bsrli_si128 (__m128i __A, const int __N) +{ + 
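+  /* Shift the whole quadword right by __N bytes, shifting in zeros;
+     for __N >= 16 the result is all zeros.  On little endian a
+     compile-time-constant __N can use the immediate-form vec_sld,
+     otherwise the shift count __N * 8 is splatted into a vector and
+     applied with vec_sro (vec_slo on big endian). */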
__v16qu result; + const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (__N < 16) +#ifdef __LITTLE_ENDIAN__ + if (__builtin_constant_p(__N)) + /* Would like to use Vector Shift Left Double by Octet + Immediate here to use the immediate form and avoid + load of __N * 8 value into a separate VR. */ + result = vec_sld (zeros, (__v16qu) __A, (16 - __N)); + else +#endif + { + __v16qu shift = vec_splats((unsigned char)(__N*8)); +#ifdef __LITTLE_ENDIAN__ + result = vec_sro ((__v16qu)__A, shift); +#else + result = vec_slo ((__v16qu)__A, shift); +#endif + } + else + result = zeros; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_si128 (__m128i __A, const int __N) +{ + return _mm_bsrli_si128 (__A, __N); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_si128 (__m128i __A, const int _imm5) +{ + __v16qu result; + const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (_imm5 < 16) +#ifdef __LITTLE_ENDIAN__ + result = vec_sld ((__v16qu) __A, zeros, _imm5); +#else + result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5)); +#endif + else + result = zeros; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + +_mm_srli_epi16 (__m128i __A, int __B) +{ + __v8hu rshift; + __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (__B < 16) + { + if (__builtin_constant_p(__B)) + rshift = (__v8hu) vec_splat_s16(__B); + else + rshift = vec_splats ((unsigned short) __B); + + result = vec_sr ((__v8hi) __A, rshift); + } + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi32 (__m128i __A, int __B) +{ + __v4su rshift; + __v4si result = { 0, 0, 0, 0 }; + + if (__B < 32) + { + if (__builtin_constant_p(__B)) + { + if (__B < 16) + rshift = (__v4su) vec_splat_s32(__B); + else + rshift = (__v4su) vec_splats((unsigned int)__B); + } + else + rshift = vec_splats ((unsigned int) __B); + + result = vec_sr ((__v4si) __A, rshift); + } + + return (__m128i) result; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi64 (__m128i __A, int __B) +{ + __v2du rshift; + __v2di result = { 0, 0 }; + + if (__B < 64) + { + if (__builtin_constant_p(__B)) + { + if (__B < 16) + rshift = (__v2du) vec_splat_s32(__B); + else + rshift = (__v2du) vec_splats((unsigned long long)__B); + } + else + rshift = (__v2du) vec_splats ((unsigned int) __B); + + result = vec_sr ((__v2di) __A, rshift); + } + + return (__m128i) result; +} +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi16 (__m128i __A, __m128i __B) +{ + __v8hu lshift; + __vector __bool short shmask; + const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu result; + +#ifdef __LITTLE_ENDIAN__ + lshift = vec_splat ((__v8hu) __B, 0); +#else + lshift = vec_splat ((__v8hu) __B, 3); +#endif + shmask = vec_cmple (lshift, shmax); + result = vec_sl ((__v8hu) __A, lshift); + result = vec_sel ((__v8hu) shmask, result, shmask); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi32 (__m128i __A, __m128i __B) +{ + __v4su lshift; + __vector __bool int shmask; + const __v4su shmax = { 32, 32, 32, 32 }; + __v4su result; +#ifdef 
__LITTLE_ENDIAN__ + lshift = vec_splat ((__v4su) __B, 0); +#else + lshift = vec_splat ((__v4su) __B, 1); +#endif + shmask = vec_cmplt (lshift, shmax); + result = vec_sl ((__v4su) __A, lshift); + result = vec_sel ((__v4su) shmask, result, shmask); + + return (__m128i) result; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi64 (__m128i __A, __m128i __B) +{ + __v2du lshift; + __vector __bool long long shmask; + const __v2du shmax = { 64, 64 }; + __v2du result; + + lshift = vec_splat ((__v2du) __B, 0); + shmask = vec_cmplt (lshift, shmax); + result = vec_sl ((__v2du) __A, lshift); + result = vec_sel ((__v2du) shmask, result, shmask); + + return (__m128i) result; +} +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi16 (__m128i __A, __m128i __B) +{ + const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu rshift; + __v8hi result; + +#ifdef __LITTLE_ENDIAN__ + rshift = vec_splat ((__v8hu)__B, 0); +#else + rshift = vec_splat ((__v8hu)__B, 3); +#endif + rshift = vec_min (rshift, rshmax); + result = vec_sra ((__v8hi) __A, rshift); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi32 (__m128i __A, __m128i __B) +{ + const __v4su rshmax = { 31, 31, 31, 31 }; + __v4su rshift; + __v4si result; + +#ifdef __LITTLE_ENDIAN__ + rshift = vec_splat ((__v4su)__B, 0); +#else + rshift = vec_splat ((__v4su)__B, 1); +#endif + rshift = vec_min (rshift, rshmax); + result = vec_sra ((__v4si) __A, rshift); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi16 (__m128i __A, __m128i __B) +{ + __v8hu rshift; + __vector __bool short shmask; + const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 }; + __v8hu result; + +#ifdef __LITTLE_ENDIAN__ + rshift = vec_splat ((__v8hu) __B, 0); +#else + rshift = vec_splat ((__v8hu) __B, 3); +#endif + shmask = vec_cmple (rshift, shmax); + result = vec_sr ((__v8hu) __A, rshift); + result = vec_sel ((__v8hu) shmask, result, shmask); + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi32 (__m128i __A, __m128i __B) +{ + __v4su rshift; + __vector __bool int shmask; + const __v4su shmax = { 32, 32, 32, 32 }; + __v4su result; + +#ifdef __LITTLE_ENDIAN__ + rshift = vec_splat ((__v4su) __B, 0); +#else + rshift = vec_splat ((__v4su) __B, 1); +#endif + shmask = vec_cmplt (rshift, shmax); + result = vec_sr ((__v4su) __A, rshift); + result = vec_sel ((__v4su) shmask, result, shmask); + + return (__m128i) result; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi64 (__m128i __A, __m128i __B) +{ + __v2du rshift; + __vector __bool long long shmask; + const __v2du shmax = { 64, 64 }; + __v2du result; + + rshift = vec_splat ((__v2du) __B, 0); + shmask = vec_cmplt (rshift, shmax); + result = vec_sr ((__v2du) __A, rshift); + result = vec_sel ((__v2du) shmask, result, shmask); + + return (__m128i) result; +} +#endif + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_pd (__m128d __A, __m128d __B) +{ + return (vec_and ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_pd (__m128d __A, 
__m128d __B) +{ + return (vec_andc ((__v2df) __B, (__v2df) __A)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_pd (__m128d __A, __m128d __B) +{ + return (vec_or ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_pd (__m128d __A, __m128d __B) +{ + return (vec_xor ((__v2df) __A, (__v2df) __B)); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)vec_and ((__v2di) __A, (__v2di) __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)vec_or ((__v2di) __A, (__v2di) __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi16 (__m128i const __A, int const __N) +{ + return (unsigned short) ((__v8hi)__A)[__N & 7]; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi16 (__m128i const __A, int const __D, int const __N) +{ + __v8hi result = (__v8hi)__A; + + result [(__N & 7)] = __D; + + return (__m128i) result; +} + +extern __inline __m128i 
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
+}
+
+
+#ifdef _ARCH_PWR8
+/* Intrinsic functions that require PowerISA 2.07 minimum. */
+
+/* Creates a 16-bit mask from the most significant bits of the
+   16 8-bit integer values in A. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_epi8 (__m128i __A)
+{
+  __vector unsigned long long result;
+  static const __vector unsigned char perm_mask =
+    {
+	0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
+	0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
+    };
+
+  result = ((__vector unsigned long long)
+	    vec_vbpermq ((__vector unsigned char) __A,
+			 (__vector unsigned char) perm_mask));
+
+#ifdef __LITTLE_ENDIAN__
+  return result[1];
+#else
+  return result[0];
+#endif
+}
+#endif /* _ARCH_PWR8 */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_epu16 (__m128i __A, __m128i __B)
+{
+  __v4su w0, w1;
+  __v16qu xform1 = {
+#ifdef __LITTLE_ENDIAN__
+      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
+      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
+#else
+      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
+      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
+#endif
+    };
+
+  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
+  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
+  return (__m128i) vec_perm (w0, w1, xform1);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflehi_epi16 (__m128i __A, const int __mask)
+{
+  unsigned long element_selector_98 = __mask & 0x03;
+  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
+  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
+  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
+  static const unsigned short permute_selectors[4] =
+    {
+#ifdef __LITTLE_ENDIAN__
+	0x0908, 0x0B0A, 0x0D0C, 0x0F0E
+#else
+	0x0809, 0x0A0B, 0x0C0D, 0x0E0F
+#endif
+    };
+  __v2du pmask =
+#ifdef __LITTLE_ENDIAN__
+      { 0x1716151413121110UL,  0UL};
+#else
+      { 0x1011121314151617UL,  0UL};
+#endif
+  __m64_union t;
+  __v2du a, r;
+
+  t.as_short[0] = permute_selectors[element_selector_98];
+  t.as_short[1] = permute_selectors[element_selector_BA];
+  t.as_short[2] = permute_selectors[element_selector_DC];
+  t.as_short[3] = permute_selectors[element_selector_FE];
+  pmask[1] = t.as_m64;
+  a = (__v2du)__A;
+  r = vec_perm (a, a, (__vector unsigned char)pmask);
+  return (__m128i) r;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflelo_epi16 (__m128i __A, const int __mask)
+{
+  unsigned long element_selector_10 = __mask & 0x03;
+  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
+  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
+  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
+  static const unsigned short
permute_selectors[4] = + { +#ifdef __LITTLE_ENDIAN__ + 0x0100, 0x0302, 0x0504, 0x0706 +#else + 0x0001, 0x0203, 0x0405, 0x0607 +#endif + }; + __v2du pmask = +#ifdef __LITTLE_ENDIAN__ + { 0UL, 0x1f1e1d1c1b1a1918UL}; +#else + { 0UL, 0x18191a1b1c1d1e1fUL}; +#endif + __m64_union t; + __v2du a, r; + t.as_short[0] = permute_selectors[element_selector_10]; + t.as_short[1] = permute_selectors[element_selector_32]; + t.as_short[2] = permute_selectors[element_selector_54]; + t.as_short[3] = permute_selectors[element_selector_76]; + pmask[0] = t.as_m64; + a = (__v2du)__A; + r = vec_perm (a, a, (__vector unsigned char)pmask); + return (__m128i) r; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi32 (__m128i __A, const int __mask) +{ + unsigned long element_selector_10 = __mask & 0x03; + unsigned long element_selector_32 = (__mask >> 2) & 0x03; + unsigned long element_selector_54 = (__mask >> 4) & 0x03; + unsigned long element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned int permute_selectors[4] = + { +#ifdef __LITTLE_ENDIAN__ + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C +#else + 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F +#endif + }; + __v4su t; + + t[0] = permute_selectors[element_selector_10]; + t[1] = permute_selectors[element_selector_32]; + t[2] = permute_selectors[element_selector_54] + 0x10101010; + t[3] = permute_selectors[element_selector_76] + 0x10101010; + return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) +{ + __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL}; + __v16qu mask, tmp; + __m128i_u *p = (__m128i_u*)__C; + + tmp = (__v16qu)_mm_loadu_si128(p); + mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit); + tmp = vec_sel (tmp, (__v16qu)__A, mask); + _mm_storeu_si128 (p, (__m128i)tmp); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B); +} + + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sad_epu8 (__m128i __A, __m128i __B) +{ + __v16qu a, b; + __v16qu vmin, vmax, vabsdiff; + __v4si vsum; + const __v4su zero = { 0, 0, 0, 0 }; + __v4si result; + + a = (__v16qu) __A; + b = (__v16qu) __B; + vmin = vec_min (a, b); + vmax = vec_max (a, b); + vabsdiff = vec_sub (vmax, vmin); + /* Sum four groups of bytes into integers. */ + vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); + /* Sum across four integers with two integer results. */ + result = vec_sum2s (vsum, (__vector signed int) zero); + /* Rotate the sums into the correct position. */ +#ifdef __LITTLE_ENDIAN__ + result = vec_sld (result, result, 4); +#else + result = vec_sld (result, result, 6); +#endif + /* Rotate the sums into the correct position. */ + return (__m128i) result; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si32 (int *__A, int __B) +{ + /* Use the data cache block touch for store transient. 
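+     The dcbtstt hint marks the target cache block as transient, which
+     is the closest PowerPC analogue of the x86 non-temporal store
+     hint; the store itself is still the plain assignment below.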
*/ + __asm__ ( + "dcbtstt 0,%0" + : + : "b" (__A) + : "memory" + ); + *__A = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si64 (long long int *__A, long long int __B) +{ + /* Use the data cache block touch for store transient. */ + __asm__ ( + " dcbtstt 0,%0" + : + : "b" (__A) + : "memory" + ); + *__A = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si128 (__m128i *__A, __m128i __B) +{ + /* Use the data cache block touch for store transient. */ + __asm__ ( + "dcbtstt 0,%0" + : + : "b" (__A) + : "memory" + ); + *__A = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_pd (double *__A, __m128d __B) +{ + /* Use the data cache block touch for store transient. */ + __asm__ ( + "dcbtstt 0,%0" + : + : "b" (__A) + : "memory" + ); + *(__m128d*)__A = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clflush (void const *__A) +{ + /* Use the data cache block flush. */ + __asm__ ( + "dcbf 0,%0" + : + : "b" (__A) + : "memory" + ); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lfence (void) +{ + /* Use light weight sync for load to load ordering. */ + __atomic_thread_fence (__ATOMIC_RELEASE); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mfence (void) +{ + /* Use heavy weight sync for any to any ordering. */ + __atomic_thread_fence (__ATOMIC_SEQ_CST); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_si128 (int __A) +{ + return _mm_set_epi32 (0, 0, 0, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_si128 (long long __A) +{ + return __extension__ (__m128i)(__v2di){ __A, 0LL }; +} + +/* Microsoft intrinsic. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_si128 (long long __A) +{ + return __extension__ (__m128i)(__v2di){ __A, 0LL }; +} + +/* Casts between various SP, DP, INT vector types. Note that these do no + conversion of values, they just change the type. 
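+   For illustration: _mm_castpd_si128 (_mm_set1_pd (1.0)) returns an
+   __m128i whose two 64-bit lanes both hold 0x3ff0000000000000, the
+   IEEE 754 bit pattern of 1.0, because only the static type changes.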
*/
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_ps(__m128d __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_si128(__m128d __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_pd(__m128 __A)
+{
+  return (__m128d) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_si128(__m128 __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_ps(__m128i __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_pd(__m128i __A)
+{
+  return (__m128d) __A;
+}
+
+#endif /* EMMINTRIN_H_ */
diff --git a/lib/include/ppc_wrappers/mm_malloc.h b/lib/include/ppc_wrappers/mm_malloc.h
new file mode 100644
index 000000000..d91d7865c
--- /dev/null
+++ b/lib/include/ppc_wrappers/mm_malloc.h
@@ -0,0 +1,44 @@
+/*===---- mm_malloc.h - Implementation of _mm_malloc and _mm_free ----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM_MALLOC_H_INCLUDED
+#define _MM_MALLOC_H_INCLUDED
+
+#include <stdlib.h>
+
+/* We can't depend on <stdlib.h> since the prototype of posix_memalign
+   may not be visible. */
+#ifndef __cplusplus
+extern int posix_memalign (void **, size_t, size_t);
+#else
+extern "C" int posix_memalign (void **, size_t, size_t) throw ();
+#endif
+
+static __inline void *
+_mm_malloc (size_t size, size_t alignment)
+{
+  /* PowerPC64 ELF V2 ABI requires quadword alignment. */
+  size_t vec_align = sizeof (__vector float);
+  void *ptr;
+
+  if (alignment < vec_align)
+    alignment = vec_align;
+  if (posix_memalign (&ptr, alignment, size) == 0)
+    return ptr;
+  else
+    return NULL;
+}
+
+static __inline void
+_mm_free (void * ptr)
+{
+  free (ptr);
+}
+
+#endif /* _MM_MALLOC_H_INCLUDED */
diff --git a/lib/include/ppc_wrappers/mmintrin.h b/lib/include/ppc_wrappers/mmintrin.h
new file mode 100644
index 000000000..b949653ad
--- /dev/null
+++ b/lib/include/ppc_wrappers/mmintrin.h
@@ -0,0 +1,1443 @@
+/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0. */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header file is intended to help port code using Intel intrinsics
+   explicitly from x86_64 to powerpc64/powerpc64le.
+
+   Since the PowerPC target doesn't support a native 64-bit vector type,
+   we typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
+   works well for _si64 and some _pi32 operations.
+
+   For _pi16 and _pi8 operations, it's better to transfer __m64 into
+   a 128-bit PowerPC vector first.
Power8 introduced direct register
+   move instructions, which help with a more efficient implementation.
+
+   It's the user's responsibility to determine if the results of such a
+   port are acceptable or whether further changes are needed.  Please
+   note that much code using Intel intrinsics CAN BE REWRITTEN in more
+   portable and efficient standard C or GNU C extensions with 64-bit
+   scalar operations, or 128-bit SSE/Altivec operations, which are the
+   recommended approach. */
+#error \
+    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef _MMINTRIN_H_INCLUDED
+#define _MMINTRIN_H_INCLUDED
+
+#include <altivec.h>
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components. */
+typedef __attribute__((__aligned__(8))) unsigned long long __m64;
+
+typedef __attribute__((__aligned__(8))) union {
+  __m64 as_m64;
+  char as_char[8];
+  signed char as_signed_char[8];
+  short as_short[4];
+  int as_int[2];
+  long long as_long_long;
+  float as_float[2];
+  double as_double;
+} __m64_union;
+
+/* Empty the multimedia state. */
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_empty(void) {
+  /* nothing to do on PowerPC. */
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_empty(void) {
+  /* nothing to do on PowerPC. */
+}
+
+/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi32_si64(int __i) {
+  return (__m64)(unsigned int)__i;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_from_int(int __i) {
+  return _mm_cvtsi32_si64(__i);
+}
+
+/* Convert the lower 32 bits of the __m64 object into an integer. */
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi64_si32(__m64 __i) {
+  return ((int)__i);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_to_int(__m64 __i) {
+  return _mm_cvtsi64_si32(__i);
+}
+
+/* Convert I to a __m64 object. */
+
+/* Intel intrinsic. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_from_int64(long long __i) {
+  return (__m64)__i;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi64_m64(long long __i) {
+  return (__m64)__i;
+}
+
+/* Microsoft intrinsic. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtsi64x_si64(long long __i) {
+  return (__m64)__i;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set_pi64x(long long __i) {
+  return (__m64)__i;
+}
+
+/* Convert the __m64 object to a 64-bit integer. */
+
+/* Intel intrinsic. */
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_to_int64(__m64 __i) {
+  return (long long)__i;
+}
+
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtm64_si64(__m64 __i) {
+  return (long long)__i;
+}
+
+/* Microsoft intrinsic.
*/ +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_si64x(__m64 __i) { + return (long long)__i; +} + +#ifdef _ARCH_PWR8 +/* Pack the four 16-bit values from M1 into the lower four 8-bit values of + the result, and the four 16-bit values from M2 into the upper four 8-bit + values of the result, all with signed saturation. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_pi16(__m64 __m1, __m64 __m2) { + __vector signed short vm1; + __vector signed char vresult; + + vm1 = (__vector signed short)(__vector unsigned long long) +#ifdef __LITTLE_ENDIAN__ + {__m1, __m2}; +#else + {__m2, __m1}; +#endif + vresult = vec_packs(vm1, vm1); + return (__m64)((__vector long long)vresult)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_packsswb(__m64 __m1, __m64 __m2) { + return _mm_packs_pi16(__m1, __m2); +} + +/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of + the result, and the two 32-bit values from M2 into the upper two 16-bit + values of the result, all with signed saturation. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_pi32(__m64 __m1, __m64 __m2) { + __vector signed int vm1; + __vector signed short vresult; + + vm1 = (__vector signed int)(__vector unsigned long long) +#ifdef __LITTLE_ENDIAN__ + {__m1, __m2}; +#else + {__m2, __m1}; +#endif + vresult = vec_packs(vm1, vm1); + return (__m64)((__vector long long)vresult)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_packssdw(__m64 __m1, __m64 __m2) { + return _mm_packs_pi32(__m1, __m2); +} + +/* Pack the four 16-bit values from M1 into the lower four 8-bit values of + the result, and the four 16-bit values from M2 into the upper four 8-bit + values of the result, all with unsigned saturation. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_pu16(__m64 __m1, __m64 __m2) { + __vector unsigned char r; + __vector signed short vm1 = (__vector signed short)(__vector long long) +#ifdef __LITTLE_ENDIAN__ + {__m1, __m2}; +#else + {__m2, __m1}; +#endif + const __vector signed short __zero = {0}; + __vector __bool short __select = vec_cmplt(vm1, __zero); + r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1); + __vector __bool char packsel = vec_pack(__select, __select); + r = vec_sel(r, (const __vector unsigned char)__zero, packsel); + return (__m64)((__vector long long)r)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_packuswb(__m64 __m1, __m64 __m2) { + return _mm_packs_pu16(__m1, __m2); +} +#endif /* end ARCH_PWR8 */ + +/* Interleave the four 8-bit values from the high half of M1 with the four + 8-bit values from the high half of M2. 
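+   For illustration (hypothetical values): with bytes
+   m1 = { a0, a1, ..., a7 } and m2 = { b0, b1, ..., b7 }, the result
+   is { a4, b4, a5, b5, a6, b6, a7, b7 }.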
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector unsigned char a, b, c; + + a = (__vector unsigned char)vec_splats(__m1); + b = (__vector unsigned char)vec_splats(__m2); + c = vec_mergel(a, b); + return (__m64)((__vector long long)c)[1]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = m1.as_char[4]; + res.as_char[1] = m2.as_char[4]; + res.as_char[2] = m1.as_char[5]; + res.as_char[3] = m2.as_char[5]; + res.as_char[4] = m1.as_char[6]; + res.as_char[5] = m2.as_char[6]; + res.as_char[6] = m1.as_char[7]; + res.as_char[7] = m2.as_char[7]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckhbw(__m64 __m1, __m64 __m2) { + return _mm_unpackhi_pi8(__m1, __m2); +} + +/* Interleave the two 16-bit values from the high half of M1 with the two + 16-bit values from the high half of M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = m1.as_short[2]; + res.as_short[1] = m2.as_short[2]; + res.as_short[2] = m1.as_short[3]; + res.as_short[3] = m2.as_short[3]; + + return (__m64)res.as_m64; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckhwd(__m64 __m1, __m64 __m2) { + return _mm_unpackhi_pi16(__m1, __m2); +} +/* Interleave the 32-bit value from the high half of M1 with the 32-bit + value from the high half of M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_int[0] = m1.as_int[1]; + res.as_int[1] = m2.as_int[1]; + + return (__m64)res.as_m64; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckhdq(__m64 __m1, __m64 __m2) { + return _mm_unpackhi_pi32(__m1, __m2); +} +/* Interleave the four 8-bit values from the low half of M1 with the four + 8-bit values from the low half of M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector unsigned char a, b, c; + + a = (__vector unsigned char)vec_splats(__m1); + b = (__vector unsigned char)vec_splats(__m2); + c = vec_mergel(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = m1.as_char[0]; + res.as_char[1] = m2.as_char[0]; + res.as_char[2] = m1.as_char[1]; + res.as_char[3] = m2.as_char[1]; + res.as_char[4] = m1.as_char[2]; + res.as_char[5] = m2.as_char[2]; + res.as_char[6] = m1.as_char[3]; + res.as_char[7] = m2.as_char[3]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpcklbw(__m64 __m1, __m64 __m2) { + return _mm_unpacklo_pi8(__m1, __m2); +} +/* Interleave the two 16-bit values from the low half of M1 with the two + 16-bit values from the low half of M2. 
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = m1.as_short[0]; + res.as_short[1] = m2.as_short[0]; + res.as_short[2] = m1.as_short[1]; + res.as_short[3] = m2.as_short[1]; + + return (__m64)res.as_m64; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpcklwd(__m64 __m1, __m64 __m2) { + return _mm_unpacklo_pi16(__m1, __m2); +} + +/* Interleave the 32-bit value from the low half of M1 with the 32-bit + value from the low half of M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_int[0] = m1.as_int[0]; + res.as_int[1] = m2.as_int[0]; + + return (__m64)res.as_m64; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckldq(__m64 __m1, __m64 __m2) { + return _mm_unpacklo_pi32(__m1, __m2); +} + +/* Add the 8-bit values in M1 to the 8-bit values in M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_pi8(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed char a, b, c; + + a = (__vector signed char)vec_splats(__m1); + b = (__vector signed char)vec_splats(__m2); + c = vec_add(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = m1.as_char[0] + m2.as_char[0]; + res.as_char[1] = m1.as_char[1] + m2.as_char[1]; + res.as_char[2] = m1.as_char[2] + m2.as_char[2]; + res.as_char[3] = m1.as_char[3] + m2.as_char[3]; + res.as_char[4] = m1.as_char[4] + m2.as_char[4]; + res.as_char[5] = m1.as_char[5] + m2.as_char[5]; + res.as_char[6] = m1.as_char[6] + m2.as_char[6]; + res.as_char[7] = m1.as_char[7] + m2.as_char[7]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddb(__m64 __m1, __m64 __m2) { + return _mm_add_pi8(__m1, __m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_pi16(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = vec_add(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = m1.as_short[0] + m2.as_short[0]; + res.as_short[1] = m1.as_short[1] + m2.as_short[1]; + res.as_short[2] = m1.as_short[2] + m2.as_short[2]; + res.as_short[3] = m1.as_short[3] + m2.as_short[3]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddw(__m64 __m1, __m64 __m2) { + return _mm_add_pi16(__m1, __m2); +} + +/* Add the 32-bit values in M1 to the 32-bit values in M2. 
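+
+   As with the x86 paddd instruction, the addition wraps modulo 2^32
+   rather than saturating; a small sketch (hypothetical values):
+
+     __m64 r = _mm_add_pi32(_mm_set_pi32(1, 0x7fffffff),
+                            _mm_set_pi32(2, 1));
+     // r, lowest element first: 0x80000000 (wrapped), 3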
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_pi32(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR9 + __vector signed int a, b, c; + + a = (__vector signed int)vec_splats(__m1); + b = (__vector signed int)vec_splats(__m2); + c = vec_add(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_int[0] = m1.as_int[0] + m2.as_int[0]; + res.as_int[1] = m1.as_int[1] + m2.as_int[1]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddd(__m64 __m1, __m64 __m2) { + return _mm_add_pi32(__m1, __m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_pi8(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed char a, b, c; + + a = (__vector signed char)vec_splats(__m1); + b = (__vector signed char)vec_splats(__m2); + c = vec_sub(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = m1.as_char[0] - m2.as_char[0]; + res.as_char[1] = m1.as_char[1] - m2.as_char[1]; + res.as_char[2] = m1.as_char[2] - m2.as_char[2]; + res.as_char[3] = m1.as_char[3] - m2.as_char[3]; + res.as_char[4] = m1.as_char[4] - m2.as_char[4]; + res.as_char[5] = m1.as_char[5] - m2.as_char[5]; + res.as_char[6] = m1.as_char[6] - m2.as_char[6]; + res.as_char[7] = m1.as_char[7] - m2.as_char[7]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubb(__m64 __m1, __m64 __m2) { + return _mm_sub_pi8(__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_pi16(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = vec_sub(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = m1.as_short[0] - m2.as_short[0]; + res.as_short[1] = m1.as_short[1] - m2.as_short[1]; + res.as_short[2] = m1.as_short[2] - m2.as_short[2]; + res.as_short[3] = m1.as_short[3] - m2.as_short[3]; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubw(__m64 __m1, __m64 __m2) { + return _mm_sub_pi16(__m1, __m2); +} + +/* Subtract the 32-bit values in M2 from the 32-bit values in M1. 
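+
+   Note the operand order: each element of M2 is subtracted from the
+   corresponding element of M1, e.g. (hypothetical values):
+
+     __m64 r = _mm_sub_pi32(_mm_set_pi32(10, 20), _mm_set_pi32(1, 2));
+     // r, lowest element first: 18, 9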
*/
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
+#if _ARCH_PWR9
+  __vector signed int a, b, c;
+
+  a = (__vector signed int)vec_splats(__m1);
+  b = (__vector signed int)vec_splats(__m2);
+  c = vec_sub(a, b);
+  return (__m64)((__vector long long)c)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __m1;
+  m2.as_m64 = __m2;
+
+  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
+  res.as_int[1] = m1.as_int[1] - m2.as_int[1];
+
+  return (__m64)res.as_m64;
+#endif
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psubd(__m64 __m1, __m64 __m2) {
+  return _mm_sub_pi32(__m1, __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_add_si64(__m64 __m1, __m64 __m2) {
+  return (__m1 + __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sub_si64(__m64 __m1, __m64 __m2) {
+  return (__m1 - __m2);
+}
+
+/* Shift the 64-bit value in M left by COUNT. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sll_si64(__m64 __m, __m64 __count) {
+  return (__m << __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psllq(__m64 __m, __m64 __count) {
+  return _mm_sll_si64(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_slli_si64(__m64 __m, const int __count) {
+  return (__m << __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psllqi(__m64 __m, const int __count) {
+  return _mm_slli_si64(__m, __count);
+}
+
+/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srl_si64(__m64 __m, __m64 __count) {
+  return (__m >> __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrlq(__m64 __m, __m64 __count) {
+  return _mm_srl_si64(__m, __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srli_si64(__m64 __m, const int __count) {
+  return (__m >> __count);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_psrlqi(__m64 __m, const int __count) {
+  return _mm_srli_si64(__m, __count);
+}
+
+/* Bit-wise AND the 64-bit values in M1 and M2. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_and_si64(__m64 __m1, __m64 __m2) {
+  return (__m1 & __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pand(__m64 __m1, __m64 __m2) {
+  return _mm_and_si64(__m1, __m2);
+}
+
+/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
+   64-bit value in M2. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
+  return (~__m1 & __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pandn(__m64 __m1, __m64 __m2) {
+  return _mm_andnot_si64(__m1, __m2);
+}
+
+/* Bit-wise inclusive OR the 64-bit values in M1 and M2.
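+
+   Note that _mm_andnot_si64 above complements its first operand, not
+   its second; a quick sketch (hypothetical values):
+
+     __m64 m = 0x00ffULL;
+     __m64 v = 0x0f0fULL;
+     __m64 a = _mm_andnot_si64(m, v); // (~m & v) == 0x0f00
+     __m64 o = _mm_or_si64(m, v);     // 0x0fff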
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_or_si64(__m64 __m1, __m64 __m2) { + return (__m1 | __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_por(__m64 __m1, __m64 __m2) { + return _mm_or_si64(__m1, __m2); +} + +/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_xor_si64(__m64 __m1, __m64 __m2) { + return (__m1 ^ __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pxor(__m64 __m1, __m64 __m2) { + return _mm_xor_si64(__m1, __m2); +} + +/* Creates a 64-bit zero. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setzero_si64(void) { + return (__m64)0; +} + +/* Compare eight 8-bit values. The result of the comparison is 0xFF if the + test is true and zero if false. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { +#if defined(_ARCH_PWR6) && defined(__powerpc64__) + __m64 res; + __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :); + return (res); +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0; + res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0; + res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0; + res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0; + res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0; + res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0; + res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0; + res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpeqb(__m64 __m1, __m64 __m2) { + return _mm_cmpeq_pi8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed char a, b, c; + + a = (__vector signed char)vec_splats(__m1); + b = (__vector signed char)vec_splats(__m2); + c = (__vector signed char)vec_cmpgt(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0; + res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0; + res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0; + res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0; + res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0; + res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0; + res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0; + res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpgtb(__m64 __m1, __m64 __m2) { + return _mm_cmpgt_pi8(__m1, __m2); +} + +/* Compare four 16-bit values. The result of the comparison is 0xFFFF if + the test is true and zero if false. 
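+
+   The all-ones element masks compose with the bit-wise operations
+   above into branch-free selects; e.g., given __m64 values a and b, a
+   per-element signed maximum could be sketched (hypothetically) as:
+
+     __m64 gt = _mm_cmpgt_pi16(a, b);              // -1 where a > b
+     __m64 mx = _mm_or_si64(_mm_and_si64(gt, a),
+                            _mm_andnot_si64(gt, b));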
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = (__vector signed short)vec_cmpeq(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0; + res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0; + res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0; + res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpeqw(__m64 __m1, __m64 __m2) { + return _mm_cmpeq_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR8 + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = (__vector signed short)vec_cmpgt(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0; + res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0; + res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0; + res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpgtw(__m64 __m1, __m64 __m2) { + return _mm_cmpgt_pi16(__m1, __m2); +} + +/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if + the test is true and zero if false. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR9 + __vector signed int a, b, c; + + a = (__vector signed int)vec_splats(__m1); + b = (__vector signed int)vec_splats(__m2); + c = (__vector signed int)vec_cmpeq(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0; + res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpeqd(__m64 __m1, __m64 __m2) { + return _mm_cmpeq_pi32(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { +#if _ARCH_PWR9 + __vector signed int a, b, c; + + a = (__vector signed int)vec_splats(__m1); + b = (__vector signed int)vec_splats(__m2); + c = (__vector signed int)vec_cmpgt(a, b); + return (__m64)((__vector long long)c)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __m1; + m2.as_m64 = __m2; + + res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0; + res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? 
-1 : 0; + + return (__m64)res.as_m64; +#endif +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpgtd(__m64 __m1, __m64 __m2) { + return _mm_cmpgt_pi32(__m1, __m2); +} + +#if _ARCH_PWR8 +/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed + saturated arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pi8(__m64 __m1, __m64 __m2) { + __vector signed char a, b, c; + + a = (__vector signed char)vec_splats(__m1); + b = (__vector signed char)vec_splats(__m2); + c = vec_adds(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddsb(__m64 __m1, __m64 __m2) { + return _mm_adds_pi8(__m1, __m2); +} +/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed + saturated arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pi16(__m64 __m1, __m64 __m2) { + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = vec_adds(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddsw(__m64 __m1, __m64 __m2) { + return _mm_adds_pi16(__m1, __m2); +} +/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned + saturated arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pu8(__m64 __m1, __m64 __m2) { + __vector unsigned char a, b, c; + + a = (__vector unsigned char)vec_splats(__m1); + b = (__vector unsigned char)vec_splats(__m2); + c = vec_adds(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddusb(__m64 __m1, __m64 __m2) { + return _mm_adds_pu8(__m1, __m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned + saturated arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pu16(__m64 __m1, __m64 __m2) { + __vector unsigned short a, b, c; + + a = (__vector unsigned short)vec_splats(__m1); + b = (__vector unsigned short)vec_splats(__m2); + c = vec_adds(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddusw(__m64 __m1, __m64 __m2) { + return _mm_adds_pu16(__m1, __m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed + saturating arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pi8(__m64 __m1, __m64 __m2) { + __vector signed char a, b, c; + + a = (__vector signed char)vec_splats(__m1); + b = (__vector signed char)vec_splats(__m2); + c = vec_subs(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubsb(__m64 __m1, __m64 __m2) { + return _mm_subs_pi8(__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using + signed saturating arithmetic. 
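+
+   Saturation clamps to the representable range instead of wrapping;
+   for instance (hypothetical values):
+
+     __m64 r = _mm_subs_pi16(_mm_set1_pi16(-32768), _mm_set1_pi16(1));
+     // every element of r stays -32768 (clamped), not +32767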
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pi16(__m64 __m1, __m64 __m2) { + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = vec_subs(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubsw(__m64 __m1, __m64 __m2) { + return _mm_subs_pi16(__m1, __m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using + unsigned saturating arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pu8(__m64 __m1, __m64 __m2) { + __vector unsigned char a, b, c; + + a = (__vector unsigned char)vec_splats(__m1); + b = (__vector unsigned char)vec_splats(__m2); + c = vec_subs(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubusb(__m64 __m1, __m64 __m2) { + return _mm_subs_pu8(__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using + unsigned saturating arithmetic. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pu16(__m64 __m1, __m64 __m2) { + __vector unsigned short a, b, c; + + a = (__vector unsigned short)vec_splats(__m1); + b = (__vector unsigned short)vec_splats(__m2); + c = vec_subs(a, b); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubusw(__m64 __m1, __m64 __m2) { + return _mm_subs_pu16(__m1, __m2); +} + +/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing + four 32-bit intermediate results, which are then summed by pairs to + produce two 32-bit results. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_madd_pi16(__m64 __m1, __m64 __m2) { + __vector signed short a, b; + __vector signed int c; + __vector signed int zero = {0, 0, 0, 0}; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = vec_vmsumshm(a, b, zero); + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmaddwd(__m64 __m1, __m64 __m2) { + return _mm_madd_pi16(__m1, __m2); +} +/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in + M2 and produce the high 16 bits of the 32-bit results. 
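+
+   For example, 0x4000 * 0x4000 == 0x10000000, whose high 16 bits are
+   0x1000 (hypothetical values):
+
+     __m64 r = _mm_mulhi_pi16(_mm_set1_pi16(0x4000),
+                              _mm_set1_pi16(0x4000));
+     // every element of r is 0x1000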
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { + __vector signed short a, b; + __vector signed short c; + __vector signed int w0, w1; + __vector unsigned char xform1 = { +#ifdef __LITTLE_ENDIAN__ + 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, + 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F +#else + 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00, + 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15 +#endif + }; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + + w0 = vec_vmulesh(a, b); + w1 = vec_vmulosh(a, b); + c = (__vector signed short)vec_perm(w0, w1, xform1); + + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmulhw(__m64 __m1, __m64 __m2) { + return _mm_mulhi_pi16(__m1, __m2); +} + +/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce + the low 16 bits of the results. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mullo_pi16(__m64 __m1, __m64 __m2) { + __vector signed short a, b, c; + + a = (__vector signed short)vec_splats(__m1); + b = (__vector signed short)vec_splats(__m2); + c = a * b; + return (__m64)((__vector long long)c)[0]; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmullw(__m64 __m1, __m64 __m2) { + return _mm_mullo_pi16(__m1, __m2); +} + +/* Shift four 16-bit values in M left by COUNT. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_pi16(__m64 __m, __m64 __count) { + __vector signed short m, r; + __vector unsigned short c; + + if (__count <= 15) { + m = (__vector signed short)vec_splats(__m); + c = (__vector unsigned short)vec_splats((unsigned short)__count); + r = vec_sl(m, (__vector unsigned short)c); + return (__m64)((__vector long long)r)[0]; + } else + return (0); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psllw(__m64 __m, __m64 __count) { + return _mm_sll_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_pi16(__m64 __m, int __count) { + /* Promote int to long then invoke mm_sll_pi16. */ + return _mm_sll_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psllwi(__m64 __m, int __count) { + return _mm_slli_pi16(__m, __count); +} + +/* Shift two 32-bit values in M left by COUNT. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_pi32(__m64 __m, __m64 __count) { + __m64_union m, res; + + m.as_m64 = __m; + + res.as_int[0] = m.as_int[0] << __count; + res.as_int[1] = m.as_int[1] << __count; + return (res.as_m64); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pslld(__m64 __m, __m64 __count) { + return _mm_sll_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_pi32(__m64 __m, int __count) { + /* Promote int to long then invoke mm_sll_pi32. 
*/ + return _mm_sll_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pslldi(__m64 __m, int __count) { + return _mm_slli_pi32(__m, __count); +} + +/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sra_pi16(__m64 __m, __m64 __count) { + __vector signed short m, r; + __vector unsigned short c; + + if (__count <= 15) { + m = (__vector signed short)vec_splats(__m); + c = (__vector unsigned short)vec_splats((unsigned short)__count); + r = vec_sra(m, (__vector unsigned short)c); + return (__m64)((__vector long long)r)[0]; + } else + return (0); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psraw(__m64 __m, __m64 __count) { + return _mm_sra_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srai_pi16(__m64 __m, int __count) { + /* Promote int to long then invoke mm_sra_pi32. */ + return _mm_sra_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrawi(__m64 __m, int __count) { + return _mm_srai_pi16(__m, __count); +} + +/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sra_pi32(__m64 __m, __m64 __count) { + __m64_union m, res; + + m.as_m64 = __m; + + res.as_int[0] = m.as_int[0] >> __count; + res.as_int[1] = m.as_int[1] >> __count; + return (res.as_m64); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrad(__m64 __m, __m64 __count) { + return _mm_sra_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srai_pi32(__m64 __m, int __count) { + /* Promote int to long then invoke mm_sra_pi32. */ + return _mm_sra_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psradi(__m64 __m, int __count) { + return _mm_srai_pi32(__m, __count); +} + +/* Shift four 16-bit values in M right by COUNT; shift in zeros. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_pi16(__m64 __m, __m64 __count) { + __vector unsigned short m, r; + __vector unsigned short c; + + if (__count <= 15) { + m = (__vector unsigned short)vec_splats(__m); + c = (__vector unsigned short)vec_splats((unsigned short)__count); + r = vec_sr(m, (__vector unsigned short)c); + return (__m64)((__vector long long)r)[0]; + } else + return (0); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrlw(__m64 __m, __m64 __count) { + return _mm_srl_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_pi16(__m64 __m, int __count) { + /* Promote int to long then invoke mm_sra_pi32. */ + return _mm_srl_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrlwi(__m64 __m, int __count) { + return _mm_srli_pi16(__m, __count); +} + +/* Shift two 32-bit values in M right by COUNT; shift in zeros. 
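+
+   Unlike _mm_sra_pi32 above, zeros rather than copies of the sign bit
+   enter from the left; e.g. (hypothetical value):
+
+     __m64 r = _mm_srli_pi32(_mm_set1_pi32(-4), 1);
+     // each element becomes 0x7ffffffe, not -2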
*/ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_pi32(__m64 __m, __m64 __count) { + __m64_union m, res; + + m.as_m64 = __m; + + res.as_int[0] = (unsigned int)m.as_int[0] >> __count; + res.as_int[1] = (unsigned int)m.as_int[1] >> __count; + return (res.as_m64); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrld(__m64 __m, __m64 __count) { + return _mm_srl_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_pi32(__m64 __m, int __count) { + /* Promote int to long then invoke mm_srl_pi32. */ + return _mm_srl_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrldi(__m64 __m, int __count) { + return _mm_srli_pi32(__m, __count); +} +#endif /* _ARCH_PWR8 */ + +/* Creates a vector of two 32-bit values; I0 is least significant. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_pi32(int __i1, int __i0) { + __m64_union res; + + res.as_int[0] = __i0; + res.as_int[1] = __i1; + return (res.as_m64); +} + +/* Creates a vector of four 16-bit values; W0 is least significant. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) { + __m64_union res; + + res.as_short[0] = __w0; + res.as_short[1] = __w1; + res.as_short[2] = __w2; + res.as_short[3] = __w3; + return (res.as_m64); +} + +/* Creates a vector of eight 8-bit values; B0 is least significant. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, + char __b2, char __b1, char __b0) { + __m64_union res; + + res.as_char[0] = __b0; + res.as_char[1] = __b1; + res.as_char[2] = __b2; + res.as_char[3] = __b3; + res.as_char[4] = __b4; + res.as_char[5] = __b5; + res.as_char[6] = __b6; + res.as_char[7] = __b7; + return (res.as_m64); +} + +/* Similar, but with the arguments in reverse order. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_pi32(int __i0, int __i1) { + __m64_union res; + + res.as_int[0] = __i0; + res.as_int[1] = __i1; + return (res.as_m64); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { + return _mm_set_pi16(__w3, __w2, __w1, __w0); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, + char __b5, char __b6, char __b7) { + return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); +} + +/* Creates a vector of two 32-bit values, both elements containing I. */ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_pi32(int __i) { + __m64_union res; + + res.as_int[0] = __i; + res.as_int[1] = __i; + return (res.as_m64); +} + +/* Creates a vector of four 16-bit values, all elements containing W. 
*/
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set1_pi16(short __w) {
+#if _ARCH_PWR9
+  __vector signed short w;
+
+  w = (__vector signed short)vec_splats(__w);
+  return (__m64)((__vector long long)w)[0];
+#else
+  __m64_union res;
+
+  res.as_short[0] = __w;
+  res.as_short[1] = __w;
+  res.as_short[2] = __w;
+  res.as_short[3] = __w;
+  return (res.as_m64);
+#endif
+}
+
+/* Creates a vector of eight 8-bit values, all elements containing B. */
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set1_pi8(signed char __b) {
+#if _ARCH_PWR8
+  __vector signed char b;
+
+  b = (__vector signed char)vec_splats(__b);
+  return (__m64)((__vector long long)b)[0];
+#else
+  __m64_union res;
+
+  res.as_char[0] = __b;
+  res.as_char[1] = __b;
+  res.as_char[2] = __b;
+  res.as_char[3] = __b;
+  res.as_char[4] = __b;
+  res.as_char[5] = __b;
+  res.as_char[6] = __b;
+  res.as_char[7] = __b;
+  return (res.as_m64);
+#endif
+}
+#endif /* _MMINTRIN_H_INCLUDED */
diff --git a/lib/include/ppc_wrappers/xmmintrin.h b/lib/include/ppc_wrappers/xmmintrin.h
new file mode 100644
index 000000000..1b322b665
--- /dev/null
+++ b/lib/include/ppc_wrappers/xmmintrin.h
@@ -0,0 +1,1838 @@
+/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0. */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header file is to help porting code using Intel intrinsics
+   explicitly from x86_64 to powerpc64/powerpc64le.
+
+   Since X86 SSE intrinsics mainly handles __m128 type, PowerPC
+   VMX/VSX ISA is a good match for vector float SIMD operations.
+   However scalar float operations in vector (XMM) registers require
+   the POWER8 VSX ISA (2.07) level. There are differences for data
+   format and placement of float scalars in the vector register, which
+   require extra steps to match SSE scalar float semantics on POWER.
+
+   It should be noted that there's much difference between X86_64's
+   MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to use
+   portable <fenv.h> instead of accessing MXCSR directly.
+
+   Most SSE scalar float intrinsic operations can be performed more
+   efficiently as C language float scalar operations or optimized to
+   use vector SIMD operations. We recommend this for new applications. */
+#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef _XMMINTRIN_H_INCLUDED
+#define _XMMINTRIN_H_INCLUDED
+
+/* Define four value permute mask */
+#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
+
+#include <altivec.h>
+
+/* Avoid collisions between altivec.h and strict adherence to C++ and
+   C11 standards. This should eventually be done inside altivec.h itself,
+   but only after testing a full distro build. */
+#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
+                                 (defined(__STDC_VERSION__) && \
+                                   __STDC_VERSION__ >= 201112L))
+#undef vector
+#undef pixel
+#undef bool
+#endif
+
+/* We need type definitions from the MMX header file. */
+#include <mmintrin.h>
+
+/* Get _mm_malloc () and _mm_free ().
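+
+   A short usage sketch (hosted environments only; hypothetical sizes,
+   error checking omitted):
+
+     float *buf = _mm_malloc(64 * sizeof(float), 16); // 16-byte aligned
+     ... use buf ...
+     _mm_free(buf);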
*/
+#if __STDC_HOSTED__
+#include <mm_malloc.h>
+#endif
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components. */
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* Unaligned version of the same type. */
+typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
+                                       __aligned__ (1)));
+
+/* Internal data types for implementing the intrinsics. */
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+/* Create an undefined vector. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_ps (void)
+{
+  __m128 __Y = __Y;
+  return __Y;
+}
+
+/* Create a vector of zeros. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_ps (void)
+{
+  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
+}
+
+/* Load four SPFP values from P. The address must be 16-byte aligned. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps (float const *__P)
+{
+  return ((__m128)vec_ld(0, (__v4sf*)__P));
+}
+
+/* Load four SPFP values from P. The address need not be 16-byte aligned. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_ps (float const *__P)
+{
+  return (vec_vsx_ld(0, __P));
+}
+
+/* Load four SPFP values in reverse order. The address must be aligned. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadr_ps (float const *__P)
+{
+  __v4sf __tmp;
+  __m128 result;
+  static const __vector unsigned char permute_vector =
+    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
+      0x17, 0x10, 0x11, 0x12, 0x13 };
+
+  __tmp = vec_ld (0, (__v4sf *) __P);
+  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
+  return result;
+}
+
+/* Create a vector with all four elements equal to F. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_ps (float __F)
+{
+  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps1 (float __F)
+{
+  return _mm_set1_ps (__F);
+}
+
+/* Create the vector [Z Y X W]. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
+}
+
+/* Create the vector [W X Y Z]. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_ps (float __Z, float __Y, float __X, float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
+}
+
+/* Store four SPFP values. The address must be 16-byte aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps (float *__P, __m128 __A)
+{
+  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
+}
+
+/* Store four SPFP values. The address need not be 16-byte aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_ps (float *__P, __m128 __A)
+{
+  *(__m128_u *)__P = __A;
+}
+
+/* Store four SPFP values in reverse order. The address must be aligned.
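+
+   E.g. storing {0,1,2,3} writes 3,2,1,0 to memory (hypothetical
+   values; a 16-byte aligned destination is assumed):
+
+     float out[4] __attribute__((aligned(16)));
+     _mm_storer_ps(out, _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f));
+     // out[] == {3.0f, 2.0f, 1.0f, 0.0f}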
*/ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storer_ps (float *__P, __m128 __A) +{ + __v4sf __tmp; + static const __vector unsigned char permute_vector = + { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16, + 0x17, 0x10, 0x11, 0x12, 0x13 }; + + __tmp = (__m128) vec_perm (__A, __A, permute_vector); + + _mm_store_ps (__P, __tmp); +} + +/* Store the lower SPFP value across four words. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store1_ps (float *__P, __m128 __A) +{ + __v4sf __va = vec_splat((__v4sf)__A, 0); + _mm_store_ps (__P, __va); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ps1 (float *__P, __m128 __A) +{ + _mm_store1_ps (__P, __A); +} + +/* Create a vector with element 0 as F and the rest zero. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_ss (float __F) +{ + return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f }; +} + +/* Sets the low SPFP value of A from the low value of B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + + return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask)); +} + +/* Create a vector with element 0 as *P and the rest zero. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ss (float const *__P) +{ + return _mm_set_ss (*__P); +} + +/* Stores the lower SPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ss (float *__P, __m128 __A) +{ + *__P = ((__v4sf)__A)[0]; +} + +/* Perform the respective operation on the lower SPFP (single-precision + floating-point) values of A and B; the upper three SPFP values are + passed through from A. */ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_ss (__m128 __A, __m128 __B) +{ +#ifdef _ARCH_PWR7 + __m128 a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + results. So to insure we don't generate spurious exceptions + (from the upper double values) we splat the lower double + before we to the operation. */ + a = vec_splat (__A, 0); + b = vec_splat (__B, 0); + c = a + b; + /* Then we merge the lower float result with the original upper + float elements from __A. */ + return (vec_sel (__A, c, mask)); +#else + __A[0] = __A[0] + __B[0]; + return (__A); +#endif +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_ss (__m128 __A, __m128 __B) +{ +#ifdef _ARCH_PWR7 + __m128 a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + results. So to insure we don't generate spurious exceptions + (from the upper double values) we splat the lower double + before we to the operation. */ + a = vec_splat (__A, 0); + b = vec_splat (__B, 0); + c = a - b; + /* Then we merge the lower float result with the original upper + float elements from __A. 
*/ + return (vec_sel (__A, c, mask)); +#else + __A[0] = __A[0] - __B[0]; + return (__A); +#endif +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_ss (__m128 __A, __m128 __B) +{ +#ifdef _ARCH_PWR7 + __m128 a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + results. So to insure we don't generate spurious exceptions + (from the upper double values) we splat the lower double + before we to the operation. */ + a = vec_splat (__A, 0); + b = vec_splat (__B, 0); + c = a * b; + /* Then we merge the lower float result with the original upper + float elements from __A. */ + return (vec_sel (__A, c, mask)); +#else + __A[0] = __A[0] * __B[0]; + return (__A); +#endif +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_ss (__m128 __A, __m128 __B) +{ +#ifdef _ARCH_PWR7 + __m128 a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + results. So to insure we don't generate spurious exceptions + (from the upper double values) we splat the lower double + before we to the operation. */ + a = vec_splat (__A, 0); + b = vec_splat (__B, 0); + c = a / b; + /* Then we merge the lower float result with the original upper + float elements from __A. */ + return (vec_sel (__A, c, mask)); +#else + __A[0] = __A[0] / __B[0]; + return (__A); +#endif +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_ss (__m128 __A) +{ + __m128 a, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + * results. So to insure we don't generate spurious exceptions + * (from the upper double values) we splat the lower double + * before we to the operation. */ + a = vec_splat (__A, 0); + c = vec_sqrt (a); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return (vec_sel (__A, c, mask)); +} + +/* Perform the respective operation on the four SPFP values in A and B. 
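+
+   In contrast to the _ss forms above, all four lanes participate; a
+   small sketch (hypothetical values):
+
+     __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // {1,2,3,4}
+     __m128 s = _mm_add_ps(a, a);                   // {2,4,6,8}
+     __m128 d = _mm_div_ps(s, _mm_set1_ps(2.0f));   // {1,2,3,4} again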
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A + (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A - (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A * (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A / (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_ps (__m128 __A) +{ + return (vec_sqrt ((__v4sf)__A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_ps (__m128 __A) +{ + return (vec_re ((__v4sf)__A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_ps (__m128 __A) +{ + return (vec_rsqrte (__A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_ss (__m128 __A) +{ + __m128 a, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + * results. So to insure we don't generate spurious exceptions + * (from the upper double values) we splat the lower double + * before we to the operation. */ + a = vec_splat (__A, 0); + c = _mm_rcp_ps (a); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return (vec_sel (__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_ss (__m128 __A) +{ + __m128 a, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower double) + * results. So to insure we don't generate spurious exceptions + * (from the upper double values) we splat the lower double + * before we to the operation. */ + a = vec_splat (__A, 0); + c = vec_rsqrte (a); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return (vec_sel (__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_ss (__m128 __A, __m128 __B) +{ + __v4sf a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower float) + * results. So to insure we don't generate spurious exceptions + * (from the upper float values) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf)__A, 0); + b = vec_splat ((__v4sf)__B, 0); + c = vec_min (a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return (vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_ss (__m128 __A, __m128 __B) +{ + __v4sf a, b, c; + static const __vector unsigned int mask = {0xffffffff, 0, 0, 0}; + /* PowerISA VSX does not allow partial (for just lower float) + * results. So to insure we don't generate spurious exceptions + * (from the upper float values) we splat the lower float + * before we to the operation. 
*/ + a = vec_splat (__A, 0); + b = vec_splat (__B, 0); + c = vec_max (a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return (vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_ps (__m128 __A, __m128 __B) +{ + __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A); + return vec_sel (__B, __A, m); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_ps (__m128 __A, __m128 __B) +{ + __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B); + return vec_sel (__B, __A, m); +} + +/* Perform logical bit-wise operations on 128-bit values. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B)); +// return __builtin_ia32_andps (__A, __B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B)); +} + +/* Perform a comparison on the four SPFP values of A and B. For each + element, if the comparison is true, place a mask of all ones in the + result, otherwise a mask of zeros. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_ps (__m128 __A, __m128 __B) +{ + __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B); + return ((__m128)vec_nor (temp, temp)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B)); +} + 
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_ps (__m128 __A, __m128 __B) +{ + return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_ps (__m128 __A, __m128 __B) +{ + __vector unsigned int a, b; + __vector unsigned int c, d; + static const __vector unsigned int float_exp_mask = + { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; + + a = (__vector unsigned int) vec_abs ((__v4sf)__A); + b = (__vector unsigned int) vec_abs ((__v4sf)__B); + c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a); + d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b); + return ((__m128 ) vec_and (c, d)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_ps (__m128 __A, __m128 __B) +{ + __vector unsigned int a, b; + __vector unsigned int c, d; + static const __vector unsigned int float_exp_mask = + { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 }; + + a = (__vector unsigned int) vec_abs ((__v4sf)__A); + b = (__vector unsigned int) vec_abs ((__v4sf)__B); + c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask); + d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask); + return ((__m128 ) vec_or (c, d)); +} + +/* Perform a comparison on the lower SPFP values of A and B. If the + comparison is true, place a mask of all ones in the result, otherwise a + mask of zeros. The upper three SPFP values are passed through from A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmpeq(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmplt(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. 
*/ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmple(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmpgt(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmpge(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmpeq(a, b); + c = vec_nor (c, c); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. */ + a = vec_splat ((__v4sf) __A, 0); + b = vec_splat ((__v4sf) __B, 0); + c = (__v4sf) vec_cmpge(a, b); + /* Then we merge the lower float result with the original upper + * float elements from __A. */ + return ((__m128)vec_sel ((__v4sf)__A, c, mask)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_ss (__m128 __A, __m128 __B) +{ + static const __vector unsigned int mask = + { 0xffffffff, 0, 0, 0 }; + __v4sf a, b, c; + /* PowerISA VMX does not allow partial (for just element 0) + * results. So to insure we don't generate spurious exceptions + * (from the upper elements) we splat the lower float + * before we to the operation. 
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpgt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A. */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to ensure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we do the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmple(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A. */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to ensure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we do the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmplt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A. */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ss (__m128 __A, __m128 __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
+  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
+  c = vec_and (c, d);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A. */
+  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ss (__m128 __A, __m128 __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
+  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
+  c = vec_or (c, d);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A. */
+  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
+}
+
+/* Compare the lower SPFP values of A and B and return 1 if true
+   and 0 if false.
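+   As an illustrative example (not upstream text): with __A = {3,9,9,9}
+   and __B = {2,0,0,0}, _mm_comigt_ss below returns 1; only element 0 of
+   each operand ever participates in these scalar compares.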
+   */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] == __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] < __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] <= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] > __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] >= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] != __B[0]);
+}
+
+/* FIXME
+ * The _mm_ucomi??_ss implementations below are exactly the same as
+ * _mm_comi??_ss because GCC for PowerPC only generates unordered
+ * compares (scalar and vector).
+ * Technically _mm_comieq_ss et al. should be using the ordered
+ * compare and signal for QNaNs.
+ * _mm_ucomieq_sd et al. should be OK as is.
+ */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] == __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] < __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] <= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] > __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] >= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] != __B[0]);
+}
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_f32 (__m128 __A)
+{
+  return ((__v4sf)__A)[0];
+}
+
+/* Convert the lower SPFP value to a 32-bit integer according to the current
+   rounding mode. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si32 (__m128 __A)
+{
+  __m64 res = 0;
+#ifdef _ARCH_PWR8
+  double dtmp;
+  __asm__(
+#ifdef __LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
+      "fctiw %2,%2;\n"
+      "mfvsrd %1,%x2;\n"
+      : "+wa" (__A),
+        "=r" (res),
+        "=f" (dtmp)
+      : );
+#else
+  res = __builtin_rint(__A[0]);
+#endif
+  return (res);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ss2si (__m128 __A)
+{
+  return _mm_cvtss_si32 (__A);
+}
+
+/* Convert the lower SPFP value to a 64-bit integer according to the
+   current rounding mode. */
+
+/* Intel intrinsic.
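+   An explanatory note on the _ARCH_PWR8 asm below (a reading of the
+   code, not upstream text): on little-endian, xxsldwi rotates element 0
+   into the doubleword position the scalar converts read, xscvspdp widens
+   it to double precision, fctid rounds to a 64-bit integer under the
+   current rounding mode, and mfvsrd moves the result into a GPR.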
+   */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64 (__m128 __A)
+{
+  __m64 res = 0;
+#ifdef _ARCH_PWR8
+  double dtmp;
+  __asm__(
+#ifdef __LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
+      "fctid %2,%2;\n"
+      "mfvsrd %1,%x2;\n"
+      : "+wa" (__A),
+        "=r" (res),
+        "=f" (dtmp)
+      : );
+#else
+  res = __builtin_llrint(__A[0]);
+#endif
+  return (res);
+}
+
+/* Microsoft intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64x (__m128 __A)
+{
+  return _mm_cvtss_si64 ((__v4sf) __A);
+}
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint
+{
+  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
+  _MM_HINT_ET0 = 7,
+  _MM_HINT_ET1 = 6,
+  _MM_HINT_T0 = 3,
+  _MM_HINT_T1 = 2,
+  _MM_HINT_T2 = 1,
+  _MM_HINT_NTA = 0
+};
+
+/* Loads one cache line from address P to a location "closer" to the
+   processor. The selector I specifies the type of prefetch operation. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_prefetch (const void *__P, enum _mm_hint __I)
+{
+  /* Current PowerPC hardware ignores the hint parameter. */
+  __builtin_prefetch (__P);
+}
+
+/* Convert the two lower SPFP values to 32-bit integers according to the
+   current rounding mode. Return the integers in packed form. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi32 (__m128 __A)
+{
+  __v4sf temp, rounded;
+  __vector unsigned long long result;
+
+  /* Splat two lower SPFP values to both halves. */
+  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
+  rounded = vec_rint(temp);
+  result = (__vector unsigned long long) vec_cts (rounded, 0);
+
+  return (__m64) ((__vector long long) result)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ps2pi (__m128 __A)
+{
+  return _mm_cvtps_pi32 (__A);
+}
+
+/* Truncate the lower SPFP value to a 32-bit integer. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si32 (__m128 __A)
+{
+  /* Extract the lower float element. */
+  float temp = __A[0];
+  /* truncate to 32-bit integer and return. */
+  return temp;
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ss2si (__m128 __A)
+{
+  return _mm_cvttss_si32 (__A);
+}
+
+/* Intel intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64 (__m128 __A)
+{
+  /* Extract the lower float element. */
+  float temp = __A[0];
+  /* truncate to 64-bit integer and return. */
+  return temp;
+}
+
+/* Microsoft intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64x (__m128 __A)
+{
+  /* Extract the lower float element. */
+  float temp = __A[0];
+  /* truncate to 64-bit integer and return. */
+  return temp;
+}
+
+/* Truncate the two lower SPFP values to 32-bit integers. Return the
+   integers in packed form. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_pi32 (__m128 __A)
+{
+  __v4sf temp;
+  __vector unsigned long long result;
+
+  /* Splat two lower SPFP values to both halves.
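+     (Explanatory note, not upstream text: unlike _mm_cvtps_pi32 above
+     there is no vec_rint here; vec_cts rounds toward zero, which is
+     exactly the truncating behavior this intrinsic requires.)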
*/ + temp = (__v4sf) vec_splat ((__vector long long)__A, 0); + result = (__vector unsigned long long) vec_cts (temp, 0); + + return (__m64) ((__vector long long) result)[0]; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_ps2pi (__m128 __A) +{ + return _mm_cvttps_pi32 (__A); +} + +/* Convert B to a SPFP value and insert it as element zero in A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_ss (__m128 __A, int __B) +{ + float temp = __B; + __A[0] = temp; + + return __A; +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_si2ss (__m128 __A, int __B) +{ + return _mm_cvtsi32_ss (__A, __B); +} + +/* Convert B to a SPFP value and insert it as element zero in A. */ +/* Intel intrinsic. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_ss (__m128 __A, long long __B) +{ + float temp = __B; + __A[0] = temp; + + return __A; +} + +/* Microsoft intrinsic. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_ss (__m128 __A, long long __B) +{ + return _mm_cvtsi64_ss (__A, __B); +} + +/* Convert the two 32-bit values in B to SPFP form and insert them + as the two lower elements in A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi32_ps (__m128 __A, __m64 __B) +{ + __vector signed int vm1; + __vector float vf1; + + vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B}; + vf1 = (__vector float) vec_ctf (vm1, 0); + + return ((__m128) (__vector unsigned long long) + { ((__vector unsigned long long)vf1) [0], + ((__vector unsigned long long)__A) [1]}); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_pi2ps (__m128 __A, __m64 __B) +{ + return _mm_cvtpi32_ps (__A, __B); +} + +/* Convert the four signed 16-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi16_ps (__m64 __A) +{ + __vector signed short vs8; + __vector signed int vi4; + __vector float vf1; + + vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A }; + vi4 = vec_vupklsh (vs8); + vf1 = (__vector float) vec_ctf (vi4, 0); + + return (__m128) vf1; +} + +/* Convert the four unsigned 16-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpu16_ps (__m64 __A) +{ + const __vector unsigned short zero = + { 0, 0, 0, 0, 0, 0, 0, 0 }; + __vector unsigned short vs8; + __vector unsigned int vi4; + __vector float vf1; + + vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A }; + vi4 = (__vector unsigned int) vec_mergel +#ifdef __LITTLE_ENDIAN__ + (vs8, zero); +#else + (zero, vs8); +#endif + vf1 = (__vector float) vec_ctf (vi4, 0); + + return (__m128) vf1; +} + +/* Convert the low four signed 8-bit values in A to SPFP form. 
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi8_ps (__m64 __A) +{ + __vector signed char vc16; + __vector signed short vs8; + __vector signed int vi4; + __vector float vf1; + + vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A }; + vs8 = vec_vupkhsb (vc16); + vi4 = vec_vupkhsh (vs8); + vf1 = (__vector float) vec_ctf (vi4, 0); + + return (__m128) vf1; +} + +/* Convert the low four unsigned 8-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + +_mm_cvtpu8_ps (__m64 __A) +{ + const __vector unsigned char zero = + { 0, 0, 0, 0, 0, 0, 0, 0 }; + __vector unsigned char vc16; + __vector unsigned short vs8; + __vector unsigned int vi4; + __vector float vf1; + + vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A }; +#ifdef __LITTLE_ENDIAN__ + vs8 = (__vector unsigned short) vec_mergel (vc16, zero); + vi4 = (__vector unsigned int) vec_mergeh (vs8, + (__vector unsigned short) zero); +#else + vs8 = (__vector unsigned short) vec_mergel (zero, vc16); + vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero, + vs8); +#endif + vf1 = (__vector float) vec_ctf (vi4, 0); + + return (__m128) vf1; +} + +/* Convert the four signed 32-bit values in A and B to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi32x2_ps (__m64 __A, __m64 __B) +{ + __vector signed int vi4; + __vector float vf4; + + vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B }; + vf4 = (__vector float) vec_ctf (vi4, 0); + return (__m128) vf4; +} + +/* Convert the four SPFP values in A to four signed 16-bit integers. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pi16 (__m128 __A) +{ + __v4sf rounded; + __vector signed int temp; + __vector unsigned long long result; + + rounded = vec_rint(__A); + temp = vec_cts (rounded, 0); + result = (__vector unsigned long long) vec_pack (temp, temp); + + return (__m64) ((__vector long long) result)[0]; +} + +/* Convert the four SPFP values in A to four signed 8-bit integers. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pi8 (__m128 __A) +{ + __v4sf rounded; + __vector signed int tmp_i; + static const __vector signed int zero = {0, 0, 0, 0}; + __vector signed short tmp_s; + __vector signed char res_v; + + rounded = vec_rint(__A); + tmp_i = vec_cts (rounded, 0); + tmp_s = vec_pack (tmp_i, zero); + res_v = vec_pack (tmp_s, tmp_s); + return (__m64) ((__vector long long) res_v)[0]; +} + +/* Selects four specific SPFP values from A and B based on MASK. 
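+   For example (illustrative values, not upstream text): __mask = 0x1B
+   yields {__A[3], __A[2], __B[1], __B[0]}; each two-bit field of the
+   mask selects one element, the low two fields picking from A and the
+   high two from B.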
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + +_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) +{ + unsigned long element_selector_10 = __mask & 0x03; + unsigned long element_selector_32 = (__mask >> 2) & 0x03; + unsigned long element_selector_54 = (__mask >> 4) & 0x03; + unsigned long element_selector_76 = (__mask >> 6) & 0x03; + static const unsigned int permute_selectors[4] = + { +#ifdef __LITTLE_ENDIAN__ + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C +#else + 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F +#endif + }; + __vector unsigned int t; + + t[0] = permute_selectors[element_selector_10]; + t[1] = permute_selectors[element_selector_32]; + t[2] = permute_selectors[element_selector_54] + 0x10101010; + t[3] = permute_selectors[element_selector_76] + 0x10101010; + return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t); +} + +/* Selects and interleaves the upper two SPFP values from A and B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_ps (__m128 __A, __m128 __B) +{ + return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B); +} + +/* Selects and interleaves the lower two SPFP values from A and B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_ps (__m128 __A, __m128 __B) +{ + return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B); +} + +/* Sets the upper two SPFP values with 64-bits of data loaded from P; + the lower two values are passed through from A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadh_pi (__m128 __A, __m64 const *__P) +{ + __vector unsigned long long __a = (__vector unsigned long long)__A; + __vector unsigned long long __p = vec_splats(*__P); + __a [1] = __p [1]; + + return (__m128)__a; +} + +/* Stores the upper two SPFP values of A into P. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeh_pi (__m64 *__P, __m128 __A) +{ + __vector unsigned long long __a = (__vector unsigned long long) __A; + + *__P = __a[1]; +} + +/* Moves the upper two values of B into the lower two values of A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movehl_ps (__m128 __A, __m128 __B) +{ + return (__m128) vec_mergel ((__vector unsigned long long)__B, + (__vector unsigned long long)__A); +} + +/* Moves the lower two values of B into the upper two values of A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movelh_ps (__m128 __A, __m128 __B) +{ + return (__m128) vec_mergeh ((__vector unsigned long long)__A, + (__vector unsigned long long)__B); +} + +/* Sets the lower two SPFP values with 64-bits of data loaded from P; + the upper two values are passed through from A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_pi (__m128 __A, __m64 const *__P) +{ + __vector unsigned long long __a = (__vector unsigned long long)__A; + __vector unsigned long long __p = vec_splats(*__P); + __a [0] = __p [0]; + + return (__m128)__a; +} + +/* Stores the lower two SPFP values of A into P. 
*/ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_pi (__m64 *__P, __m128 __A) +{ + __vector unsigned long long __a = (__vector unsigned long long) __A; + + *__P = __a[0]; +} + +#ifdef _ARCH_PWR8 +/* Intrinsic functions that require PowerISA 2.07 minimum. */ + +/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_ps (__m128 __A) +{ + __vector unsigned long long result; + static const __vector unsigned int perm_mask = + { +#ifdef __LITTLE_ENDIAN__ + 0x00204060, 0x80808080, 0x80808080, 0x80808080 +#else + 0x80808080, 0x80808080, 0x80808080, 0x00204060 +#endif + }; + + result = ((__vector unsigned long long) + vec_vbpermq ((__vector unsigned char) __A, + (__vector unsigned char) perm_mask)); + +#ifdef __LITTLE_ENDIAN__ + return result[1]; +#else + return result[0]; +#endif +} +#endif /* _ARCH_PWR8 */ + +/* Create a vector with all four elements equal to *P. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load1_ps (float const *__P) +{ + return _mm_set1_ps (*__P); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ps1 (float const *__P) +{ + return _mm_load1_ps (__P); +} + +/* Extracts one of the four words of A. The selector N must be immediate. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_pi16 (__m64 const __A, int const __N) +{ + unsigned int shiftr = __N & 3; +#ifdef __BIG_ENDIAN__ + shiftr = 3 - shiftr; +#endif + + return ((__A >> (shiftr * 16)) & 0xffff); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pextrw (__m64 const __A, int const __N) +{ + return _mm_extract_pi16 (__A, __N); +} + +/* Inserts word D into one of four words of A. The selector N must be + immediate. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_pi16 (__m64 const __A, int const __D, int const __N) +{ + const int shiftl = (__N & 3) * 16; + const __m64 shiftD = (const __m64) __D << shiftl; + const __m64 mask = 0xffffUL << shiftl; + __m64 result = (__A & (~mask)) | (shiftD & mask); + + return (result); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pinsrw (__m64 const __A, int const __D, int const __N) +{ + return _mm_insert_pi16 (__A, __D, __N); +} + +/* Compute the element-wise maximum of signed 16-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + +_mm_max_pi16 (__m64 __A, __m64 __B) +{ +#if _ARCH_PWR8 + __vector signed short a, b, r; + __vector __bool short c; + + a = (__vector signed short)vec_splats (__A); + b = (__vector signed short)vec_splats (__B); + c = (__vector __bool short)vec_cmpgt (a, b); + r = vec_sel (b, a, c); + return (__m64) ((__vector long long) r)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __A; + m2.as_m64 = __B; + + res.as_short[0] = + (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; + res.as_short[1] = + (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; + res.as_short[2] = + (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; + res.as_short[3] = + (m1.as_short[3] > m2.as_short[3]) ? 
m1.as_short[3] : m2.as_short[3]; + + return (__m64) res.as_m64; +#endif +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmaxsw (__m64 __A, __m64 __B) +{ + return _mm_max_pi16 (__A, __B); +} + +/* Compute the element-wise maximum of unsigned 8-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_pu8 (__m64 __A, __m64 __B) +{ +#if _ARCH_PWR8 + __vector unsigned char a, b, r; + __vector __bool char c; + + a = (__vector unsigned char)vec_splats (__A); + b = (__vector unsigned char)vec_splats (__B); + c = (__vector __bool char)vec_cmpgt (a, b); + r = vec_sel (b, a, c); + return (__m64) ((__vector long long) r)[0]; +#else + __m64_union m1, m2, res; + long i; + + m1.as_m64 = __A; + m2.as_m64 = __B; + + + for (i = 0; i < 8; i++) + res.as_char[i] = + ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ? + m1.as_char[i] : m2.as_char[i]; + + return (__m64) res.as_m64; +#endif +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmaxub (__m64 __A, __m64 __B) +{ + return _mm_max_pu8 (__A, __B); +} + +/* Compute the element-wise minimum of signed 16-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pi16 (__m64 __A, __m64 __B) +{ +#if _ARCH_PWR8 + __vector signed short a, b, r; + __vector __bool short c; + + a = (__vector signed short)vec_splats (__A); + b = (__vector signed short)vec_splats (__B); + c = (__vector __bool short)vec_cmplt (a, b); + r = vec_sel (b, a, c); + return (__m64) ((__vector long long) r)[0]; +#else + __m64_union m1, m2, res; + + m1.as_m64 = __A; + m2.as_m64 = __B; + + res.as_short[0] = + (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0]; + res.as_short[1] = + (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1]; + res.as_short[2] = + (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2]; + res.as_short[3] = + (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3]; + + return (__m64) res.as_m64; +#endif +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pminsw (__m64 __A, __m64 __B) +{ + return _mm_min_pi16 (__A, __B); +} + +/* Compute the element-wise minimum of unsigned 8-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pu8 (__m64 __A, __m64 __B) +{ +#if _ARCH_PWR8 + __vector unsigned char a, b, r; + __vector __bool char c; + + a = (__vector unsigned char)vec_splats (__A); + b = (__vector unsigned char)vec_splats (__B); + c = (__vector __bool char)vec_cmplt (a, b); + r = vec_sel (b, a, c); + return (__m64) ((__vector long long) r)[0]; +#else + __m64_union m1, m2, res; + long i; + + m1.as_m64 = __A; + m2.as_m64 = __B; + + + for (i = 0; i < 8; i++) + res.as_char[i] = + ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ? + m1.as_char[i] : m2.as_char[i]; + + return (__m64) res.as_m64; +#endif +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pminub (__m64 __A, __m64 __B) +{ + return _mm_min_pu8 (__A, __B); +} + +/* Create an 8-bit mask of the signs of 8-bit values. 
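+   An explanatory note on the bpermd trick below (a reading of the code,
+   not upstream text): each byte of the permute-control constant names one
+   bit of __A in big-endian bit order (bit 0 is the MSB), so the selectors
+   0x00,0x08,...,0x38 gather the eight per-byte sign bits into the low
+   byte of the result.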
*/ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_pi8 (__m64 __A) +{ + unsigned long long p = +#ifdef __LITTLE_ENDIAN__ + 0x0008101820283038UL; // permute control for sign bits +#else + 0x3830282018100800UL; // permute control for sign bits +#endif + return __builtin_bpermd (p, __A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmovmskb (__m64 __A) +{ + return _mm_movemask_pi8 (__A); +} + +/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values + in B and produce the high 16 bits of the 32-bit results. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_pu16 (__m64 __A, __m64 __B) +{ + __vector unsigned short a, b; + __vector unsigned short c; + __vector unsigned int w0, w1; + __vector unsigned char xform1 = { +#ifdef __LITTLE_ENDIAN__ + 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, + 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F +#else + 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, + 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15 +#endif + }; + + a = (__vector unsigned short)vec_splats (__A); + b = (__vector unsigned short)vec_splats (__B); + + w0 = vec_vmuleuh (a, b); + w1 = vec_vmulouh (a, b); + c = (__vector unsigned short)vec_perm (w0, w1, xform1); + + return (__m64) ((__vector long long) c)[0]; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmulhuw (__m64 __A, __m64 __B) +{ + return _mm_mulhi_pu16 (__A, __B); +} + +/* Return a combination of the four 16-bit values in A. The selector + must be an immediate. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi16 (__m64 __A, int const __N) +{ + unsigned long element_selector_10 = __N & 0x03; + unsigned long element_selector_32 = (__N >> 2) & 0x03; + unsigned long element_selector_54 = (__N >> 4) & 0x03; + unsigned long element_selector_76 = (__N >> 6) & 0x03; + static const unsigned short permute_selectors[4] = + { +#ifdef __LITTLE_ENDIAN__ + 0x0908, 0x0B0A, 0x0D0C, 0x0F0E +#else + 0x0607, 0x0405, 0x0203, 0x0001 +#endif + }; + __m64_union t; + __vector unsigned long long a, p, r; + +#ifdef __LITTLE_ENDIAN__ + t.as_short[0] = permute_selectors[element_selector_10]; + t.as_short[1] = permute_selectors[element_selector_32]; + t.as_short[2] = permute_selectors[element_selector_54]; + t.as_short[3] = permute_selectors[element_selector_76]; +#else + t.as_short[3] = permute_selectors[element_selector_10]; + t.as_short[2] = permute_selectors[element_selector_32]; + t.as_short[1] = permute_selectors[element_selector_54]; + t.as_short[0] = permute_selectors[element_selector_76]; +#endif + p = vec_splats (t.as_m64); + a = vec_splats (__A); + r = vec_perm (a, a, (__vector unsigned char)p); + return (__m64) ((__vector long long) r)[0]; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pshufw (__m64 __A, int const __N) +{ + return _mm_shuffle_pi16 (__A, __N); +} + +/* Conditionally store byte elements of A into P. The high bit of each + byte in the selector N determines whether the corresponding byte from + A is stored. 
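+   For example (illustrative values, not upstream text): with
+   __N = 0x0000000000008080UL only the two low-order bytes of A are
+   written; all other bytes at P keep their previous contents.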
+   */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
+{
+  __m64 hibit = 0x8080808080808080UL;
+  __m64 mask, tmp;
+  __m64 *p = (__m64*)__P;
+
+  tmp = *p;
+  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
+  tmp = (tmp & (~mask)) | (__A & mask);
+  *p = tmp;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_maskmovq (__m64 __A, __m64 __N, char *__P)
+{
+  _mm_maskmove_si64 (__A, __N, __P);
+}
+
+/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu8 (__m64 __A, __m64 __B)
+{
+  __vector unsigned char a, b, c;
+
+  a = (__vector unsigned char)vec_splats (__A);
+  b = (__vector unsigned char)vec_splats (__B);
+  c = vec_avg (a, b);
+  return (__m64) ((__vector long long) c)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgb (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu8 (__A, __B);
+}
+
+/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu16 (__m64 __A, __m64 __B)
+{
+  __vector unsigned short a, b, c;
+
+  a = (__vector unsigned short)vec_splats (__A);
+  b = (__vector unsigned short)vec_splats (__B);
+  c = vec_avg (a, b);
+  return (__m64) ((__vector long long) c)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgw (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu16 (__A, __B);
+}
+
+/* Compute the sum of the absolute differences of the unsigned 8-bit
+   values in A and B. Return the value in the lower 16-bit word; the
+   upper words are cleared. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_pu8 (__m64 __A, __m64 __B)
+{
+  __vector unsigned char a, b;
+  __vector unsigned char vmin, vmax, vabsdiff;
+  __vector signed int vsum;
+  const __vector unsigned int zero =
+    { 0, 0, 0, 0 };
+  __m64_union result = {0};
+
+  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
+  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
+  vmin = vec_min (a, b);
+  vmax = vec_max (a, b);
+  vabsdiff = vec_sub (vmax, vmin);
+  /* Sum four groups of bytes into integers. */
+  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
+  /* Sum across four integers with integer result. */
+  vsum = vec_sums (vsum, (__vector signed int) zero);
+  /* The sum is in the rightmost 32 bits of the vector result.
+     Transfer to a GPR and truncate to 16 bits. */
+  result.as_short[0] = vsum[3];
+  return result.as_m64;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psadbw (__m64 __A, __m64 __B)
+{
+  return _mm_sad_pu8 (__A, __B);
+}
+
+/* Stores the data in A to the address P without polluting the caches. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pi (__m64 *__P, __m64 __A)
+{
+  /* Use the data cache block touch for store transient. */
+  __asm__ (
+    "  dcbtstt 0,%0"
+    :
+    : "b" (__P)
+    : "memory"
+    );
+  *__P = __A;
+}
+
+/* Likewise. The address must be 16-byte aligned.
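+   (Explanatory note, not upstream text: as in _mm_stream_pi above,
+   dcbtstt merely hints that the cache block is transient; the store
+   itself is the ordinary _mm_store_ps below.)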
+   */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_ps (float *__P, __m128 __A)
+{
+  /* Use the data cache block touch for store transient. */
+  __asm__ (
+    "  dcbtstt 0,%0"
+    :
+    : "b" (__P)
+    : "memory"
+    );
+  _mm_store_ps (__P, __A);
+}
+
+/* Guarantees that every preceding store is globally visible before
+   any subsequent store. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sfence (void)
+{
+  /* Generate a lightweight sync. */
+  __atomic_thread_fence (__ATOMIC_RELEASE);
+}
+
+/* The execution of the next instruction is delayed by an
+   implementation-specific amount of time. The instruction does not
+   modify the architectural state. This is after the pop_options pragma
+   because it does not require SSE support in the processor--the encoding
+   is a nop on processors that do not support it. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_pause (void)
+{
+  /* There is no exact match with this construct, but the following is
+     close to the desired effect. */
+#if _ARCH_PWR8
+  /* On power8 and later processors we can depend on Program Priority
+     (PRI) and the associated "very low" PRI setting. Since we don't know
+     what PRI this thread is running at we: 1) save the current PRI
+     from the PPR SPR into a local GPR, 2) set the PRI to "very low"
+     via the special or 31,31,31 encoding, 3) issue an "isync" to
+     ensure the PRI change takes effect before we execute any more
+     instructions.
+     Now we can execute a lwsync (release barrier) while we execute
+     this thread at "very low" PRI. Finally we restore the original
+     PRI and continue execution. */
+  unsigned long __PPR;
+
+  __asm__ volatile (
+    "  mfppr %0;"
+    "  or 31,31,31;"
+    "  isync;"
+    "  lwsync;"
+    "  isync;"
+    "  mtppr %0;"
+    : "=r" (__PPR)
+    :
+    : "memory"
+    );
+#else
+  /* For older processors where we may not even have Program Priority
+     controls we can only depend on Heavy Weight Sync. */
+  __atomic_thread_fence (__ATOMIC_SEQ_CST);
+#endif
+}
+
+/* Transpose the 4x4 matrix composed of row[0-3]. */
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
+  __v4sf __t0 = vec_vmrghw (__r0, __r1); \
+  __v4sf __t1 = vec_vmrghw (__r2, __r3); \
+  __v4sf __t2 = vec_vmrglw (__r0, __r1); \
+  __v4sf __t3 = vec_vmrglw (__r2, __r3); \
+  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
+			       (__vector long long)__t1); \
+  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
+			       (__vector long long)__t1); \
+  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
+			       (__vector long long)__t3); \
+  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
+			       (__vector long long)__t3); \
+} while (0)
+
+/* For backward source compatibility.
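+
+   An illustrative use of the _MM_TRANSPOSE4_PS macro above (the row
+   values here are hypothetical, not upstream text):
+
+     __v4sf r0 = {0,1,2,3}, r1 = {4,5,6,7},
+            r2 = {8,9,10,11}, r3 = {12,13,14,15};
+     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
+
+   afterwards r0 = {0,4,8,12}, r1 = {1,5,9,13}, r2 = {2,6,10,14},
+   r3 = {3,7,11,15}.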
*/ +//# include + +#endif /* _XMMINTRIN_H_INCLUDED */ diff --git a/lib/include/prfchwintrin.h b/lib/include/prfchwintrin.h index 70851396f..6e8a4ef2e 100644 --- a/lib/include/prfchwintrin.h +++ b/lib/include/prfchwintrin.h @@ -1,22 +1,8 @@ /*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/ptwriteintrin.h b/lib/include/ptwriteintrin.h index 1bb1df0a2..0a04f7c1d 100644 --- a/lib/include/ptwriteintrin.h +++ b/lib/include/ptwriteintrin.h @@ -1,22 +1,8 @@ /*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/rdseedintrin.h b/lib/include/rdseedintrin.h index 419466932..ccb3d2dd2 100644 --- a/lib/include/rdseedintrin.h +++ b/lib/include/rdseedintrin.h @@ -1,22 +1,8 @@ /*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/rtmintrin.h b/lib/include/rtmintrin.h index e6a58d743..36ff58351 100644 --- a/lib/include/rtmintrin.h +++ b/lib/include/rtmintrin.h @@ -1,22 +1,8 @@ /*===---- rtmintrin.h - RTM intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/s390intrin.h b/lib/include/s390intrin.h index d51274c07..73a915c23 100644 --- a/lib/include/s390intrin.h +++ b/lib/include/s390intrin.h @@ -1,22 +1,8 @@ /*===---- s390intrin.h - SystemZ intrinsics --------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/sgxintrin.h b/lib/include/sgxintrin.h index 20aee7661..303a21f6b 100644 --- a/lib/include/sgxintrin.h +++ b/lib/include/sgxintrin.h @@ -1,22 +1,8 @@ /*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -28,6 +14,8 @@ #ifndef __SGXINTRIN_H #define __SGXINTRIN_H +#if __has_extension(gnu_asm) + /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("sgx"))) @@ -67,4 +55,6 @@ _enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) #undef __DEFAULT_FN_ATTRS +#endif /* __has_extension(gnu_asm) */ + #endif diff --git a/lib/include/shaintrin.h b/lib/include/shaintrin.h index 3df4718ce..08b1fb1dc 100644 --- a/lib/include/shaintrin.h +++ b/lib/include/shaintrin.h @@ -1,22 +1,8 @@ /*===---- shaintrin.h - SHA intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/smmintrin.h b/lib/include/smmintrin.h index 4806b3e4e..025830a74 100644 --- a/lib/include/smmintrin.h +++ b/lib/include/smmintrin.h @@ -1,22 +1,8 @@ /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stdalign.h b/lib/include/stdalign.h index 3738d1284..6ad25db45 100644 --- a/lib/include/stdalign.h +++ b/lib/include/stdalign.h @@ -1,22 +1,8 @@ /*===---- stdalign.h - Standard header for alignment ------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stdarg.h b/lib/include/stdarg.h index 101426fff..0bc39408c 100644 --- a/lib/include/stdarg.h +++ b/lib/include/stdarg.h @@ -1,24 +1,8 @@ /*===---- stdarg.h - Variable argument handling ----------------------------=== * - * Copyright (c) 2008 Eli Friedman - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stdatomic.h b/lib/include/stdatomic.h index b4845a74e..665551ea6 100644 --- a/lib/include/stdatomic.h +++ b/lib/include/stdatomic.h @@ -1,22 +1,8 @@ /*===---- stdatomic.h - Standard header for atomic types and operations -----=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stdbool.h b/lib/include/stdbool.h index 5cb66b55d..2525363dd 100644 --- a/lib/include/stdbool.h +++ b/lib/include/stdbool.h @@ -1,24 +1,8 @@ /*===---- stdbool.h - Standard header for booleans -------------------------=== * - * Copyright (c) 2008 Eli Friedman - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stddef.h b/lib/include/stddef.h index 735499671..15acd4427 100644 --- a/lib/include/stddef.h +++ b/lib/include/stddef.h @@ -1,24 +1,8 @@ /*===---- stddef.h - Basic type definitions --------------------------------=== * - * Copyright (c) 2008 Eli Friedman - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/stdint.h b/lib/include/stdint.h index 0afcca3a9..192f653e9 100644 --- a/lib/include/stdint.h +++ b/lib/include/stdint.h @@ -1,29 +1,18 @@ /*===---- stdint.h - Standard header for sized integer types --------------===*\ * - * Copyright (c) 2009 Chris Lattner - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ #ifndef __CLANG_STDINT_H +// AIX system headers need stdint.h to be re-enterable while _STD_TYPES_T +// is defined until an inclusion of it without _STD_TYPES_T occurs, in which +// case the header guard macro is defined. +#if !defined(_AIX) || !defined(_STD_TYPES_T) || !defined(__STDC_HOSTED__) #define __CLANG_STDINT_H +#endif /* If we're hosted, fall back to the system's stdint.h, which might have * additional definitions. diff --git a/lib/include/stdnoreturn.h b/lib/include/stdnoreturn.h index a7a301d7e..e83cd8153 100644 --- a/lib/include/stdnoreturn.h +++ b/lib/include/stdnoreturn.h @@ -1,22 +1,8 @@ /*===---- stdnoreturn.h - Standard header for noreturn macro ---------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/tbmintrin.h b/lib/include/tbmintrin.h index 1d0d746a8..f4e848a1c 100644 --- a/lib/include/tbmintrin.h +++ b/lib/include/tbmintrin.h @@ -1,22 +1,8 @@ /*===---- tbmintrin.h - TBM intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/tgmath.h b/lib/include/tgmath.h index 34e26dcc0..7acf18b9d 100644 --- a/lib/include/tgmath.h +++ b/lib/include/tgmath.h @@ -1,24 +1,8 @@ /*===---- tgmath.h - Standard header for type generic math ----------------===*\ * - * Copyright (c) 2009 Howard Hinnant - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * \*===----------------------------------------------------------------------===*/ diff --git a/lib/include/tmmintrin.h b/lib/include/tmmintrin.h index 734cd391b..35533e115 100644 --- a/lib/include/tmmintrin.h +++ b/lib/include/tmmintrin.h @@ -1,22 +1,8 @@ /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/unwind.h b/lib/include/unwind.h index 0e8317e5b..029524b7b 100644 --- a/lib/include/unwind.h +++ b/lib/include/unwind.h @@ -1,22 +1,8 @@ /*===---- unwind.h - Stack unwinding ----------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -66,8 +52,8 @@ extern "C" { #pragma GCC visibility push(default) #endif -typedef uintptr_t _Unwind_Word; -typedef intptr_t _Unwind_Sword; +typedef uintptr_t _Unwind_Word __attribute__((__mode__(__unwind_word__))); +typedef intptr_t _Unwind_Sword __attribute__((__mode__(__unwind_word__))); typedef uintptr_t _Unwind_Ptr; typedef uintptr_t _Unwind_Internal_Ptr; typedef uint64_t _Unwind_Exception_Class; diff --git a/lib/include/vadefs.h b/lib/include/vadefs.h index 7fe9a74e3..b61756844 100644 --- a/lib/include/vadefs.h +++ b/lib/include/vadefs.h @@ -1,22 +1,8 @@ /* ===-------- vadefs.h ---------------------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/vaesintrin.h b/lib/include/vaesintrin.h index e4174bb82..c4d5c3e75 100644 --- a/lib/include/vaesintrin.h +++ b/lib/include/vaesintrin.h @@ -1,23 +1,9 @@ /*===------------------ vaesintrin.h - VAES intrinsics ---------------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/varargs.h b/lib/include/varargs.h index b5477d0a6..d241b7de3 100644 --- a/lib/include/varargs.h +++ b/lib/include/varargs.h @@ -1,22 +1,8 @@ /*===---- varargs.h - Variable argument handling -------------------------------------=== * -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -* THE SOFTWARE. +* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +* See https://llvm.org/LICENSE.txt for license information. +* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/vecintrin.h b/lib/include/vecintrin.h index e62738983..c71b76a3e 100644 --- a/lib/include/vecintrin.h +++ b/lib/include/vecintrin.h @@ -1,22 +1,8 @@ /*===---- vecintrin.h - Vector intrinsics ----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -531,6 +517,141 @@ vec_bperm_u128(vector unsigned char __a, vector unsigned char __b) { } #endif +/*-- vec_revb ---------------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector signed short +vec_revb(vector signed short __vec) { + return (vector signed short) + __builtin_s390_vlbrh((vector unsigned short)__vec); +} + +static inline __ATTRS_o_ai vector unsigned short +vec_revb(vector unsigned short __vec) { + return __builtin_s390_vlbrh(__vec); +} + +static inline __ATTRS_o_ai vector signed int +vec_revb(vector signed int __vec) { + return (vector signed int) + __builtin_s390_vlbrf((vector unsigned int)__vec); +} + +static inline __ATTRS_o_ai vector unsigned int +vec_revb(vector unsigned int __vec) { + return __builtin_s390_vlbrf(__vec); +} + +static inline __ATTRS_o_ai vector signed long long +vec_revb(vector signed long long __vec) { + return (vector signed long long) + __builtin_s390_vlbrg((vector unsigned long long)__vec); +} + +static inline __ATTRS_o_ai vector unsigned long long +vec_revb(vector unsigned long long __vec) { + return __builtin_s390_vlbrg(__vec); +} + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_revb(vector float __vec) { + return (vector float) + __builtin_s390_vlbrf((vector unsigned int)__vec); +} +#endif + +static inline __ATTRS_o_ai vector double +vec_revb(vector double __vec) { + return (vector double) + __builtin_s390_vlbrg((vector unsigned long long)__vec); +} + +/*-- vec_reve ---------------------------------------------------------------*/ + +static inline __ATTRS_o_ai vector signed char +vec_reve(vector signed char __vec) { + return (vector signed char) { __vec[15], __vec[14], __vec[13], __vec[12], + __vec[11], __vec[10], __vec[9], __vec[8], + __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector unsigned char +vec_reve(vector unsigned char __vec) { + return (vector unsigned char) { __vec[15], __vec[14], __vec[13], __vec[12], + __vec[11], __vec[10], __vec[9], __vec[8], + __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector bool char +vec_reve(vector bool char __vec) { + return (vector bool char) { __vec[15], __vec[14], __vec[13], __vec[12], + __vec[11], __vec[10], __vec[9], __vec[8], + __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector signed short +vec_reve(vector signed short __vec) { + return (vector signed short) { __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector unsigned short +vec_reve(vector unsigned short __vec) { + return (vector unsigned short) { __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector bool short +vec_reve(vector bool short __vec) { + return (vector bool short) { __vec[7], __vec[6], __vec[5], __vec[4], + __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector signed int +vec_reve(vector signed int __vec) { + return (vector signed int) { __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector unsigned int +vec_reve(vector unsigned int __vec) { + return (vector unsigned int) { __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static 
inline __ATTRS_o_ai vector bool int +vec_reve(vector bool int __vec) { + return (vector bool int) { __vec[3], __vec[2], __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector signed long long +vec_reve(vector signed long long __vec) { + return (vector signed long long) { __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector unsigned long long +vec_reve(vector unsigned long long __vec) { + return (vector unsigned long long) { __vec[1], __vec[0] }; +} + +static inline __ATTRS_o_ai vector bool long long +vec_reve(vector bool long long __vec) { + return (vector bool long long) { __vec[1], __vec[0] }; +} + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai vector float +vec_reve(vector float __vec) { + return (vector float) { __vec[3], __vec[2], __vec[1], __vec[0] }; +} +#endif + +static inline __ATTRS_o_ai vector double +vec_reve(vector double __vec) { + return (vector double) { __vec[1], __vec[0] }; +} + /*-- vec_sel ----------------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed char @@ -6849,6 +6970,56 @@ vec_sldw(vector double __a, vector double __b, int __c) __builtin_s390_vsldb((vector unsigned char)(X), \ (vector unsigned char)(Y), (Z) * 4)) +/*-- vec_sldb ---------------------------------------------------------------*/ + +#if __ARCH__ >= 13 + +extern __ATTRS_o vector signed char +vec_sldb(vector signed char __a, vector signed char __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned char +vec_sldb(vector unsigned char __a, vector unsigned char __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector signed short +vec_sldb(vector signed short __a, vector signed short __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned short +vec_sldb(vector unsigned short __a, vector unsigned short __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector signed int +vec_sldb(vector signed int __a, vector signed int __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned int +vec_sldb(vector unsigned int __a, vector unsigned int __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector signed long long +vec_sldb(vector signed long long __a, vector signed long long __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned long long +vec_sldb(vector unsigned long long __a, vector unsigned long long __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector float +vec_sldb(vector float __a, vector float __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector double +vec_sldb(vector double __a, vector double __b, int __c) + __constant_range(__c, 0, 7); + +#define vec_sldb(X, Y, Z) ((__typeof__((vec_sldb)((X), (Y), (Z)))) \ + __builtin_s390_vsld((vector unsigned char)(X), \ + (vector unsigned char)(Y), (Z))) + +#endif + /*-- vec_sral ---------------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed char @@ -7579,6 +7750,56 @@ vec_srb(vector double __a, vector unsigned long long __b) { (vector unsigned char)__a, (vector unsigned char)__b); } +/*-- vec_srdb ---------------------------------------------------------------*/ + +#if __ARCH__ >= 13 + +extern __ATTRS_o vector signed char +vec_srdb(vector signed char __a, vector signed char __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned char +vec_srdb(vector unsigned char __a, vector unsigned char __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o 
vector signed short +vec_srdb(vector signed short __a, vector signed short __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned short +vec_srdb(vector unsigned short __a, vector unsigned short __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector signed int +vec_srdb(vector signed int __a, vector signed int __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned int +vec_srdb(vector unsigned int __a, vector unsigned int __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector signed long long +vec_srdb(vector signed long long __a, vector signed long long __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector unsigned long long +vec_srdb(vector unsigned long long __a, vector unsigned long long __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector float +vec_srdb(vector float __a, vector float __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o vector double +vec_srdb(vector double __a, vector double __b, int __c) + __constant_range(__c, 0, 7); + +#define vec_srdb(X, Y, Z) ((__typeof__((vec_srdb)((X), (Y), (Z)))) \ + __builtin_s390_vsrd((vector unsigned char)(X), \ + (vector unsigned char)(Y), (Z))) + +#endif + /*-- vec_abs ----------------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed char @@ -8725,6 +8946,22 @@ vec_double(vector unsigned long long __a) { return __builtin_convertvector(__a, vector double); } +/*-- vec_float --------------------------------------------------------------*/ + +#if __ARCH__ >= 13 + +static inline __ATTRS_o_ai vector float +vec_float(vector signed int __a) { + return __builtin_convertvector(__a, vector float); +} + +static inline __ATTRS_o_ai vector float +vec_float(vector unsigned int __a) { + return __builtin_convertvector(__a, vector float); +} + +#endif + /*-- vec_signed -------------------------------------------------------------*/ static inline __ATTRS_o_ai vector signed long long @@ -8732,6 +8969,13 @@ vec_signed(vector double __a) { return __builtin_convertvector(__a, vector signed long long); } +#if __ARCH__ >= 13 +static inline __ATTRS_o_ai vector signed int +vec_signed(vector float __a) { + return __builtin_convertvector(__a, vector signed int); +} +#endif + /*-- vec_unsigned -----------------------------------------------------------*/ static inline __ATTRS_o_ai vector unsigned long long @@ -8739,6 +8983,13 @@ vec_unsigned(vector double __a) { return __builtin_convertvector(__a, vector unsigned long long); } +#if __ARCH__ >= 13 +static inline __ATTRS_o_ai vector unsigned int +vec_unsigned(vector float __a) { + return __builtin_convertvector(__a, vector unsigned int); +} +#endif + /*-- vec_roundp -------------------------------------------------------------*/ #if __ARCH__ >= 12 @@ -10456,6 +10707,147 @@ vec_find_any_ne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b, return __builtin_s390_vfaezfs(__a, __b, 8, __cc); } +/*-- vec_search_string_cc ---------------------------------------------------*/ + +#if __ARCH__ >= 13 + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector signed char __a, vector signed char __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsb((vector unsigned char)__a, + (vector unsigned char)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector bool char __a, vector bool char __b, + vector unsigned char __c, int *__cc) { + return 
__builtin_s390_vstrsb((vector unsigned char)__a, + (vector unsigned char)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector unsigned char __a, vector unsigned char __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsb(__a, __b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector signed short __a, vector signed short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsh((vector unsigned short)__a, + (vector unsigned short)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector bool short __a, vector bool short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsh((vector unsigned short)__a, + (vector unsigned short)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector unsigned short __a, vector unsigned short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsh(__a, __b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector signed int __a, vector signed int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsf((vector unsigned int)__a, + (vector unsigned int)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector bool int __a, vector bool int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsf((vector unsigned int)__a, + (vector unsigned int)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_cc(vector unsigned int __a, vector unsigned int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrsf(__a, __b, __c, __cc); +} + +#endif + +/*-- vec_search_string_until_zero_cc ----------------------------------------*/ + +#if __ARCH__ >= 13 + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector signed char __a, + vector signed char __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszb((vector unsigned char)__a, + (vector unsigned char)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector bool char __a, + vector bool char __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszb((vector unsigned char)__a, + (vector unsigned char)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector unsigned char __a, + vector unsigned char __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszb(__a, __b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector signed short __a, + vector signed short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszh((vector unsigned short)__a, + (vector unsigned short)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector bool short __a, + vector bool short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszh((vector unsigned short)__a, + (vector unsigned short)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector unsigned short __a, + vector unsigned short __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszh(__a, __b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned 
char +vec_search_string_until_zero_cc(vector signed int __a, + vector signed int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszf((vector unsigned int)__a, + (vector unsigned int)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector bool int __a, + vector bool int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszf((vector unsigned int)__a, + (vector unsigned int)__b, __c, __cc); +} + +static inline __ATTRS_o_ai vector unsigned char +vec_search_string_until_zero_cc(vector unsigned int __a, + vector unsigned int __b, + vector unsigned char __c, int *__cc) { + return __builtin_s390_vstrszf(__a, __b, __c, __cc); +} + +#endif + #undef __constant_pow2_range #undef __constant_range #undef __constant diff --git a/lib/include/vpclmulqdqintrin.h b/lib/include/vpclmulqdqintrin.h index 86174a457..470d83254 100644 --- a/lib/include/vpclmulqdqintrin.h +++ b/lib/include/vpclmulqdqintrin.h @@ -1,23 +1,9 @@ /*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------=== * * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/waitpkgintrin.h b/lib/include/waitpkgintrin.h index e29d6cfa5..7ecada4cf 100644 --- a/lib/include/waitpkgintrin.h +++ b/lib/include/waitpkgintrin.h @@ -1,22 +1,8 @@ /*===----------------------- waitpkgintrin.h - WAITPKG --------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/wbnoinvdintrin.h b/lib/include/wbnoinvdintrin.h index cad83368d..cac0347ef 100644 --- a/lib/include/wbnoinvdintrin.h +++ b/lib/include/wbnoinvdintrin.h @@ -1,22 +1,8 @@ /*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic-------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/wmmintrin.h b/lib/include/wmmintrin.h index 569a8d838..f932ca810 100644 --- a/lib/include/wmmintrin.h +++ b/lib/include/wmmintrin.h @@ -1,22 +1,8 @@ /*===---- wmmintrin.h - AES intrinsics ------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/x86intrin.h b/lib/include/x86intrin.h index 728c58c3e..a8b36622d 100644 --- a/lib/include/x86intrin.h +++ b/lib/include/x86intrin.h @@ -1,22 +1,8 @@ /*===---- x86intrin.h - X86 intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/xmmintrin.h b/lib/include/xmmintrin.h index 17af17267..75ff37655 100644 --- a/lib/include/xmmintrin.h +++ b/lib/include/xmmintrin.h @@ -1,22 +1,8 @@ /*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -28,7 +14,9 @@ typedef int __v4si __attribute__((__vector_size__(16))); typedef float __v4sf __attribute__((__vector_size__(16))); -typedef float __m128 __attribute__((__vector_size__(16))); +typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); + +typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1))); /* Unsigned types */ typedef unsigned int __v4su __attribute__((__vector_size__(16))); @@ -1752,7 +1740,7 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p) { struct __loadu_ps { - __m128 __v; + __m128_u __v; } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_ps*)__p)->__v; } @@ -1931,7 +1919,11 @@ _mm_setzero_ps(void) static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a) { - __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a); + typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_storeh_pi_struct { + __mm_storeh_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3); } /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a @@ -1948,7 +1940,11 @@ _mm_storeh_pi(__m64 *__p, __m128 __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a) { - __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a); + typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_storeh_pi_struct { + __mm_storeh_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1); } /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a @@ -1987,7 +1983,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a) { struct __storeu_ps { - __m128 __v; + __m128_u __v; } __attribute__((__packed__, __may_alias__)); ((struct __storeu_ps*)__p)->__v = __a; } diff --git a/lib/include/xopintrin.h b/lib/include/xopintrin.h index 9d540a2ab..5cedde41b 100644 --- a/lib/include/xopintrin.h +++ b/lib/include/xopintrin.h @@ -1,22 +1,8 @@ /*===---- xopintrin.h - XOP intrinsics -------------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/xsavecintrin.h b/lib/include/xsavecintrin.h index 25577a95f..5524947fa 100644 --- a/lib/include/xsavecintrin.h +++ b/lib/include/xsavecintrin.h @@ -1,22 +1,8 @@ /*===---- xsavecintrin.h - XSAVEC intrinsic --------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/xsaveintrin.h b/lib/include/xsaveintrin.h index 16f3a78d3..9429db6dd 100644 --- a/lib/include/xsaveintrin.h +++ b/lib/include/xsaveintrin.h @@ -1,22 +1,8 @@ /*===---- xsaveintrin.h - XSAVE intrinsic ----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ @@ -28,6 +14,10 @@ #ifndef __XSAVEINTRIN_H #define __XSAVEINTRIN_H +#ifdef _MSC_VER +#define _XCR_XFEATURE_ENABLED_MASK 0 +#endif + /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsave"))) @@ -41,6 +31,20 @@ _xrstor(void *__p, unsigned long long __m) { __builtin_ia32_xrstor(__p, __m); } +#ifndef _MSC_VER +#define _xgetbv(A) __builtin_ia32_xgetbv((long long)(A)) +#define _xsetbv(A, B) __builtin_ia32_xsetbv((unsigned int)(A), (unsigned long long)(B)) +#else +#ifdef __cplusplus +extern "C" { +#endif +unsigned __int64 __cdecl _xgetbv(unsigned int); +void __cdecl _xsetbv(unsigned int, unsigned __int64); +#ifdef __cplusplus +} +#endif +#endif /* _MSC_VER */ + #ifdef __x86_64__ static __inline__ void __DEFAULT_FN_ATTRS _xsave64(void *__p, unsigned long long __m) { @@ -51,6 +55,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _xrstor64(void *__p, unsigned long long __m) { __builtin_ia32_xrstor64(__p, __m); } + #endif #undef __DEFAULT_FN_ATTRS diff --git a/lib/include/xsaveoptintrin.h b/lib/include/xsaveoptintrin.h index 792cf92d4..89a4c44db 100644 --- a/lib/include/xsaveoptintrin.h +++ b/lib/include/xsaveoptintrin.h @@ -1,22 +1,8 @@ /*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ----------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/xsavesintrin.h b/lib/include/xsavesintrin.h index fe2bc4b93..3f99219a2 100644 --- a/lib/include/xsavesintrin.h +++ b/lib/include/xsavesintrin.h @@ -1,22 +1,8 @@ /*===---- xsavesintrin.h - XSAVES intrinsic --------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */ diff --git a/lib/include/xtestintrin.h b/lib/include/xtestintrin.h index 924424386..7d19e3733 100644 --- a/lib/include/xtestintrin.h +++ b/lib/include/xtestintrin.h @@ -1,22 +1,8 @@ /*===---- xtestintrin.h - XTEST intrinsic ----------------------------------=== * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===-----------------------------------------------------------------------=== */
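
Usage note (not part of the upstream patch): the sketch below is a minimal, hypothetical example exercising two of the interfaces touched by this header update -- the unaligned __m128 load/store path in xmmintrin.h, which now goes through the 1-byte-aligned __m128_u type, and the _xgetbv() wrapper added to xsaveintrin.h. The file name and compile flags are illustrative assumptions; something like `clang -msse -mxsave xcr0_demo.c` should build it on x86-64.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* _mm_loadu_ps/_mm_storeu_ps accept any alignment; with this patch
     * they are implemented by reading/writing through __m128_u, which
     * is declared with __attribute__((__aligned__(1))). */
    float src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float dst[5] = {0};
    __m128 v = _mm_loadu_ps(src);
    _mm_storeu_ps(dst + 1, v);   /* dst+1 is only 4-byte aligned */
    printf("%g %g %g %g\n", dst[1], dst[2], dst[3], dst[4]);

    /* _xgetbv(0) reads XCR0 (MSVC names the index
     * _XCR_XFEATURE_ENABLED_MASK), reporting which XSAVE state
     * components the OS has enabled: bit 0 = x87, bit 1 = SSE,
     * bit 2 = AVX. */
    unsigned long long xcr0 = _xgetbv(0);
    printf("XCR0 = %#llx\n", xcr0);
    return 0;
}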