update C headers to clang 6.0.0rc3

master
Andrew Kelley 2018-02-23 13:15:16 -05:00
parent 1ba6e1641a
commit 4955c4b8f9
34 changed files with 9573 additions and 4424 deletions

@@ -541,10 +541,12 @@ set(ZIG_C_HEADER_FILES
"adxintrin.h"
"altivec.h"
"ammintrin.h"
"arm64intr.h"
"arm_acle.h"
"arm_neon.h"
"armintr.h"
"avx2intrin.h"
"avx512bitalgintrin.h"
"avx512bwintrin.h"
"avx512cdintrin.h"
"avx512dqintrin.h"
@@ -553,17 +555,25 @@ set(ZIG_C_HEADER_FILES
"avx512ifmaintrin.h"
"avx512ifmavlintrin.h"
"avx512pfintrin.h"
"avx512vbmi2intrin.h"
"avx512vbmiintrin.h"
"avx512vbmivlintrin.h"
"avx512vlbitalgintrin.h"
"avx512vlbwintrin.h"
"avx512vlcdintrin.h"
"avx512vldqintrin.h"
"avx512vlintrin.h"
"avx512vlvbmi2intrin.h"
"avx512vlvnniintrin.h"
"avx512vnniintrin.h"
"avx512vpopcntdqintrin.h"
"avx512vpopcntdqvlintrin.h"
"avxintrin.h"
"bmi2intrin.h"
"bmiintrin.h"
"cetintrin.h"
"clflushoptintrin.h"
"clwbintrin.h"
"clzerointrin.h"
"cpuid.h"
"cuda_wrappers/algorithm"
@@ -575,6 +585,7 @@ set(ZIG_C_HEADER_FILES
"fma4intrin.h"
"fmaintrin.h"
"fxsrintrin.h"
"gfniintrin.h"
"htmintrin.h"
"htmxlintrin.h"
"ia32intrin.h"
@@ -614,8 +625,10 @@ set(ZIG_C_HEADER_FILES
"tmmintrin.h"
"unwind.h"
"vadefs.h"
"vaesintrin.h"
"varargs.h"
"vecintrin.h"
"vpclmulqdqintrin.h"
"wmmintrin.h"
"x86intrin.h"
"xmmintrin.h"

@@ -131,15 +131,6 @@ __DEVICE__ float ldexp(float __arg, int __exp) {
__DEVICE__ float log(float __x) { return ::logf(__x); }
__DEVICE__ float log10(float __x) { return ::log10f(__x); }
__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
__DEVICE__ float nexttoward(float __from, double __to) {
return __builtin_nexttowardf(__from, __to);
}
__DEVICE__ double nexttoward(double __from, double __to) {
return __builtin_nexttoward(__from, __to);
}
__DEVICE__ float nexttowardf(float __from, double __to) {
return __builtin_nexttowardf(__from, __to);
}
__DEVICE__ float pow(float __base, float __exp) {
return ::powf(__base, __exp);
}
@@ -157,6 +148,10 @@ __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
__DEVICE__ float tan(float __x) { return ::tanf(__x); }
__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
// Notably missing above is nexttoward. We omit it because
// libdevice doesn't provide an implementation, and we don't want to be in the
// business of implementing tricky libm functions in this header.
// Now we've defined everything we promised we'd define in
// __clang_cuda_math_forward_declares.h. We need to do two additional things to
// fix up our math functions.
@@ -295,13 +290,6 @@ ldexp(__T __x, int __exp) {
return std::ldexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
nexttoward(__T __from, double __to) {
return std::nexttoward((double)__from, __to);
}
template <typename __T1, typename __T2>
__DEVICE__ typename __clang_cuda_enable_if<
std::numeric_limits<__T1>::is_specialized &&
@@ -388,7 +376,6 @@ using ::lrint;
using ::lround;
using ::nearbyint;
using ::nextafter;
using ::nexttoward;
using ::pow;
using ::remainder;
using ::remquo;
@@ -456,8 +443,6 @@ using ::lroundf;
using ::modff;
using ::nearbyintf;
using ::nextafterf;
using ::nexttowardf;
using ::nexttowardf;
using ::powf;
using ::remainderf;
using ::remquof;

@@ -34,23 +34,24 @@
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \
inline __device__ int __FnName(int __val, int __offset, \
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
__Type) \
inline __device__ int __FnName(int __val, __Type __offset, \
int __width = warpSize) { \
return __IntIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(float __val, int __offset, \
inline __device__ float __FnName(float __val, __Type __offset, \
int __width = warpSize) { \
return __FloatIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __val, int __offset, \
inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(long long __val, int __offset, \
inline __device__ long long __FnName(long long __val, __Type __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
@@ -65,12 +66,29 @@
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \
} \
inline __device__ long __FnName(long __val, __Type __offset, \
int __width = warpSize) { \
_Static_assert(sizeof(long) == sizeof(long long) || \
sizeof(long) == sizeof(int)); \
if (sizeof(long) == sizeof(long long)) { \
return static_cast<long>( \
::__FnName(static_cast<long long>(__val), __offset, __width)); \
} else if (sizeof(long) == sizeof(int)) { \
return static_cast<long>( \
::__FnName(static_cast<int>(__val), __offset, __width)); \
} \
} \
inline __device__ unsigned long __FnName( \
unsigned long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long>( \
::__FnName(static_cast<long>(__val), __offset, __width)); \
} \
inline __device__ unsigned long long __FnName( \
unsigned long long __val, int __offset, int __width = warpSize) { \
unsigned long long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \
static_cast<unsigned long long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(double __val, int __offset, \
inline __device__ double __FnName(double __val, __Type __offset, \
int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
@@ -81,13 +99,15 @@
return __ret; \
}
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
int);
#pragma pop_macro("__MAKE_SHUFFLES")
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
@@ -97,25 +117,26 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
// __shfl_sync_* variants available in CUDA-9
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
__Mask) \
inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \
int __width = warpSize) { \
__Mask, __Type) \
inline __device__ int __FnName(unsigned int __mask, int __val, \
__Type __offset, int __width = warpSize) { \
return __IntIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(unsigned int __mask, float __val, \
int __offset, int __width = warpSize) { \
__Type __offset, int __width = warpSize) { \
return __FloatIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __mask, \
unsigned int __val, int __offset, \
unsigned int __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(unsigned int __mask, long long __val, \
int __offset, int __width = warpSize) { \
__Type __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
@@ -130,13 +151,31 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
return __ret; \
} \
inline __device__ unsigned long long __FnName( \
unsigned int __mask, unsigned long long __val, int __offset, \
unsigned int __mask, unsigned long long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \
__mask, static_cast<unsigned long long>(__val), __offset, __width)); \
} \
inline __device__ long __FnName(unsigned int __mask, long __val, \
__Type __offset, int __width = warpSize) { \
_Static_assert(sizeof(long) == sizeof(long long) || \
sizeof(long) == sizeof(int)); \
if (sizeof(long) == sizeof(long long)) { \
return static_cast<long>(::__FnName( \
__mask, static_cast<long long>(__val), __offset, __width)); \
} else if (sizeof(long) == sizeof(int)) { \
return static_cast<long>( \
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
} \
} \
inline __device__ unsigned long __FnName( \
unsigned int __mask, unsigned long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long>( \
::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(unsigned int __mask, double __val, \
int __offset, int __width = warpSize) { \
__Type __offset, int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
memcpy(&__tmp, &__val, sizeof(__val)); \
@@ -146,15 +185,15 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
return __ret; \
}
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
__nvvm_shfl_sync_idx_f32, 0x1f);
__nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
__nvvm_shfl_sync_up_f32, 0);
__nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
__nvvm_shfl_sync_down_f32, 0x1f);
__nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
__nvvm_shfl_sync_bfly_f32, 0x1f);
__nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
@@ -188,6 +227,10 @@ inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
return __nvvm_fns(mask, base, offset);
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
// Define __match* builtins CUDA-9 headers expect to see.

@@ -149,9 +149,6 @@ __DEVICE__ double nearbyint(double);
__DEVICE__ float nearbyint(float);
__DEVICE__ double nextafter(double, double);
__DEVICE__ float nextafter(float, float);
__DEVICE__ double nexttoward(double, double);
__DEVICE__ float nexttoward(float, double);
__DEVICE__ float nexttowardf(float, double);
__DEVICE__ double pow(double, double);
__DEVICE__ double pow(double, int);
__DEVICE__ float pow(float, float);
@@ -185,6 +182,10 @@ __DEVICE__ float tgamma(float);
__DEVICE__ double trunc(double);
__DEVICE__ float trunc(float);
// Notably missing above is nexttoward, which we don't define on
// the device side because libdevice doesn't give us an implementation, and we
// don't want to be in the business of writing one ourselves.
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
@@ -255,7 +256,6 @@ using ::nan;
using ::nanf;
using ::nearbyint;
using ::nextafter;
using ::nexttoward;
using ::pow;
using ::remainder;
using ::remquo;

@@ -270,12 +270,18 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
// include guard from math.h wrapper from libstdc++. We have to undo the header
// guard temporarily to get the definitions we need.
#pragma push_macro("_GLIBCXX_MATH_H")
#pragma push_macro("_LIBCPP_VERSION")
#if CUDA_VERSION >= 9000
#undef _GLIBCXX_MATH_H
// We also need to undo another guard that checks for libc++ 3.8+
#ifdef _LIBCPP_VERSION
#define _LIBCPP_VERSION 3700
#endif
#endif
#include "math_functions.hpp"
#pragma pop_macro("_GLIBCXX_MATH_H")
#pragma pop_macro("_LIBCPP_VERSION")
#pragma pop_macro("__GNUC__")
#pragma pop_macro("signbit")

File diff suppressed because it is too large

@@ -0,0 +1,97 @@
/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512BITALGINTRIN_H
#define __AVX512BITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi16(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
(__v32hi) _mm512_popcnt_epi16(__B),
(__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_hi(),
__U,
__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi8(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
(__v64qi) _mm512_popcnt_epi8(__B),
(__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_qi(),
__U,
__B);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
(__v64qi) __B,
__U);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
{
return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
__A,
__B);
}
#undef __DEFAULT_FN_ATTRS
#endif
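For orientation: each intrinsic in this new header computes an independent population count per 8- or 16-bit lane, and the mask/maskz wrappers select per lane between that result and a passthrough or zero value. A scalar model of one 16-bit lane (illustration only; popcnt16 is a hypothetical helper, not part of the header):

#include <assert.h>
#include <stdint.h>

static uint16_t popcnt16(uint16_t x) {
  uint16_t n = 0;
  while (x) { x &= (uint16_t)(x - 1); n++; }  // clear the lowest set bit
  return n;
}

int main(void) {
  assert(popcnt16(0x00FF) == 8);  // what one lane of vpopcntw computes
  assert(popcnt16(0xA5A5) == 8);
  // _mm512_mask_popcnt_epi16 keeps __A's lane where the mask bit is 0
  // and takes the popcount where it is 1; the maskz form zeroes instead.
  return 0;
}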

@@ -56,293 +56,145 @@ _mm512_setzero_hi(void) {
/* Integer compare */
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
(__mmask64)-1);
}
#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1); })
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
__u);
}
#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m)); })
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
(__mmask64)-1);
}
#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1); })
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
__u);
}
#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m)); })
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
(__mmask32)-1);
}
#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1); })
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
__u);
}
#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m)); })
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
(__mmask32)-1);
}
#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1); })
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
__u);
}
#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m)); })
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
(__mmask64)-1);
}
#define _mm512_cmpeq_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \
_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
__u);
}
#define _mm512_cmpeq_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu8_mask(A, B) \
_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \
_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
(__mmask64)-1);
}
#define _mm512_cmpeq_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi16_mask(A, B) \
_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \
_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
__u);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
(__mmask64)-1);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
__u);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
(__mmask32)-1);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
__u);
}
#define _mm512_cmpeq_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu16_mask(A, B) \
_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \
_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
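The rewrite above folds the removed inline functions into the _mm512_cmp_*_mask macros: the hard-coded immediates they passed (0, 1, 2, 4, 5, 6) are exactly the named _MM_CMPINT_* predicates, so e.g. _mm512_cmpge_epi8_mask still lowers to the same __builtin_ia32_cmpb512_mask(..., 5, ...) call as before. A quick check of that correspondence:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* EQ=0, LT=1, LE=2, NE=4, GE(NLT)=5, GT(NLE)=6 per avx512fintrin.h */
  printf("EQ=%d LT=%d LE=%d NE=%d GE=%d GT=%d\n",
         _MM_CMPINT_EQ, _MM_CMPINT_LT, _MM_CMPINT_LE,
         _MM_CMPINT_NE, _MM_CMPINT_GE, _MM_CMPINT_GT);
  return 0;
}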
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_add_epi8 (__m512i __A, __m512i __B) {
@@ -1541,46 +1393,6 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
}
#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1); })
#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m)); })
#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1); })
#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m)); })
#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1); })
#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m)); })
#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1); })
#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m)); })
#define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \
(__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
(__v32hi)_mm512_undefined_epi32(), \
@@ -2042,15 +1854,13 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
{
return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
(__mmask64) __B);
return (__mmask64) (( __A & 0xFFFFFFFF) | ( __B << 32));
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
{
return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
(__mmask32) __B);
return (__mmask32) (( __A & 0xFFFF) | ( __B << 16));
}
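The kunpack rewrite above (and the analogous _mm512_kunpackb change further down) concatenates two mask registers: the low half of the result comes from __A, the high half from __B. A scalar check of the 64-bit case, as a sketch:

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint64_t a = 0x00000000AAAAAAAAull;  /* only the low 32 bits are used */
  uint64_t b = 0x00000000BBBBBBBBull;
  uint64_t r = (a & 0xFFFFFFFFull) | (b << 32);  /* kunpackd */
  assert(r == 0xBBBBBBBBAAAAAAAAull);
  return 0;
}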
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -2105,61 +1915,56 @@ _mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_test_epi8_mask (__m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
(__v64qi) __B,
(__mmask64) -1);
return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
(__v64qi) __B, __U);
return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_test_epi16_mask (__m512i __A, __m512i __B)
{
return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
(__v32hi) __B,
(__mmask32) -1);
return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
{
return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
(__v32hi) __B, __U);
return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
(__v64qi) __B,
(__mmask64) -1);
return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_qi());
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
(__v64qi) __B, __U);
return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
{
return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
(__v32hi) __B,
(__mmask32) -1);
return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
{
return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
(__v32hi) __B, __U);
return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
_mm512_setzero_qi());
}
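The ptestm/ptestnm builtins above are replaced by an AND plus a compare against zero: test sets a mask bit where (a & b) != 0 in that lane, testn where it is zero. A four-lane scalar model (illustration only):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t a[4] = {0x0F, 0x00, 0xF0, 0xFF};
  uint8_t b[4] = {0xF0, 0xFF, 0xF0, 0x01};
  unsigned test = 0, testn = 0;
  for (int i = 0; i < 4; i++) {
    if (a[i] & b[i]) test  |= 1u << i;  /* cmpneq(a & b, 0) */
    else             testn |= 1u << i;  /* cmpeq(a & b, 0)  */
  }
  printf("test=0x%X testn=0x%X\n", test, testn);  /* test=0xC testn=0x3 */
  return 0;
}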
static __inline__ __mmask64 __DEFAULT_FN_ATTRS

@@ -130,13 +130,14 @@ _mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmb_epi64 (__mmask8 __A)
{
return (__m512i) __builtin_ia32_broadcastmb512 (__A);
return (__m512i) _mm512_set1_epi64((long long) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmw_epi32 (__mmask16 __A)
{
return (__m512i) __builtin_ia32_broadcastmw512 (__A);
return (__m512i) _mm512_set1_epi32((int) __A);
}
#undef __DEFAULT_FN_ATTRS
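broadcastmb/broadcastmw zero-extend the mask register and splat it into every 64- or 32-bit lane, which is why a plain set1 is a drop-in replacement for the removed builtins. A scalar sketch of the 64-bit case:

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint8_t mask = 0xA5;
  uint64_t lanes[8];
  for (int i = 0; i < 8; i++)
    lanes[i] = (uint64_t)mask;  /* what _mm512_set1_epi64 does per lane */
  assert(lanes[0] == 0xA5 && lanes[7] == 0xA5);
  return 0;
}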

@@ -8787,7 +8787,7 @@ _mm512_kortestz (__mmask16 __A, __mmask16 __B)
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
{
return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
return (__mmask16) (( __A & 0xFF) | ( __B << 8));
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS

@@ -0,0 +1,391 @@
/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VBMI2INTRIN_H
#define __AVX512VBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_hi(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_qi(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
{
__builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
{
__builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_hi(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_qi(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
(__v32hi) _mm512_setzero_hi(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
(__v64qi) _mm512_setzero_qi(),
__U);
}
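For orientation, compress gathers the mask-selected lanes contiguously at the low end of the destination and expand is its inverse, scattering contiguous source values back into the selected lanes. A four-lane scalar model of the maskz compress form (illustration only):

#include <stdio.h>

int main(void) {
  int src[4] = {10, 20, 30, 40};
  unsigned mask = 0x5;        /* lanes 0 and 2 selected */
  int dst[4] = {0, 0, 0, 0};  /* maskz: unselected output lanes stay 0 */
  int j = 0;
  for (int i = 0; i < 4; i++)
    if (mask & (1u << i))
      dst[j++] = src[i];      /* compress */
  printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  /* 10 30 0 0 */
  return 0;
}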
#define _mm512_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \
(__v8di)(B), \
(int)(I), \
(__v8di)(S), \
(__mmask8)(U)); })
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
_mm512_mask_shldi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shldi_epi64(A, B, I) \
_mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
#define _mm512_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \
(__v16si)(B), \
(int)(I), \
(__v16si)(S), \
(__mmask16)(U)); })
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
_mm512_mask_shldi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shldi_epi32(A, B, I) \
_mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
#define _mm512_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \
(__v32hi)(B), \
(int)(I), \
(__v32hi)(S), \
(__mmask32)(U)); })
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
_mm512_mask_shldi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shldi_epi16(A, B, I) \
_mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \
(__v8di)(B), \
(int)(I), \
(__v8di)(S), \
(__mmask8)(U)); })
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
_mm512_mask_shrdi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shrdi_epi64(A, B, I) \
_mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \
(__v16si)(B), \
(int)(I), \
(__v16si)(S), \
(__mmask16)(U)); })
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
_mm512_mask_shrdi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shrdi_epi32(A, B, I) \
_mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \
(__v32hi)(B), \
(int)(I), \
(__v32hi)(S), \
(__mmask32)(U)); })
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
_mm512_mask_shrdi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shrdi_epi16(A, B, I) \
_mm512_mask_shrdi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
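The shldi/shrdi macros above are funnel shifts: per lane, the two operands are conceptually concatenated, shifted by the immediate, and one half is kept; for the 16-bit left form the result is the high half of (A:B) << I. A one-lane scalar check, as a sketch of that reading of the semantics:

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint16_t a = 0x1234, b = 0xABCD;
  int i = 4;
  uint32_t concat = ((uint32_t)a << 16) | b;       /* A:B */
  uint16_t shld = (uint16_t)(concat >> (16 - i));  /* high half of (A:B) << I */
  assert(shld == 0x234A);
  return 0;
}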
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvq512_maskz ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi64(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvd512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvw512_maskz ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi16(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
(__mmask32) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvq512_maskz ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi64(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
(__v8di) __A,
(__v8di) __B,
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvd512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvw512_maskz ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi16(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
(__v32hi) __A,
(__v32hi) __B,
(__mmask32) -1);
}
#undef __DEFAULT_FN_ATTRS
#endif

@@ -0,0 +1,157 @@
/*===------------- avx512vlbitalgintrin.h - BITALG intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLBITALGINTRIN_H
#define __AVX512VLBITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg")))
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_popcnt_epi16(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
(__v16hi) _mm256_popcnt_epi16(__B),
(__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
__U,
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_popcnt_epi16(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
(__v8hi) _mm128_popcnt_epi16(__B),
(__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
{
return _mm128_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
__U,
__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_popcnt_epi8(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
(__v32qi) _mm256_popcnt_epi8(__B),
(__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
__U,
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_popcnt_epi8(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
(__v16qi) _mm128_popcnt_epi8(__B),
(__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
{
return _mm128_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
__U,
__B);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm256_mask_bitshuffle_epi32_mask(__mmask32 __U, __m256i __A, __m256i __B)
{
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
(__v32qi) __B,
__U);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm256_bitshuffle_epi32_mask(__m256i __A, __m256i __B)
{
return _mm256_mask_bitshuffle_epi32_mask((__mmask32) -1,
__A,
__B);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm128_mask_bitshuffle_epi16_mask(__mmask16 __U, __m128i __A, __m128i __B)
{
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
(__v16qi) __B,
__U);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm128_bitshuffle_epi16_mask(__m128i __A, __m128i __B)
{
return _mm128_mask_bitshuffle_epi16_mask((__mmask16) -1,
__A,
__B);
}
#undef __DEFAULT_FN_ATTRS
#endif

File diff suppressed because it is too large

@@ -33,26 +33,26 @@
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastmb_epi64 (__mmask8 __A)
{
return (__m128i) __builtin_ia32_broadcastmb128 (__A);
return (__m128i) _mm_set1_epi64x((long long) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastmb_epi64 (__mmask8 __A)
{
return (__m256i) __builtin_ia32_broadcastmb256 (__A);
return (__m256i) _mm256_set1_epi64x((long long)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastmw_epi32 (__mmask16 __A)
{
return (__m128i) __builtin_ia32_broadcastmw128 (__A);
return (__m128i) _mm_set1_epi32((int)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastmw_epi32 (__mmask16 __A)
{
return (__m256i) __builtin_ia32_broadcastmw256 (__A);
return (__m256i) _mm256_set1_epi32((int)__A);
}

File diff suppressed because it is too large

@@ -0,0 +1,748 @@
/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLVBMI2INTRIN_H
#define __AVX512VLVBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2")))
static __inline __m128i __DEFAULT_FN_ATTRS
_mm128_setzero_hi(void) {
return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 };
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_compress_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) _mm128_setzero_hi(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_compress_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) _mm128_setzero_hi(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm128_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
{
__builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm128_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
{
__builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expand_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) _mm128_setzero_hi(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expand_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) _mm128_setzero_hi(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) _mm128_setzero_hi(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) _mm128_setzero_hi(),
__U);
}
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setzero_hi(void) {
return (__m256i)(__v16hi){ 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 };
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_hi(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_hi(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
{
__builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
{
__builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_hi(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_hi(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
(__v16hi) _mm256_setzero_hi(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
(__v32qi) _mm256_setzero_hi(),
__U);
}
#define _mm256_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \
(__v4di)(B), \
(int)(I), \
(__v4di)(S), \
(__mmask8)(U)); })
#define _mm256_maskz_shldi_epi64(U, A, B, I) \
_mm256_mask_shldi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shldi_epi64(A, B, I) \
_mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \
(__v2di)(B), \
(int)(I), \
(__v2di)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shldi_epi64(U, A, B, I) \
_mm128_mask_shldi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shldi_epi64(A, B, I) \
_mm128_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \
(__v8si)(B), \
(int)(I), \
(__v8si)(S), \
(__mmask8)(U)); })
#define _mm256_maskz_shldi_epi32(U, A, B, I) \
_mm256_mask_shldi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shldi_epi32(A, B, I) \
_mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \
(__v4si)(B), \
(int)(I), \
(__v4si)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shldi_epi32(U, A, B, I) \
_mm128_mask_shldi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shldi_epi32(A, B, I) \
_mm128_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \
(__v16hi)(B), \
(int)(I), \
(__v16hi)(S), \
(__mmask16)(U)); })
#define _mm256_maskz_shldi_epi16(U, A, B, I) \
_mm256_mask_shldi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shldi_epi16(A, B, I) \
_mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \
(__v8hi)(B), \
(int)(I), \
(__v8hi)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shldi_epi16(U, A, B, I) \
_mm128_mask_shldi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shldi_epi16(A, B, I) \
_mm128_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \
(__v4di)(B), \
(int)(I), \
(__v4di)(S), \
(__mmask8)(U)); })
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
_mm256_mask_shrdi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shrdi_epi64(A, B, I) \
_mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \
(__v2di)(B), \
(int)(I), \
(__v2di)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shrdi_epi64(U, A, B, I) \
_mm128_mask_shrdi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shrdi_epi64(A, B, I) \
_mm128_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \
(__v8si)(B), \
(int)(I), \
(__v8si)(S), \
(__mmask8)(U)); })
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
_mm256_mask_shrdi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shrdi_epi32(A, B, I) \
_mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \
(__v4si)(B), \
(int)(I), \
(__v4si)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shrdi_epi32(U, A, B, I) \
_mm128_mask_shrdi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shrdi_epi32(A, B, I) \
_mm128_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \
(__v16hi)(B), \
(int)(I), \
(__v16hi)(S), \
(__mmask16)(U)); })
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
_mm256_mask_shrdi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm256_shrdi_epi16(A, B, I) \
_mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm128_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \
(__v8hi)(B), \
(int)(I), \
(__v8hi)(S), \
(__mmask8)(U)); })
#define _mm128_maskz_shrdi_epi16(U, A, B, I) \
_mm128_mask_shrdi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shrdi_epi16(A, B, I) \
_mm128_mask_shrdi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
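These macros wrap the VBMI2 funnel shifts: per lane, the element from A is concatenated above the element from B, the pair is shifted left by I, and the high half is kept (shrdi mirrors this with a right shift keeping the low half). A scalar sketch of one 64-bit shldi lane; the helper name is ours:

#include <stdint.h>

/* Scalar sketch of one lane of _mm256_shldi_epi64: concatenate
 * a (high) : b (low) into a 128-bit value, shift left by i, keep the
 * high 64 bits. */
static uint64_t shldi64_lane(uint64_t a, uint64_t b, unsigned i) {
    i &= 63;    /* the shift count is taken modulo the lane width */
    return i ? (a << i) | (b >> (64 - i)) : a;
}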
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
(__mmask16) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
(__v4di) __A,
(__v4di) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
(__v2di) __A,
(__v2di) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
(__v16hi) __A,
(__v16hi) __B,
(__mmask16) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
(__v8hi) __A,
(__v8hi) __B,
(__mmask8) -1);
}
#undef __DEFAULT_FN_ATTRS
#endif
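For readers new to VBMI2: compress packs the lanes selected by the mask toward the low end, and the maskz form zero-fills the rest. A hedged usage sketch against this header as shipped (the _mm128_ prefix is this snapshot's spelling; main() and the expected output are ours):

#include <immintrin.h>
#include <stdio.h>

/* Sketch (compile with -mavx512vl -mavx512vbmi2): keep the word lanes
 * whose mask bit is 1 and pack them toward the low end; remaining
 * lanes are zeroed by the maskz form. */
int main(void) {
    __m128i src = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
    /* mask 0x55 keeps lanes 0, 2, 4, 6 */
    __m128i packed = _mm128_maskz_compress_epi16(0x55, src);
    short out[8];
    _mm_storeu_si128((__m128i *)out, packed);
    for (int i = 0; i < 8; i++)
        printf("%d ", out[i]);    /* 10 12 14 16 0 0 0 0 */
    printf("\n");
    return 0;
}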


@ -0,0 +1,254 @@
/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLVNNIINTRIN_H
#define __AVX512VLVNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni")))
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusd256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusds256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssd256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssds256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusd128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusds128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssd128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssds128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
}
#undef __DEFAULT_FN_ATTRS
#endif
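A hedged usage sketch of the 128-bit VNNI form; the _mm128_ spelling is what this header defines, and main() with its expected value is ours:

#include <immintrin.h>
#include <stdio.h>

/* Sketch (compile with -mavx512vl -mavx512vnni): accumulate u8 x s8
 * dot products into epi32 lanes. */
int main(void) {
    __m128i acc = _mm_setzero_si128();
    __m128i a = _mm_set1_epi8(2);            /* unsigned operand */
    __m128i b = _mm_set1_epi8(3);            /* signed operand */
    acc = _mm128_dpbusd_epi32(acc, a, b);    /* each lane: 4 * (2*3) = 24 */
    printf("%d\n", _mm_cvtsi128_si32(acc));  /* prints 24 */
    return 0;
}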


@ -0,0 +1,146 @@
/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VNNIINTRIN_H
#define __AVX512VNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusd512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusds512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssd512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssds512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
#undef __DEFAULT_FN_ATTRS
#endif
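All three widths share the same per-lane arithmetic: each 32-bit accumulator adds four products of an unsigned byte from A and a signed byte from B (the s-suffixed forms additionally saturate the final add). A scalar reference for one vpdpbusd lane; the helper name is ours:

#include <stdint.h>

/* Scalar sketch of one epi32 lane of vpdpbusd: a holds 4 unsigned
 * bytes, b holds 4 signed bytes; products are widened, then summed
 * into the accumulator. */
static int32_t dpbusd_lane(int32_t s, uint32_t a, uint32_t b) {
    for (int i = 0; i < 4; i++) {
        uint8_t ua = (uint8_t)(a >> (8 * i));
        int8_t  sb = (int8_t)(b >> (8 * i));
        s += (int32_t)ua * (int32_t)sb;
    }
    return s;
}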


@ -0,0 +1,99 @@
/*===---- avx512vpopcntdqvlintrin.h - AVX512VPOPCNTDQ intrinsics ---------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VPOPCNTDQVLINTRIN_H
#define __AVX512VPOPCNTDQVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl")))
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi64(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_selectq_128(
(__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi32(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_selectd_128(
(__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi64(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
return (__m256i)__builtin_ia32_selectq_256(
(__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi32(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
return (__m256i)__builtin_ia32_selectd_256(
(__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
}
#undef __DEFAULT_FN_ATTRS
#endif
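A hedged usage sketch of the masked VL popcount; main(), the mask choice, and the expected output are ours:

#include <immintrin.h>
#include <stdio.h>

/* Sketch (compile with -mavx512vpopcntdq -mavx512vl): per-lane bit
 * count, with the masked form merging from __W where the mask bit
 * is 0. */
int main(void) {
    __m256i v = _mm256_set1_epi64x(0xFFULL);    /* 8 bits set per lane */
    __m256i ones = _mm256_set1_epi64x(-1LL);
    /* count lanes 0 and 2 only; lanes 1 and 3 keep the value from ones */
    __m256i c = _mm256_mask_popcnt_epi64(ones, 0x5, v);
    long long out[4];
    _mm256_storeu_si256((__m256i *)out, c);
    printf("%lld %lld %lld %lld\n", out[0], out[1], out[2], out[3]); /* 8 -1 8 -1 */
    return 0;
}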

c_headers/cetintrin.h

@ -0,0 +1,93 @@
/*===---- cetintrin.h - CET intrinsic ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __CETINTRIN_H
#define __CETINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("shstk")))
static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
__builtin_ia32_incsspd(__a);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
__builtin_ia32_incsspq(__a);
}
#endif /* __x86_64__ */
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
return __builtin_ia32_rdsspd(__a);
}
#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
return __builtin_ia32_rdsspq(__a);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
__builtin_ia32_saveprevssp();
}
static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
__builtin_ia32_rstorssp(__p);
}
static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
__builtin_ia32_wrssd(__a, __p);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
__builtin_ia32_wrssq(__a, __p);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
__builtin_ia32_wrussd(__a, __p);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
__builtin_ia32_wrussq(__a, __p);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() {
__builtin_ia32_setssbsy();
}
static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
__builtin_ia32_clrssbsy(__p);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __CETINTRIN_H */
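These shadow-stack intrinsics only do real work on CET-enabled hardware and kernels; a cautious sketch (the NOP fallback note reflects the ISA's hint-space encoding, and main() is ours):

#include <immintrin.h>
#include <stdio.h>

/* Sketch only (compile with -mshstk): read the current shadow-stack
 * pointer. Where CET shadow stacks are not enabled, rdssp executes as
 * a NOP and leaves its input unchanged, so a zero seed staying zero is
 * the common "not enabled" outcome. */
int main(void) {
#ifdef __x86_64__
    unsigned long long ssp = _rdsspq(0);
    printf("shadow stack pointer: %#llx\n", ssp);
#endif
    return 0;
}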


@ -173,16 +173,24 @@
#define bit_AVX512VL 0x80000000
/* Features in %ecx for leaf 7 sub-leaf 0 */
#define bit_PREFTCHWT1 0x00000001
#define bit_AVX512VBMI 0x00000002
#define bit_PKU 0x00000004
#define bit_OSPKE 0x00000010
#define bit_PREFTCHWT1 0x00000001
#define bit_AVX512VBMI 0x00000002
#define bit_PKU 0x00000004
#define bit_OSPKE 0x00000010
#define bit_AVX512VBMI2 0x00000040
#define bit_SHSTK 0x00000080
#define bit_GFNI 0x00000100
#define bit_VAES 0x00000200
#define bit_VPCLMULQDQ 0x00000400
#define bit_AVX512VNNI 0x00000800
#define bit_AVX512BITALG 0x00001000
#define bit_AVX512VPOPCNTDQ 0x00004000
#define bit_RDPID 0x00400000
#define bit_RDPID 0x00400000
/* Features in %edx for leaf 7 sub-leaf 0 */
#define bit_AVX5124VNNIW 0x00000004
#define bit_AVX5124FMAPS 0x00000008
#define bit_IBT 0x00100000
/* Features in %eax for leaf 13 sub-leaf 1 */
#define bit_XSAVEOPT 0x00000001
@ -192,6 +200,7 @@
/* Features in %ecx for leaf 0x80000001 */
#define bit_LAHF_LM 0x00000001
#define bit_ABM 0x00000020
#define bit_LZCNT bit_ABM /* for gcc compat */
#define bit_SSE4a 0x00000040
#define bit_PRFCHW 0x00000100
#define bit_XOP 0x00000800
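A hedged sketch of testing the new leaf-7 %ecx bits; it assumes the __get_cpuid_count helper from this same cpuid.h, and main() is ours:

#include <cpuid.h>
#include <stdio.h>

/* Sketch: query leaf 7, sub-leaf 0, then test the newly added
 * feature bits in %ecx. */
int main(void) {
    unsigned eax, ebx, ecx, edx;
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        printf("VAES:        %s\n", (ecx & bit_VAES) ? "yes" : "no");
        printf("VPCLMULQDQ:  %s\n", (ecx & bit_VPCLMULQDQ) ? "yes" : "no");
        printf("AVX512VNNI:  %s\n", (ecx & bit_AVX512VNNI) ? "yes" : "no");
    }
    return 0;
}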


@ -80,7 +80,7 @@ min(const __T &__a, const __T &__b, __Cmp __cmp) {
template <class __T>
inline __device__ const __T &
min(const __T &__a, const __T &__b) {
return __a < __b ? __b : __a;
return __a < __b ? __a : __b;
}
#ifdef _LIBCPP_END_NAMESPACE_STD
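The one-character fix above flips a ternary that made the CUDA wrapper's min() return the larger operand; the shape of the bug in plain C:

#include <stdio.h>

/* Plain-C sketch of the bug fixed above: the old ternary picked the
 * wrong arm, so "min" returned the larger value. */
static int min_old(int a, int b) { return a < b ? b : a; }  /* buggy */
static int min_new(int a, int b) { return a < b ? a : b; }  /* fixed */

int main(void) {
    printf("old min(1, 2) = %d\n", min_old(1, 2));  /* prints 2 */
    printf("new min(1, 2) = %d\n", min_new(1, 2));  /* prints 1 */
    return 0;
}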


@ -217,8 +217,8 @@ _mm_div_pd(__m128d __a, __m128d __b)
/// \brief Calculates the square root of the lower double-precision value of
/// the second operand and returns it in the lower 64 bits of the result.
/// The upper 64 bits of the result are copied from the upper double-
/// precision value of the first operand.
/// The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// \headerfile <x86intrin.h>
///
@ -260,8 +260,8 @@ _mm_sqrt_pd(__m128d __a)
/// \brief Compares lower 64-bit double-precision values of both operands, and
/// returns the lesser of the pair of values in the lower 64-bits of the
/// result. The upper 64 bits of the result are copied from the upper double-
/// precision value of the first operand.
/// result. The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// \headerfile <x86intrin.h>
///
@ -304,8 +304,8 @@ _mm_min_pd(__m128d __a, __m128d __b)
/// \brief Compares lower 64-bit double-precision values of both operands, and
/// returns the greater of the pair of values in the lower 64-bits of the
/// result. The upper 64 bits of the result are copied from the upper double-
/// precision value of the first operand.
/// result. The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// \headerfile <x86intrin.h>
///
@ -983,8 +983,10 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b)
}
/// \brief Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality. The
/// comparison yields 0 for false, 1 for true.
/// the two 128-bit floating-point vectors of [2 x double] for equality.
///
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -996,7 +998,8 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a, __m128d __b)
{
@ -1008,7 +1011,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b)
/// the value in the first parameter is less than the corresponding value in
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1020,7 +1024,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a, __m128d __b)
{
@ -1032,7 +1037,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b)
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1044,7 +1050,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a, __m128d __b)
{
@ -1056,7 +1063,8 @@ _mm_comile_sd(__m128d __a, __m128d __b)
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1068,7 +1076,8 @@ _mm_comile_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a, __m128d __b)
{
@ -1080,7 +1089,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b)
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1092,7 +1102,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a, __m128d __b)
{
@ -1104,7 +1115,8 @@ _mm_comige_sd(__m128d __a, __m128d __b)
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true.
/// The comparison yields 0 for false, 1 for true. If either of the two
/// lower double-precision values is NaN, 1 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1116,7 +1128,8 @@ _mm_comige_sd(__m128d __a, __m128d __b)
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a, __m128d __b)
{
@ -1127,7 +1140,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b)
/// the two 128-bit floating-point vectors of [2 x double] for equality. The
/// comparison yields 0 for false, 1 for true.
///
/// If either of the two lower double-precision values is NaN, 1 is returned.
/// If either of the two lower double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1140,7 +1153,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b)
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 1 is returned.
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a, __m128d __b)
{
@ -1153,7 +1166,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b)
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 1 is returned.
/// double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1166,7 +1179,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b)
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 1 is returned.
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_sd(__m128d __a, __m128d __b)
{
@ -1179,7 +1192,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b)
/// corresponding value in the second parameter.
///
/// The comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 1 is returned.
/// double-precision values is NaN, 0 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1192,7 +1205,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b)
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results. If either of the two
/// lower double-precision values is NaN, 1 is returned.
/// lower double-precision values is NaN, 0 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_sd(__m128d __a, __m128d __b)
{
@ -1257,7 +1270,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b)
/// the second parameter.
///
/// The comparison yields 0 for false, 1 for true. If either of the two lower
/// double-precision values is NaN, 0 is returned.
/// double-precision values is NaN, 1 is returned.
///
/// \headerfile <x86intrin.h>
///
@ -1270,7 +1283,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b)
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison result. If either of the two
/// lower double-precision values is NaN, 0 is returned.
/// lower double-precision values is NaN, 1 is returned.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_sd(__m128d __a, __m128d __b)
{
@ -1935,14 +1948,15 @@ _mm_store_pd(double *__dp, __m128d __a)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c>VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
/// This intrinsic corresponds to the
/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
///
/// \param __dp
/// A pointer to a memory location that can store two double-precision
/// values.
/// \param __a
/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
/// of the values in \a dp.
/// of the values in \a __dp.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_pd(double *__dp, __m128d __a)
{
@ -1950,18 +1964,20 @@ _mm_store1_pd(double *__dp, __m128d __a)
_mm_store_pd(__dp, __a);
}
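A sketch of the duplicating-store semantics that the corrected comments below describe; main() and the aligned buffer are ours:

#include <emmintrin.h>
#include <stdio.h>

/* Sketch: _mm_store1_pd (and the _mm_store_pd1 alias documented below)
 * writes the LOW double of the vector into BOTH 16-byte-aligned slots. */
int main(void) {
    __m128d v = _mm_set_pd(2.0, 1.0);    /* high = 2.0, low = 1.0 */
    double out[2] __attribute__((aligned(16)));
    _mm_store1_pd(out, v);
    printf("%f %f\n", out[0], out[1]);   /* 1.000000 1.000000 */
    return 0;
}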
/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory
/// location.
/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
/// the upper and lower 64 bits of a memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
/// This intrinsic corresponds to the
/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
///
/// \param __dp
/// A pointer to a 128-bit memory location. The address of the memory
/// location has to be 16-byte aligned.
/// A pointer to a memory location that can store two double-precision
/// values.
/// \param __a
/// A 128-bit vector of [2 x double] containing the values to be stored.
/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
/// of the values in \a __dp.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd1(double *__dp, __m128d __a)
{
@ -3846,8 +3862,7 @@ _mm_set1_epi8(char __b)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
/// instruction.
/// This intrinsic does not correspond to a specific instruction.
///
/// \param __q0
/// A 64-bit integral value used to initialize the lower 64 bits of the
@ -4018,7 +4033,7 @@ _mm_storeu_si128(__m128i *__p, __m128i __b)
/// specified unaligned memory location. When a mask bit is 1, the
/// corresponding byte is written, otherwise it is not written.
///
/// To minimize caching, the date is flagged as non-temporal (unlikely to be
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon). Exception and trap behavior for elements not selected
/// for storage to memory are implementation dependent.
///
@ -4532,8 +4547,8 @@ _mm_unpackhi_epi32(__m128i __a, __m128i __b)
return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
}
/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
/// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of
/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
///
/// \headerfile <x86intrin.h>
///
@ -4665,7 +4680,7 @@ _mm_unpacklo_epi64(__m128i __a, __m128i __b)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
///
/// \param __a
/// A 128-bit integer vector operand. The lower 64 bits are moved to the
@ -4682,7 +4697,7 @@ _mm_movepi64_pi64(__m128i __a)
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction.
/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
///
/// \param __a
/// A 64-bit value.
@ -4712,8 +4727,8 @@ _mm_move_epi64(__m128i __a)
return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
}
/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of
/// [2 x double] and interleaves them into a 128-bit vector of [2 x
/// double].
///
/// \headerfile <x86intrin.h>
@ -4733,7 +4748,7 @@ _mm_unpackhi_pd(__m128d __a, __m128d __b)
return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
}
/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors
/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
/// double].
///
@ -4792,9 +4807,9 @@ _mm_movemask_pd(__m128d __a)
/// A 128-bit vector of [2 x double].
/// \param i
/// An 8-bit immediate value. The least significant two bits specify which
/// elements to copy from a and b: \n
/// Bit[0] = 0: lower element of a copied to lower element of result. \n
/// Bit[0] = 1: upper element of a copied to lower element of result. \n
/// elements to copy from \a a and \a b: \n
/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
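The immediate bits described above belong to _mm_shuffle_pd; a sketch with i = 1 (main() and the expected output are ours):

#include <emmintrin.h>
#include <stdio.h>

/* Sketch: i = 1 means Bit[0]=1 (upper element of a -> low result slot)
 * and Bit[1]=0 (lower element of b -> high result slot). */
int main(void) {
    __m128d a = _mm_set_pd(11.0, 10.0);    /* a = [10.0, 11.0] */
    __m128d b = _mm_set_pd(21.0, 20.0);    /* b = [20.0, 21.0] */
    __m128d r = _mm_shuffle_pd(a, b, 1);
    double out[2];
    _mm_storeu_pd(out, r);
    printf("%f %f\n", out[0], out[1]);     /* 11.000000 20.000000 */
    return 0;
}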


@ -60,73 +60,73 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@ -144,13 +144,13 @@ _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@ -168,37 +168,37 @@ _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -216,13 +216,13 @@ _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS
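These FMA4 forms are now lowered through the plain vfmadd builtins with negated operands instead of dedicated vfmsub/vfnmadd/vfnmsub builtins. A minimal sketch (hypothetical test file, not part of the commit; assumes an FMA4-capable CPU and a flag such as -mfma4) checking the _mm_msub_ps identity a*b - c:

/* Hypothetical check that _mm_msub_ps(a, b, c) == a*b - c per lane. */
#include <x86intrin.h>
#include <stdio.h>

int main(void) {
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* lane 0 holds 1.0f */
    __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);
    __m128 c = _mm_set1_ps(1.0f);
    float r[4];
    _mm_storeu_ps(r, _mm_msub_ps(a, b, c));
    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);  /* 4 11 20 31 */
    return 0;
}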

c_headers/fmaintrin.h

@@ -46,85 +46,85 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -142,13 +142,13 @@ _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -166,37 +166,37 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -214,13 +214,13 @@ _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS
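The FMA3 scalar forms now call the new vfmaddss3/vfmaddsd3 builtins, and negated-operand rewrites replace the old vfnm*/vfms* builtins here as well. A hedged sketch (hypothetical test, assumes FMA3 hardware and e.g. -mfma) of the _mm_fnmadd_ss contract: lane 0 receives -(a0*b0) + c0 and the upper lanes pass through from __A:

/* Hypothetical check of scalar FNMADD: lane 0 = -(2*3)+10 = 4,
   lanes 1..3 are copied unchanged from the first operand. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128 a = _mm_set_ps(40.0f, 30.0f, 20.0f, 2.0f);  /* lane 0 holds 2.0f */
    __m128 b = _mm_set1_ps(3.0f);
    __m128 c = _mm_set1_ps(10.0f);
    float r[4];
    _mm_storeu_ps(r, _mm_fnmadd_ss(a, b, c));
    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);  /* 4 20 30 40 */
    return 0;
}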

c_headers/gfniintrin.h (new file)

@@ -0,0 +1,202 @@
/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __GFNIINTRIN_H
#define __GFNIINTRIN_H
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I)); })
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)); })
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I); })
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I)); })
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)); })
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I); })
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I)); })
#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S)); })
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_qi(), \
U, A, B, I); })
#define _mm_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I)); })
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)); })
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I); })
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I)); })
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)); })
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I); })
#define _mm512_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I)); })
#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S)); })
#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_qi(), \
U, A, B, I); })
/* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni")))
/* Default attributes for ZMM forms. */
#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni")))
/* Default attributes for VLX forms. */
#define __DEFAULT_FN_ATTRS_VL __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni")))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
(__v16qi) __B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_selectb_128(__U,
(__v16qi) _mm_gf2p8mul_epi8(__A, __B),
(__v16qi) __S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
__U, __A, __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
(__v32qi) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_selectb_256(__U,
(__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
(__v32qi) __S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
__U, __A, __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
(__v64qi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512(__U,
(__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
(__v64qi) __S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_qi(),
__U, __A, __B);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_F
#undef __DEFAULT_FN_ATTRS_VL
#endif // __GFNIINTRIN_H
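_mm_gf2p8mul_epi8 multiplies corresponding bytes as elements of GF(2^8) reduced by the AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11B). A hedged sketch (hypothetical test, assumes GFNI hardware and e.g. -mgfni):

/* Hypothetical GF(2^8) sanity check: multiplying by 1 is the identity,
   and 0x80 * 0x02 overflows into the reduction, 0x100 ^ 0x11B = 0x1B. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m128i a   = _mm_set1_epi8((char)0x80);
    __m128i one = _mm_set1_epi8(1);
    __m128i two = _mm_set1_epi8(2);
    unsigned char r[16];
    _mm_storeu_si128((__m128i *)r, _mm_gf2p8mul_epi8(a, one));
    printf("0x%02x\n", r[0]);  /* 0x80 */
    _mm_storeu_si128((__m128i *)r, _mm_gf2p8mul_epi8(a, two));
    printf("0x%02x\n", r[0]);  /* 0x1b */
    return 0;
}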

c_headers/immintrin.h

@@ -118,6 +118,10 @@ _mm256_cvtph_ps(__m128i __a)
}
#endif /* __AVX2__ */
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__)
#include <vpclmulqdqintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
#include <bmiintrin.h>
#endif
@@ -146,6 +150,10 @@ _mm256_cvtph_ps(__m128i __a)
#include <avx512bwintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BITALG__)
#include <avx512bitalgintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
#include <avx512cdintrin.h>
#endif
@@ -154,10 +162,29 @@ _mm256_cvtph_ps(__m128i __a)
#include <avx512vpopcntdqintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
#include <avx512vpopcntdqvlintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VNNI__)
#include <avx512vnniintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512VNNI__))
#include <avx512vlvnniintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
#include <avx512dqintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512BITALG__))
#include <avx512vlbitalgintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512BW__))
#include <avx512vlbwintrin.h>
@@ -195,6 +222,15 @@ _mm256_cvtph_ps(__m128i __a)
#include <avx512vbmivlintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI2__)
#include <avx512vbmi2intrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VBMI2__) && defined(__AVX512VL__))
#include <avx512vlvbmi2intrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
#include <avx512pfintrin.h>
#endif
@@ -203,6 +239,14 @@ _mm256_cvtph_ps(__m128i __a)
#include <pkuintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VAES__)
#include <vaesintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__GFNI__)
#include <gfniintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand16_step(unsigned short *__p)
@@ -319,6 +363,10 @@ _writegsbase_u64(unsigned long long __V)
#include <xsavesintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHSTK__)
#include <cetintrin.h>
#endif
/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
* whereas others are also available at all times. */
#include <adxintrin.h>
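Each newly wired sub-header is only pulled in when its feature macro is defined (or under MSVC/modules), so code including the umbrella header typically guards its uses with the same macro. A minimal sketch, assuming a translation unit built with and without -mgfni:

/* Hypothetical usage sketch: <immintrin.h> is always safe to include;
   the GFNI intrinsic is only referenced when __GFNI__ is defined,
   mirroring the gate immintrin.h itself uses. */
#include <immintrin.h>

#if defined(__GFNI__)
__m128i gf_mul_bytes(__m128i a, __m128i b) {
    return _mm_gf2p8mul_epi8(a, b);
}
#endif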

c_headers/opencl-c.h

@@ -15886,6 +15886,313 @@ double __ovld __conv sub_group_scan_inclusive_max(double x);
#endif //cl_khr_subgroups cl_intel_subgroups
#if defined(cl_intel_subgroups)
// Intel-Specific Sub Group Functions
float __ovld __conv intel_sub_group_shuffle( float x, uint c );
float2 __ovld __conv intel_sub_group_shuffle( float2 x, uint c );
float3 __ovld __conv intel_sub_group_shuffle( float3 x, uint c );
float4 __ovld __conv intel_sub_group_shuffle( float4 x, uint c );
float8 __ovld __conv intel_sub_group_shuffle( float8 x, uint c );
float16 __ovld __conv intel_sub_group_shuffle( float16 x, uint c );
int __ovld __conv intel_sub_group_shuffle( int x, uint c );
int2 __ovld __conv intel_sub_group_shuffle( int2 x, uint c );
int3 __ovld __conv intel_sub_group_shuffle( int3 x, uint c );
int4 __ovld __conv intel_sub_group_shuffle( int4 x, uint c );
int8 __ovld __conv intel_sub_group_shuffle( int8 x, uint c );
int16 __ovld __conv intel_sub_group_shuffle( int16 x, uint c );
uint __ovld __conv intel_sub_group_shuffle( uint x, uint c );
uint2 __ovld __conv intel_sub_group_shuffle( uint2 x, uint c );
uint3 __ovld __conv intel_sub_group_shuffle( uint3 x, uint c );
uint4 __ovld __conv intel_sub_group_shuffle( uint4 x, uint c );
uint8 __ovld __conv intel_sub_group_shuffle( uint8 x, uint c );
uint16 __ovld __conv intel_sub_group_shuffle( uint16 x, uint c );
long __ovld __conv intel_sub_group_shuffle( long x, uint c );
ulong __ovld __conv intel_sub_group_shuffle( ulong x, uint c );
float __ovld __conv intel_sub_group_shuffle_down( float cur, float next, uint c );
float2 __ovld __conv intel_sub_group_shuffle_down( float2 cur, float2 next, uint c );
float3 __ovld __conv intel_sub_group_shuffle_down( float3 cur, float3 next, uint c );
float4 __ovld __conv intel_sub_group_shuffle_down( float4 cur, float4 next, uint c );
float8 __ovld __conv intel_sub_group_shuffle_down( float8 cur, float8 next, uint c );
float16 __ovld __conv intel_sub_group_shuffle_down( float16 cur, float16 next, uint c );
int __ovld __conv intel_sub_group_shuffle_down( int cur, int next, uint c );
int2 __ovld __conv intel_sub_group_shuffle_down( int2 cur, int2 next, uint c );
int3 __ovld __conv intel_sub_group_shuffle_down( int3 cur, int3 next, uint c );
int4 __ovld __conv intel_sub_group_shuffle_down( int4 cur, int4 next, uint c );
int8 __ovld __conv intel_sub_group_shuffle_down( int8 cur, int8 next, uint c );
int16 __ovld __conv intel_sub_group_shuffle_down( int16 cur, int16 next, uint c );
uint __ovld __conv intel_sub_group_shuffle_down( uint cur, uint next, uint c );
uint2 __ovld __conv intel_sub_group_shuffle_down( uint2 cur, uint2 next, uint c );
uint3 __ovld __conv intel_sub_group_shuffle_down( uint3 cur, uint3 next, uint c );
uint4 __ovld __conv intel_sub_group_shuffle_down( uint4 cur, uint4 next, uint c );
uint8 __ovld __conv intel_sub_group_shuffle_down( uint8 cur, uint8 next, uint c );
uint16 __ovld __conv intel_sub_group_shuffle_down( uint16 cur, uint16 next, uint c );
long __ovld __conv intel_sub_group_shuffle_down( long prev, long cur, uint c );
ulong __ovld __conv intel_sub_group_shuffle_down( ulong prev, ulong cur, uint c );
float __ovld __conv intel_sub_group_shuffle_up( float prev, float cur, uint c );
float2 __ovld __conv intel_sub_group_shuffle_up( float2 prev, float2 cur, uint c );
float3 __ovld __conv intel_sub_group_shuffle_up( float3 prev, float3 cur, uint c );
float4 __ovld __conv intel_sub_group_shuffle_up( float4 prev, float4 cur, uint c );
float8 __ovld __conv intel_sub_group_shuffle_up( float8 prev, float8 cur, uint c );
float16 __ovld __conv intel_sub_group_shuffle_up( float16 prev, float16 cur, uint c );
int __ovld __conv intel_sub_group_shuffle_up( int prev, int cur, uint c );
int2 __ovld __conv intel_sub_group_shuffle_up( int2 prev, int2 cur, uint c );
int3 __ovld __conv intel_sub_group_shuffle_up( int3 prev, int3 cur, uint c );
int4 __ovld __conv intel_sub_group_shuffle_up( int4 prev, int4 cur, uint c );
int8 __ovld __conv intel_sub_group_shuffle_up( int8 prev, int8 cur, uint c );
int16 __ovld __conv intel_sub_group_shuffle_up( int16 prev, int16 cur, uint c );
uint __ovld __conv intel_sub_group_shuffle_up( uint prev, uint cur, uint c );
uint2 __ovld __conv intel_sub_group_shuffle_up( uint2 prev, uint2 cur, uint c );
uint3 __ovld __conv intel_sub_group_shuffle_up( uint3 prev, uint3 cur, uint c );
uint4 __ovld __conv intel_sub_group_shuffle_up( uint4 prev, uint4 cur, uint c );
uint8 __ovld __conv intel_sub_group_shuffle_up( uint8 prev, uint8 cur, uint c );
uint16 __ovld __conv intel_sub_group_shuffle_up( uint16 prev, uint16 cur, uint c );
long __ovld __conv intel_sub_group_shuffle_up( long prev, long cur, uint c );
ulong __ovld __conv intel_sub_group_shuffle_up( ulong prev, ulong cur, uint c );
float __ovld __conv intel_sub_group_shuffle_xor( float x, uint c );
float2 __ovld __conv intel_sub_group_shuffle_xor( float2 x, uint c );
float3 __ovld __conv intel_sub_group_shuffle_xor( float3 x, uint c );
float4 __ovld __conv intel_sub_group_shuffle_xor( float4 x, uint c );
float8 __ovld __conv intel_sub_group_shuffle_xor( float8 x, uint c );
float16 __ovld __conv intel_sub_group_shuffle_xor( float16 x, uint c );
int __ovld __conv intel_sub_group_shuffle_xor( int x, uint c );
int2 __ovld __conv intel_sub_group_shuffle_xor( int2 x, uint c );
int3 __ovld __conv intel_sub_group_shuffle_xor( int3 x, uint c );
int4 __ovld __conv intel_sub_group_shuffle_xor( int4 x, uint c );
int8 __ovld __conv intel_sub_group_shuffle_xor( int8 x, uint c );
int16 __ovld __conv intel_sub_group_shuffle_xor( int16 x, uint c );
uint __ovld __conv intel_sub_group_shuffle_xor( uint x, uint c );
uint2 __ovld __conv intel_sub_group_shuffle_xor( uint2 x, uint c );
uint3 __ovld __conv intel_sub_group_shuffle_xor( uint3 x, uint c );
uint4 __ovld __conv intel_sub_group_shuffle_xor( uint4 x, uint c );
uint8 __ovld __conv intel_sub_group_shuffle_xor( uint8 x, uint c );
uint16 __ovld __conv intel_sub_group_shuffle_xor( uint16 x, uint c );
long __ovld __conv intel_sub_group_shuffle_xor( long x, uint c );
ulong __ovld __conv intel_sub_group_shuffle_xor( ulong x, uint c );
uint __ovld __conv intel_sub_group_block_read( read_only image2d_t image, int2 coord );
uint2 __ovld __conv intel_sub_group_block_read2( read_only image2d_t image, int2 coord );
uint4 __ovld __conv intel_sub_group_block_read4( read_only image2d_t image, int2 coord );
uint8 __ovld __conv intel_sub_group_block_read8( read_only image2d_t image, int2 coord );
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read(read_write image2d_t image, int2 coord);
uint2 __ovld __conv intel_sub_group_block_read2(read_write image2d_t image, int2 coord);
uint4 __ovld __conv intel_sub_group_block_read4(read_write image2d_t image, int2 coord);
uint8 __ovld __conv intel_sub_group_block_read8(read_write image2d_t image, int2 coord);
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read( const __global uint* p );
uint2 __ovld __conv intel_sub_group_block_read2( const __global uint* p );
uint4 __ovld __conv intel_sub_group_block_read4( const __global uint* p );
uint8 __ovld __conv intel_sub_group_block_read8( const __global uint* p );
void __ovld __conv intel_sub_group_block_write(write_only image2d_t image, int2 coord, uint data);
void __ovld __conv intel_sub_group_block_write2(write_only image2d_t image, int2 coord, uint2 data);
void __ovld __conv intel_sub_group_block_write4(write_only image2d_t image, int2 coord, uint4 data);
void __ovld __conv intel_sub_group_block_write8(write_only image2d_t image, int2 coord, uint8 data);
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write(read_write image2d_t image, int2 coord, uint data);
void __ovld __conv intel_sub_group_block_write2(read_write image2d_t image, int2 coord, uint2 data);
void __ovld __conv intel_sub_group_block_write4(read_write image2d_t image, int2 coord, uint4 data);
void __ovld __conv intel_sub_group_block_write8(read_write image2d_t image, int2 coord, uint8 data);
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write( __global uint* p, uint data );
void __ovld __conv intel_sub_group_block_write2( __global uint* p, uint2 data );
void __ovld __conv intel_sub_group_block_write4( __global uint* p, uint4 data );
void __ovld __conv intel_sub_group_block_write8( __global uint* p, uint8 data );
#ifdef cl_khr_fp16
half __ovld __conv intel_sub_group_shuffle( half x, uint c );
half __ovld __conv intel_sub_group_shuffle_down( half prev, half cur, uint c );
half __ovld __conv intel_sub_group_shuffle_up( half prev, half cur, uint c );
half __ovld __conv intel_sub_group_shuffle_xor( half x, uint c );
#endif
#if defined(cl_khr_fp64)
double __ovld __conv intel_sub_group_shuffle( double x, uint c );
double __ovld __conv intel_sub_group_shuffle_down( double prev, double cur, uint c );
double __ovld __conv intel_sub_group_shuffle_up( double prev, double cur, uint c );
double __ovld __conv intel_sub_group_shuffle_xor( double x, uint c );
#endif
#endif //cl_intel_subgroups
#if defined(cl_intel_subgroups_short)
short __ovld __conv intel_sub_group_broadcast( short x, uint sub_group_local_id );
short2 __ovld __conv intel_sub_group_broadcast( short2 x, uint sub_group_local_id );
short3 __ovld __conv intel_sub_group_broadcast( short3 x, uint sub_group_local_id );
short4 __ovld __conv intel_sub_group_broadcast( short4 x, uint sub_group_local_id );
short8 __ovld __conv intel_sub_group_broadcast( short8 x, uint sub_group_local_id );
ushort __ovld __conv intel_sub_group_broadcast( ushort x, uint sub_group_local_id );
ushort2 __ovld __conv intel_sub_group_broadcast( ushort2 x, uint sub_group_local_id );
ushort3 __ovld __conv intel_sub_group_broadcast( ushort3 x, uint sub_group_local_id );
ushort4 __ovld __conv intel_sub_group_broadcast( ushort4 x, uint sub_group_local_id );
ushort8 __ovld __conv intel_sub_group_broadcast( ushort8 x, uint sub_group_local_id );
short __ovld __conv intel_sub_group_shuffle( short x, uint c );
short2 __ovld __conv intel_sub_group_shuffle( short2 x, uint c );
short3 __ovld __conv intel_sub_group_shuffle( short3 x, uint c );
short4 __ovld __conv intel_sub_group_shuffle( short4 x, uint c );
short8 __ovld __conv intel_sub_group_shuffle( short8 x, uint c );
short16 __ovld __conv intel_sub_group_shuffle( short16 x, uint c);
ushort __ovld __conv intel_sub_group_shuffle( ushort x, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle( ushort2 x, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle( ushort3 x, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle( ushort4 x, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle( ushort8 x, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle( ushort16 x, uint c );
short __ovld __conv intel_sub_group_shuffle_down( short cur, short next, uint c );
short2 __ovld __conv intel_sub_group_shuffle_down( short2 cur, short2 next, uint c );
short3 __ovld __conv intel_sub_group_shuffle_down( short3 cur, short3 next, uint c );
short4 __ovld __conv intel_sub_group_shuffle_down( short4 cur, short4 next, uint c );
short8 __ovld __conv intel_sub_group_shuffle_down( short8 cur, short8 next, uint c );
short16 __ovld __conv intel_sub_group_shuffle_down( short16 cur, short16 next, uint c );
ushort __ovld __conv intel_sub_group_shuffle_down( ushort cur, ushort next, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle_down( ushort2 cur, ushort2 next, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle_down( ushort3 cur, ushort3 next, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle_down( ushort4 cur, ushort4 next, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle_down( ushort8 cur, ushort8 next, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle_down( ushort16 cur, ushort16 next, uint c );
short __ovld __conv intel_sub_group_shuffle_up( short cur, short next, uint c );
short2 __ovld __conv intel_sub_group_shuffle_up( short2 cur, short2 next, uint c );
short3 __ovld __conv intel_sub_group_shuffle_up( short3 cur, short3 next, uint c );
short4 __ovld __conv intel_sub_group_shuffle_up( short4 cur, short4 next, uint c );
short8 __ovld __conv intel_sub_group_shuffle_up( short8 cur, short8 next, uint c );
short16 __ovld __conv intel_sub_group_shuffle_up( short16 cur, short16 next, uint c );
ushort __ovld __conv intel_sub_group_shuffle_up( ushort cur, ushort next, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle_up( ushort2 cur, ushort2 next, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle_up( ushort3 cur, ushort3 next, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle_up( ushort4 cur, ushort4 next, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle_up( ushort8 cur, ushort8 next, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle_up( ushort16 cur, ushort16 next, uint c );
short __ovld __conv intel_sub_group_shuffle_xor( short x, uint c );
short2 __ovld __conv intel_sub_group_shuffle_xor( short2 x, uint c );
short3 __ovld __conv intel_sub_group_shuffle_xor( short3 x, uint c );
short4 __ovld __conv intel_sub_group_shuffle_xor( short4 x, uint c );
short8 __ovld __conv intel_sub_group_shuffle_xor( short8 x, uint c );
short16 __ovld __conv intel_sub_group_shuffle_xor( short16 x, uint c );
ushort __ovld __conv intel_sub_group_shuffle_xor( ushort x, uint c );
ushort2 __ovld __conv intel_sub_group_shuffle_xor( ushort2 x, uint c );
ushort3 __ovld __conv intel_sub_group_shuffle_xor( ushort3 x, uint c );
ushort4 __ovld __conv intel_sub_group_shuffle_xor( ushort4 x, uint c );
ushort8 __ovld __conv intel_sub_group_shuffle_xor( ushort8 x, uint c );
ushort16 __ovld __conv intel_sub_group_shuffle_xor( ushort16 x, uint c );
short __ovld __conv intel_sub_group_reduce_add( short x );
ushort __ovld __conv intel_sub_group_reduce_add( ushort x );
short __ovld __conv intel_sub_group_reduce_min( short x );
ushort __ovld __conv intel_sub_group_reduce_min( ushort x );
short __ovld __conv intel_sub_group_reduce_max( short x );
ushort __ovld __conv intel_sub_group_reduce_max( ushort x );
short __ovld __conv intel_sub_group_scan_exclusive_add( short x );
ushort __ovld __conv intel_sub_group_scan_exclusive_add( ushort x );
short __ovld __conv intel_sub_group_scan_exclusive_min( short x );
ushort __ovld __conv intel_sub_group_scan_exclusive_min( ushort x );
short __ovld __conv intel_sub_group_scan_exclusive_max( short x );
ushort __ovld __conv intel_sub_group_scan_exclusive_max( ushort x );
short __ovld __conv intel_sub_group_scan_inclusive_add( short x );
ushort __ovld __conv intel_sub_group_scan_inclusive_add( ushort x );
short __ovld __conv intel_sub_group_scan_inclusive_min( short x );
ushort __ovld __conv intel_sub_group_scan_inclusive_min( ushort x );
short __ovld __conv intel_sub_group_scan_inclusive_max( short x );
ushort __ovld __conv intel_sub_group_scan_inclusive_max( ushort x );
uint __ovld __conv intel_sub_group_block_read_ui( read_only image2d_t image, int2 byte_coord );
uint2 __ovld __conv intel_sub_group_block_read_ui2( read_only image2d_t image, int2 byte_coord );
uint4 __ovld __conv intel_sub_group_block_read_ui4( read_only image2d_t image, int2 byte_coord );
uint8 __ovld __conv intel_sub_group_block_read_ui8( read_only image2d_t image, int2 byte_coord );
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read_ui( read_write image2d_t image, int2 byte_coord );
uint2 __ovld __conv intel_sub_group_block_read_ui2( read_write image2d_t image, int2 byte_coord );
uint4 __ovld __conv intel_sub_group_block_read_ui4( read_write image2d_t image, int2 byte_coord );
uint8 __ovld __conv intel_sub_group_block_read_ui8( read_write image2d_t image, int2 byte_coord );
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
uint __ovld __conv intel_sub_group_block_read_ui( const __global uint* p );
uint2 __ovld __conv intel_sub_group_block_read_ui2( const __global uint* p );
uint4 __ovld __conv intel_sub_group_block_read_ui4( const __global uint* p );
uint8 __ovld __conv intel_sub_group_block_read_ui8( const __global uint* p );
void __ovld __conv intel_sub_group_block_write_ui( read_only image2d_t image, int2 byte_coord, uint data );
void __ovld __conv intel_sub_group_block_write_ui2( read_only image2d_t image, int2 byte_coord, uint2 data );
void __ovld __conv intel_sub_group_block_write_ui4( read_only image2d_t image, int2 byte_coord, uint4 data );
void __ovld __conv intel_sub_group_block_write_ui8( read_only image2d_t image, int2 byte_coord, uint8 data );
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_ui( read_write image2d_t image, int2 byte_coord, uint data );
void __ovld __conv intel_sub_group_block_write_ui2( read_write image2d_t image, int2 byte_coord, uint2 data );
void __ovld __conv intel_sub_group_block_write_ui4( read_write image2d_t image, int2 byte_coord, uint4 data );
void __ovld __conv intel_sub_group_block_write_ui8( read_write image2d_t image, int2 byte_coord, uint8 data );
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_ui( __global uint* p, uint data );
void __ovld __conv intel_sub_group_block_write_ui2( __global uint* p, uint2 data );
void __ovld __conv intel_sub_group_block_write_ui4( __global uint* p, uint4 data );
void __ovld __conv intel_sub_group_block_write_ui8( __global uint* p, uint8 data );
ushort __ovld __conv intel_sub_group_block_read_us( read_only image2d_t image, int2 coord );
ushort2 __ovld __conv intel_sub_group_block_read_us2( read_only image2d_t image, int2 coord );
ushort4 __ovld __conv intel_sub_group_block_read_us4( read_only image2d_t image, int2 coord );
ushort8 __ovld __conv intel_sub_group_block_read_us8( read_only image2d_t image, int2 coord );
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
ushort __ovld __conv intel_sub_group_block_read_us(read_write image2d_t image, int2 coord);
ushort2 __ovld __conv intel_sub_group_block_read_us2(read_write image2d_t image, int2 coord);
ushort4 __ovld __conv intel_sub_group_block_read_us4(read_write image2d_t image, int2 coord);
ushort8 __ovld __conv intel_sub_group_block_read_us8(read_write image2d_t image, int2 coord);
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
ushort __ovld __conv intel_sub_group_block_read_us( const __global ushort* p );
ushort2 __ovld __conv intel_sub_group_block_read_us2( const __global ushort* p );
ushort4 __ovld __conv intel_sub_group_block_read_us4( const __global ushort* p );
ushort8 __ovld __conv intel_sub_group_block_read_us8( const __global ushort* p );
void __ovld __conv intel_sub_group_block_write_us(write_only image2d_t image, int2 coord, ushort data);
void __ovld __conv intel_sub_group_block_write_us2(write_only image2d_t image, int2 coord, ushort2 data);
void __ovld __conv intel_sub_group_block_write_us4(write_only image2d_t image, int2 coord, ushort4 data);
void __ovld __conv intel_sub_group_block_write_us8(write_only image2d_t image, int2 coord, ushort8 data);
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_us(read_write image2d_t image, int2 coord, ushort data);
void __ovld __conv intel_sub_group_block_write_us2(read_write image2d_t image, int2 coord, ushort2 data);
void __ovld __conv intel_sub_group_block_write_us4(read_write image2d_t image, int2 coord, ushort4 data);
void __ovld __conv intel_sub_group_block_write_us8(read_write image2d_t image, int2 coord, ushort8 data);
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
void __ovld __conv intel_sub_group_block_write_us( __global ushort* p, ushort data );
void __ovld __conv intel_sub_group_block_write_us2( __global ushort* p, ushort2 data );
void __ovld __conv intel_sub_group_block_write_us4( __global ushort* p, ushort4 data );
void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, ushort8 data );
#endif // cl_intel_subgroups_short
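These shuffle overloads exchange register values directly between sub-group lanes, with no local-memory staging. A hedged OpenCL C sketch in this header's own dialect (kernel name and data layout are illustrative; assumes a device exposing cl_intel_subgroups), reversing the values held by each sub-group:

#pragma OPENCL EXTENSION cl_intel_subgroups : enable

/* Each work-item reads its own element, then fetches the value held by
   the mirror lane of its sub-group via intel_sub_group_shuffle. */
__kernel void subgroup_reverse(__global float *data)
{
    float x   = data[get_global_id(0)];
    uint lane = get_sub_group_local_id();
    uint n    = get_sub_group_size();
    data[get_global_id(0)] = intel_sub_group_shuffle(x, n - 1u - lane);
}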
#ifdef cl_amd_media_ops
uint __ovld amd_bitalign(uint a, uint b, uint c);
uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c);

c_headers/pmmintrin.h

@@ -115,8 +115,8 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
}
/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
/// vector of [4 x float] to float values stored in a 128-bit vector of
/// \brief Moves and duplicates odd-indexed values from a 128-bit vector
/// of [4 x float] to float values stored in a 128-bit vector of
/// [4 x float].
///
/// \headerfile <x86intrin.h>
@@ -137,7 +137,7 @@ _mm_movehdup_ps(__m128 __a)
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}
/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
/// \brief Duplicates even-indexed values from a 128-bit vector of
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>

c_headers/smmintrin.h

@@ -648,7 +648,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// input vectors are used as an input for dot product; otherwise that input
/// is treated as zero. Bits [1:0] determine which elements of the result
/// will receive a copy of the final dot product, with bit [0] corresponding
/// to the lowest element and bit [3] corresponding to the highest element of
/// to the lowest element and bit [1] corresponding to the highest element of
/// each [2 x double] vector. If a bit is set, the dot product is returned in
/// the corresponding element; otherwise that element is set to zero.
#define _mm_dp_pd(X, Y, M) __extension__ ({\
@@ -866,8 +866,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
/// Bits[3:0]: If any of these bits are set, the corresponding result
/// element is cleared.
/// \returns A 128-bit vector of [4 x float] containing the copied single-
/// precision floating point elements from the operands.
/// \returns A 128-bit vector of [4 x float] containing the copied
/// single-precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and

c_headers/stdbool.h

@@ -32,12 +32,15 @@
#define true 1
#define false 0
#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
/* Define _Bool, bool, false, true as a GNU extension. */
/* Define _Bool as a GNU extension. */
#define _Bool bool
#if __cplusplus < 201103L
/* For C++98, define bool, false, true as a GNU extension. */
#define bool bool
#define false false
#define true true
#endif
#endif
#define __bool_true_false_are_defined 1

c_headers/vaesintrin.h (new file)

@@ -0,0 +1,98 @@
/*===------------------ vaesintrin.h - VAES intrinsics ---------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <vaesintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VAESINTRIN_H
#define __VAESINTRIN_H
/* Default attributes for YMM forms. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes")))
/* Default attributes for ZMM forms. */
#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes")))
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesenc_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesenc256((__v4di) __A,
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenc_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesdec_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesdec256((__v4di) __A,
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdec_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesenclast_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A,
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A,
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
(__v8di) __B);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_F
#endif
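VAES widens one AES round to every 128-bit lane of a YMM/ZMM register. A hedged sketch (hypothetical test, assumes VAES hardware and e.g. -mvaes -mavx2): both lanes hold the same state and round key, so both lanes of the result must match:

/* Hypothetical check: _mm256_aesenc_epi128 applies one AES encryption
   round independently per 128-bit lane, so identical lanes stay identical. */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    __m256i state = _mm256_set1_epi64x(0x0123456789abcdefLL);
    __m256i rkey  = _mm256_set1_epi64x(0x0f0e0d0c0b0a0908LL);
    unsigned char out[32];
    _mm256_storeu_si256((__m256i *)out, _mm256_aesenc_epi128(state, rkey));
    puts(memcmp(out, out + 16, 16) == 0 ? "lanes match" : "lanes differ");
    return 0;
}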

c_headers/vpclmulqdqintrin.h (new file)

@@ -0,0 +1,42 @@
/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VPCLMULQDQINTRIN_H
#define __VPCLMULQDQINTRIN_H
#define _mm256_clmulepi64_epi128(A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), \
(char)(I)); })
#define _mm512_clmulepi64_epi128(A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
(char)(I)); })
#endif // __VPCLMULQDQINTRIN_H
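As with PCLMULQDQ, bits 0 and 4 of the immediate select the low or high quadword of each operand, but the carry-less multiply now runs per 128-bit lane. A hedged sketch (hypothetical test, assumes VPCLMULQDQ hardware and e.g. -mvpclmulqdq -mavx2):

/* Hypothetical carry-less multiply check: in GF(2)[x],
   0b11 * 0b101 = 0b1111 (XOR of shifted partial products, no carries). */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_set1_epi64x(0x3);  /* low qword of each lane */
    __m256i b = _mm256_set1_epi64x(0x5);
    long long r[4];
    _mm256_storeu_si256((__m256i *)r,
                        _mm256_clmulepi64_epi128(a, b, 0x00));
    printf("0x%llx\n", (unsigned long long)r[0]);  /* 0xf */
    return 0;
}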

c_headers/xmmintrin.h

@@ -2035,9 +2035,11 @@ _mm_storer_ps(float *__p, __m128 __a)
_mm_store_ps(__p, __a);
}
#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0 3
#define _MM_HINT_T1 2
#define _MM_HINT_T2 1
#define _MM_HINT_NTA 0
#ifndef _MSC_VER
@@ -2068,7 +2070,8 @@ _mm_storer_ps(float *__p, __m128 __a)
/// be generated. \n
/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
/// be generated.
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), \
((sel) >> 2) & 1, (sel) & 0x3))
#endif
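The rewritten macro decodes both fields of the hint constant: bit 2 selects a write (ET) prefetch and bits [1:0] give the temporal locality, so _MM_HINT_ET0 (7) expands to __builtin_prefetch(p, 1, 3) where the old macro always passed 0 for the read/write argument. A hedged usage sketch:

/* Hypothetical usage: the same macro now emits read prefetches for the
   T hints and write prefetches for the new ET hints. */
#include <xmmintrin.h>

void warm_cache(const void *p) {
    _mm_prefetch((const char *)p, _MM_HINT_T0);   /* read,  locality 3 */
    _mm_prefetch((const char *)p, _MM_HINT_ET0);  /* write, locality 3 */
}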
/// \brief Stores a 64-bit integer in the specified aligned memory location. To