update C headers to clang 4.0.0

closes #389
master
Andrew Kelley 2017-06-16 14:35:00 -04:00
parent c0f9012bed
commit 91afdc58d2
81 changed files with 142058 additions and 4404 deletions

.gitignore

@@ -2,6 +2,7 @@ zig-cache/
build/
build-release/
build-windows/
build-llvm-5/
/.cproject
/.project
/.settings/

CMakeLists.txt

@@ -64,76 +64,6 @@ set(ZIG_SOURCES
"${CMAKE_SOURCE_DIR}/src/zig_llvm.cpp"
)
set(C_HEADERS
"${CMAKE_SOURCE_DIR}/c_headers/Intrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/__stddef_max_align_t.h"
"${CMAKE_SOURCE_DIR}/c_headers/__wmmintrin_aes.h"
"${CMAKE_SOURCE_DIR}/c_headers/__wmmintrin_pclmul.h"
"${CMAKE_SOURCE_DIR}/c_headers/adxintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/ammintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/arm_acle.h"
"${CMAKE_SOURCE_DIR}/c_headers/avx2intrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/avx512bwintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/avx512cdintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/avx512dqintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/avx512erintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/avx512fintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/avx512vlbwintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/avx512vldqintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/avx512vlintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/avxintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/bmi2intrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/bmiintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/cpuid.h"
"${CMAKE_SOURCE_DIR}/c_headers/cuda_builtin_vars.h"
"${CMAKE_SOURCE_DIR}/c_headers/emmintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/f16cintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/float.h"
"${CMAKE_SOURCE_DIR}/c_headers/fma4intrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/fmaintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/fxsrintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/htmintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/htmxlintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/ia32intrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/immintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/inttypes.h"
"${CMAKE_SOURCE_DIR}/c_headers/iso646.h"
"${CMAKE_SOURCE_DIR}/c_headers/limits.h"
"${CMAKE_SOURCE_DIR}/c_headers/lzcntintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/mm3dnow.h"
"${CMAKE_SOURCE_DIR}/c_headers/mm_malloc.h"
"${CMAKE_SOURCE_DIR}/c_headers/mmintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/nmmintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/pmmintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/popcntintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/prfchwintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/rdseedintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/rtmintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/s390intrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/shaintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/smmintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/stdalign.h"
"${CMAKE_SOURCE_DIR}/c_headers/stdarg.h"
"${CMAKE_SOURCE_DIR}/c_headers/stdatomic.h"
"${CMAKE_SOURCE_DIR}/c_headers/stdbool.h"
"${CMAKE_SOURCE_DIR}/c_headers/stddef.h"
"${CMAKE_SOURCE_DIR}/c_headers/stdint.h"
"${CMAKE_SOURCE_DIR}/c_headers/stdnoreturn.h"
"${CMAKE_SOURCE_DIR}/c_headers/tbmintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/tgmath.h"
"${CMAKE_SOURCE_DIR}/c_headers/tmmintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/unwind.h"
"${CMAKE_SOURCE_DIR}/c_headers/vadefs.h"
"${CMAKE_SOURCE_DIR}/c_headers/varargs.h"
"${CMAKE_SOURCE_DIR}/c_headers/vecintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/wmmintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/x86intrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/xmmintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/xopintrin.h"
"${CMAKE_SOURCE_DIR}/c_headers/xtestintrin.h"
)
set(ZIG_HOST_LINK_VERSION)
if (APPLE)
set(LD_V_OUTPUT)
@@ -198,7 +128,99 @@ if(MINGW)
endif()
install(TARGETS zig DESTINATION bin)
install(FILES ${C_HEADERS} DESTINATION ${C_HEADERS_DEST})
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_builtin_vars.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_cmath.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_complex_builtins.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_intrinsics.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_math_forward_declares.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_runtime_wrapper.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__stddef_max_align_t.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__wmmintrin_aes.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__wmmintrin_pclmul.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/adxintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/altivec.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/ammintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/arm_acle.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/arm_neon.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/armintr.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx2intrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512bwintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512cdintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512dqintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512erintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512fintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512ifmaintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512ifmavlintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512pfintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vbmiintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vbmivlintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlbwintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlcdintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vldqintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avxintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/bmi2intrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/bmiintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/clflushoptintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cpuid.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/algorithm" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/complex" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/new" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/emmintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/f16cintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/float.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/fma4intrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/fmaintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/fxsrintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/htmintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/htmxlintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/ia32intrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/immintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/intrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/inttypes.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/iso646.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/limits.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/lzcntintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mm3dnow.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mm_malloc.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mmintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/module.modulemap" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/msa.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mwaitxintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/nmmintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/opencl-c.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/pkuintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/pmmintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/popcntintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/prfchwintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/rdseedintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/rtmintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/s390intrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/shaintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/smmintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdalign.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdarg.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdatomic.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdbool.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stddef.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdint.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdnoreturn.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/tbmintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/tgmath.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/tmmintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/unwind.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/vadefs.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/varargs.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/vecintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/wmmintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/x86intrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xmmintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xopintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xsavecintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xsaveintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xsaveoptintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xsavesintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xtestintrin.h" DESTINATION "${C_HEADERS_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/std/array_list.zig" DESTINATION "${ZIG_STD_DEST}")
install(FILES "${CMAKE_SOURCE_DIR}/std/base64.zig" DESTINATION "${ZIG_STD_DEST}")

c_headers/__clang_cuda_builtin_vars.h

@@ -24,16 +24,20 @@
#ifndef __CUDA_BUILTIN_VARS_H
#define __CUDA_BUILTIN_VARS_H
// Forward declares from vector_types.h.
struct uint3;
struct dim3;
// The file implements built-in CUDA variables using __declspec(property).
// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
// All read accesses of built-in variable fields get converted into calls to a
// getter function which in turn would call appropriate builtin to fetch the
// getter function which in turn calls the appropriate builtin to fetch the
// value.
//
// Example:
// int x = threadIdx.x;
// IR output:
// %0 = call i32 @llvm.ptx.read.tid.x() #3
// %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
// PTX output:
// mov.u32 %r2, %tid.x;
@@ -60,33 +64,45 @@
__attribute__((device)) TypeName *operator&() const __DELETE
struct __cuda_builtin_threadIdx_t {
__CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_tid_x());
__CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_tid_y());
__CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_tid_z());
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_tid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_tid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
// threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
// uint3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
};
struct __cuda_builtin_blockIdx_t {
__CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ctaid_x());
__CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ctaid_y());
__CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ctaid_z());
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ctaid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ctaid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
// blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
// uint3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
};
struct __cuda_builtin_blockDim_t {
__CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ntid_x());
__CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ntid_y());
__CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ntid_z());
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ntid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ntid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ntid_z());
// blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
// dim3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator dim3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
};
struct __cuda_builtin_gridDim_t {
__CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_nctaid_x());
__CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_nctaid_y());
__CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_nctaid_z());
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_nctaid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_nctaid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_nctaid_z());
// gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
// dim3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator dim3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
};
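
For readers unfamiliar with the __declspec(property) mechanism the comments above describe, here is a minimal host-side sketch (hypothetical names, not part of this commit or of the clang headers) showing how a plain field read is rewritten into a getter call. It assumes clang with -fdeclspec (CUDA compilation enables this automatically).

// Sketch only: fake_threadIdx_t mimics the __CUDA_DEVICE_BUILTIN pattern above.
// Reading fake_threadIdx.x is lowered to a call to __fetch_x(), which is where
// the real header would invoke __nvvm_read_ptx_sreg_tid_x().
// Build (host side, no CUDA needed): clang++ -fdeclspec demo.cpp
#include <cstdio>

struct fake_threadIdx_t {
  __declspec(property(get = __fetch_x)) unsigned int x; // property, no storage
  static unsigned int __fetch_x() { return 7; }         // stand-in for the NVVM builtin
};

static fake_threadIdx_t fake_threadIdx;

int main() {
  unsigned int v = fake_threadIdx.x; // compiled as fake_threadIdx_t::__fetch_x()
  std::printf("%u\n", v);
  return 0;
}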

c_headers/__clang_cuda_cmath.h

@@ -0,0 +1,487 @@
/*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_CMATH_H__
#define __CLANG_CUDA_CMATH_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
#include <limits>
// CUDA lets us use various std math functions on the device side. This file
// works in concert with __clang_cuda_math_forward_declares.h to make this work.
//
// Specifically, the forward-declares header declares __device__ overloads for
// these functions in the global namespace, then pulls them into namespace std
// with 'using' statements. Then this file implements those functions, after
// their implementations have been pulled in.
//
// It's important that we declare the functions in the global namespace and pull
// them into namespace std with using statements, as opposed to simply declaring
// these functions in namespace std, because our device functions need to
// overload the standard library functions, which may be declared in the global
// namespace or in std, depending on the degree of conformance of the stdlib
// implementation. Declaring in the global namespace and pulling into namespace
// std covers all of the known knowns.
#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
__DEVICE__ long abs(long __n) { return ::labs(__n); }
__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
__DEVICE__ double abs(double __x) { return ::fabs(__x); }
__DEVICE__ float acos(float __x) { return ::acosf(__x); }
__DEVICE__ float asin(float __x) { return ::asinf(__x); }
__DEVICE__ float atan(float __x) { return ::atanf(__x); }
__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
__DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
__DEVICE__ float cos(float __x) { return ::cosf(__x); }
__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
__DEVICE__ float exp(float __x) { return ::expf(__x); }
__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
__DEVICE__ float floor(float __x) { return ::floorf(__x); }
__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
__DEVICE__ int fpclassify(float __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
__DEVICE__ int fpclassify(double __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
__DEVICE__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
// For inscrutable reasons, the CUDA headers define these functions for us on
// Windows.
#ifndef _MSC_VER
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
// For inscrutable reasons, __finite(), the double-precision version of
// __finitef, does not exist when compiling for MacOS. __isfinited is available
// everywhere and is just as good.
__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
#endif
__DEVICE__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ bool isgreater(double __x, double __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ bool isgreaterequal(float __x, float __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ bool isgreaterequal(double __x, double __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ bool isless(float __x, float __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ bool isless(double __x, double __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ bool islessequal(float __x, float __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ bool islessequal(double __x, double __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ bool islessgreater(float __x, float __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ bool islessgreater(double __x, double __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isunordered(float __x, float __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ bool isunordered(double __x, double __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ float ldexp(float __arg, int __exp) {
return ::ldexpf(__arg, __exp);
}
__DEVICE__ float log(float __x) { return ::logf(__x); }
__DEVICE__ float log10(float __x) { return ::log10f(__x); }
__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
__DEVICE__ float nexttoward(float __from, double __to) {
return __builtin_nexttowardf(__from, __to);
}
__DEVICE__ double nexttoward(double __from, double __to) {
return __builtin_nexttoward(__from, __to);
}
__DEVICE__ float nexttowardf(float __from, double __to) {
return __builtin_nexttowardf(__from, __to);
}
__DEVICE__ float pow(float __base, float __exp) {
return ::powf(__base, __exp);
}
__DEVICE__ float pow(float __base, int __iexp) {
return ::powif(__base, __iexp);
}
__DEVICE__ double pow(double __base, int __iexp) {
return ::powi(__base, __iexp);
}
__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
__DEVICE__ float sin(float __x) { return ::sinf(__x); }
__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
__DEVICE__ float tan(float __x) { return ::tanf(__x); }
__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
// Now we've defined everything we promised we'd define in
// __clang_cuda_math_forward_declares.h. We need to do two additional things to
// fix up our math functions.
//
// 1) Define __device__ overloads for e.g. sin(int). The CUDA headers define
// only sin(float) and sin(double), which means that e.g. sin(0) is
// ambiguous.
//
// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
// std. These are defined in the CUDA headers in the global namespace,
// independent of everything else we've done here.
// We can't use std::enable_if, because we want to be pre-C++11 compatible. But
// we go ahead and unconditionally define functions that are only available when
// compiling for C++11 to match the behavior of the CUDA headers.
template<bool __B, class __T = void>
struct __clang_cuda_enable_if {};
template <class __T> struct __clang_cuda_enable_if<true, __T> {
typedef __T type;
};
// Defines an overload of __fn that accepts one integral argument, calls
// __fn((double)x), and returns __retty.
#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn) \
template <typename __T> \
__DEVICE__ \
typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, \
__retty>::type \
__fn(__T __x) { \
return ::__fn((double)__x); \
}
// Defines an overload of __fn that accepts two arithmetic arguments, calls
// __fn((double)x, (double)y), and returns a double.
//
// Note this is different from OVERLOAD_1, which generates an overload that
// accepts only *integral* arguments.
#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ typename __clang_cuda_enable_if< \
std::numeric_limits<__T1>::is_specialized && \
std::numeric_limits<__T2>::is_specialized, \
__retty>::type \
__fn(__T1 __x, __T2 __y) { \
return __fn((double)__x, (double)__y); \
}
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc);
#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
// Overloads for functions that don't match the patterns expected by
// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ typename __clang_cuda_enable_if<
std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized &&
std::numeric_limits<__T3>::is_specialized,
double>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
return std::fma((double)__x, (double)__y, (double)__z);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
frexp(__T __x, int *__exp) {
return std::frexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
ldexp(__T __x, int __exp) {
return std::ldexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
nexttoward(__T __from, double __to) {
return std::nexttoward((double)__from, __to);
}
template <typename __T1, typename __T2>
__DEVICE__ typename __clang_cuda_enable_if<
std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized,
double>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
return std::remquo((double)__x, (double)__y, __quo);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
scalbln(__T __x, long __exp) {
return std::scalbln((double)__x, __exp);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
scalbn(__T __x, int __exp) {
return std::scalbn((double)__x, __exp);
}
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
// Pull the new overloads we defined above into namespace std.
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnormal;
using ::isunordered;
using ::ldexp;
using ::lgamma;
using ::llrint;
using ::llround;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::nearbyint;
using ::nextafter;
using ::nexttoward;
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
// Well this is fun: We need to pull these symbols in for libc++, but we can't
// pull them in with libstdc++, because its ::isinf and ::isnan are different
// than its std::isinf and std::isnan.
#ifndef __GLIBCXX__
using ::isinf;
using ::isnan;
#endif
// Finally, pull the "foobarf" functions that CUDA defines in its headers into
// namespace std.
using ::acosf;
using ::acoshf;
using ::asinf;
using ::asinhf;
using ::atan2f;
using ::atanf;
using ::atanhf;
using ::cbrtf;
using ::ceilf;
using ::copysignf;
using ::cosf;
using ::coshf;
using ::erfcf;
using ::erff;
using ::exp2f;
using ::expf;
using ::expm1f;
using ::fabsf;
using ::fdimf;
using ::floorf;
using ::fmaf;
using ::fmaxf;
using ::fminf;
using ::fmodf;
using ::frexpf;
using ::hypotf;
using ::ilogbf;
using ::ldexpf;
using ::lgammaf;
using ::llrintf;
using ::llroundf;
using ::log10f;
using ::log1pf;
using ::log2f;
using ::logbf;
using ::logf;
using ::lrintf;
using ::lroundf;
using ::modff;
using ::nearbyintf;
using ::nextafterf;
using ::nexttowardf;
using ::powf;
using ::remainderf;
using ::remquof;
using ::rintf;
using ::roundf;
using ::scalblnf;
using ::scalbnf;
using ::sinf;
using ::sinhf;
using ::sqrtf;
using ::tanf;
using ::tanhf;
using ::tgammaf;
using ::truncf;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#undef __DEVICE__
#endif
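
The pre-C++11 enable_if trick described in the comments above (a hand-rolled __clang_cuda_enable_if keyed on std::numeric_limits<T>::is_integer) can be seen in isolation with a small host-only sketch; my_enable_if and my_sin are hypothetical names, and an ordinary C++ compiler is assumed rather than CUDA.

// Sketch of the __CUDA_CLANG_FN_INTEGER_OVERLOAD_1 idea: add an overload that
// participates only for integer arguments and forwards to the double version,
// so calls like sin(0) are no longer ambiguous between sin(float)/sin(double).
#include <cmath>
#include <cstdio>
#include <limits>

template <bool B, class T = void> struct my_enable_if {};
template <class T> struct my_enable_if<true, T> { typedef T type; };

template <typename T>
typename my_enable_if<std::numeric_limits<T>::is_integer, double>::type
my_sin(T x) {
  return std::sin((double)x); // promote the integer and reuse the double overload
}

int main() {
  std::printf("%f\n", my_sin(0)); // selects the integer overload; prints 0.000000
  return 0;
}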

c_headers/__clang_cuda_complex_builtins.h

@@ -0,0 +1,203 @@
/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
#define __CLANG_CUDA_COMPLEX_BUILTINS
// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. These are
// libgcc functions that clang assumes are available when compiling c99 complex
// operations. (These implementations come from libc++, and have been modified
// to work with CUDA.)
extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
double __c, double __d) {
double __ac = __a * __c;
double __bd = __b * __d;
double __ad = __a * __d;
double __bc = __b * __c;
double _Complex z;
__real__(z) = __ac - __bd;
__imag__(z) = __ad + __bc;
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
int __recalc = 0;
if (std::isinf(__a) || std::isinf(__b)) {
__a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
__b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
if (std::isnan(__c))
__c = std::copysign(0, __c);
if (std::isnan(__d))
__d = std::copysign(0, __d);
__recalc = 1;
}
if (std::isinf(__c) || std::isinf(__d)) {
__c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
__d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
if (std::isnan(__a))
__a = std::copysign(0, __a);
if (std::isnan(__b))
__b = std::copysign(0, __b);
__recalc = 1;
}
if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
std::isinf(__ad) || std::isinf(__bc))) {
if (std::isnan(__a))
__a = std::copysign(0, __a);
if (std::isnan(__b))
__b = std::copysign(0, __b);
if (std::isnan(__c))
__c = std::copysign(0, __c);
if (std::isnan(__d))
__d = std::copysign(0, __d);
__recalc = 1;
}
if (__recalc) {
// Can't use std::numeric_limits<double>::infinity() -- that doesn't have
// a device overload (and isn't constexpr before C++11, naturally).
__real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
__imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
}
}
return z;
}
extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
float __c, float __d) {
float __ac = __a * __c;
float __bd = __b * __d;
float __ad = __a * __d;
float __bc = __b * __c;
float _Complex z;
__real__(z) = __ac - __bd;
__imag__(z) = __ad + __bc;
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
int __recalc = 0;
if (std::isinf(__a) || std::isinf(__b)) {
__a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
__b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
if (std::isnan(__c))
__c = std::copysign(0, __c);
if (std::isnan(__d))
__d = std::copysign(0, __d);
__recalc = 1;
}
if (std::isinf(__c) || std::isinf(__d)) {
__c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
__d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
if (std::isnan(__a))
__a = std::copysign(0, __a);
if (std::isnan(__b))
__b = std::copysign(0, __b);
__recalc = 1;
}
if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
std::isinf(__ad) || std::isinf(__bc))) {
if (std::isnan(__a))
__a = std::copysign(0, __a);
if (std::isnan(__b))
__b = std::copysign(0, __b);
if (std::isnan(__c))
__c = std::copysign(0, __c);
if (std::isnan(__d))
__d = std::copysign(0, __d);
__recalc = 1;
}
if (__recalc) {
__real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
__imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
}
}
return z;
}
extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
double __c, double __d) {
int __ilogbw = 0;
// Can't use std::max, because that's defined in <algorithm>, and we don't
// want to pull that in for every compile. The CUDA headers define
// ::max(float, float) and ::max(double, double), which is sufficient for us.
double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
if (std::isfinite(__logbw)) {
__ilogbw = (int)__logbw;
__c = std::scalbn(__c, -__ilogbw);
__d = std::scalbn(__d, -__ilogbw);
}
double __denom = __c * __c + __d * __d;
double _Complex z;
__real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
__imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
__real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
__imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
} else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
std::isfinite(__d)) {
__a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
__b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
__real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
__imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
} else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
std::isfinite(__b)) {
__c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
__d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
__real__(z) = 0.0 * (__a * __c + __b * __d);
__imag__(z) = 0.0 * (__b * __c - __a * __d);
}
}
return z;
}
extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
float __c, float __d) {
int __ilogbw = 0;
float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
if (std::isfinite(__logbw)) {
__ilogbw = (int)__logbw;
__c = std::scalbn(__c, -__ilogbw);
__d = std::scalbn(__d, -__ilogbw);
}
float __denom = __c * __c + __d * __d;
float _Complex z;
__real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
__imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
__real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
__imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
} else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
std::isfinite(__d)) {
__a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
__b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
__real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
__imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
} else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
std::isfinite(__b)) {
__c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
__d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
__real__(z) = 0 * (__a * __c + __b * __d);
__imag__(z) = 0 * (__b * __c - __a * __d);
}
}
return z;
}
#endif // __CLANG_CUDA_COMPLEX_BUILTINS
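
A hedged illustration of why device-side definitions of these symbols are needed: when clang compiles a full-semantics C99 complex multiply it may emit a call to __muldc3 (and __divdc3 for division), so the symbol must also resolve in device code. The host-side sketch below uses the same _Complex/__real__/__imag__ extensions as the code above; whether a given clang version calls __muldc3 or inlines the NaN handling is an assumption, not guaranteed.

// Sketch only: the multiply below carries full C99 complex semantics and may
// be lowered by clang to a __muldc3 call, the helper defined above for CUDA.
#include <cstdio>

int main() {
  double _Complex a, b;
  __real__(a) = 1.0; __imag__(a) = 2.0;
  __real__(b) = 3.0; __imag__(b) = 4.0;
  double _Complex c = a * b; // (1+2i)*(3+4i) = -5+10i
  std::printf("%f %f\n", __real__(c), __imag__(c));
  return 0;
}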

c_headers/__clang_cuda_intrinsics.h

@@ -0,0 +1,322 @@
/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
// sm_30 intrinsics: __shfl_{up,down,xor}.
#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \
inline __device__ int __FnName(int __val, int __offset, \
int __width = warpSize) { \
return __IntIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(float __val, int __offset, \
int __width = warpSize) { \
return __FloatIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __val, int __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(long long __val, int __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
_Static_assert(sizeof(__val) == sizeof(__Bits)); \
_Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
__Bits __tmp; \
memcpy(&__val, &__tmp, sizeof(__val)); \
__tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
__tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
long long __ret; \
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \
} \
inline __device__ unsigned long long __FnName( \
unsigned long long __val, int __offset, int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \
static_cast<unsigned long long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(double __val, int __offset, \
int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp = ::__FnName(__tmp, __offset, __width); \
double __ret; \
memcpy(&__ret, &__tmp, sizeof(__ret)); \
return __ret; \
}
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
#pragma pop_macro("__MAKE_SHUFFLES")
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
return __nvvm_ldg_uc(ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
inline __device__ char2 __ldg(const char2 *ptr) {
typedef char c2 __attribute__((ext_vector_type(2)));
// We can assume that ptr is aligned at least to char2's alignment, but the
// load will assume that ptr is aligned to c2's alignment. This is only
// safe if alignof(c2) <= alignof(char2).
c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
char2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
typedef char c4 __attribute__((ext_vector_type(4)));
c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
char4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
typedef short s2 __attribute__((ext_vector_type(2)));
s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
short2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
typedef short s4 __attribute__((ext_vector_type(4)));
s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
short4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
typedef int i2 __attribute__((ext_vector_type(2)));
i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
int2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
typedef int i4 __attribute__((ext_vector_type(4)));
i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
int4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
typedef long long ll2 __attribute__((ext_vector_type(2)));
ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
longlong2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uchar2 __ldg(const uchar2 *ptr) {
typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
uchar2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
uchar4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
typedef unsigned short us2 __attribute__((ext_vector_type(2)));
us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
ushort2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
typedef unsigned short us4 __attribute__((ext_vector_type(4)));
us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
ushort4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
uint2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
uint4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
ulonglong2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ float2 __ldg(const float2 *ptr) {
typedef float f2 __attribute__((ext_vector_type(2)));
f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
float2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
typedef float f4 __attribute__((ext_vector_type(4)));
f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
float4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
typedef double d2 __attribute__((ext_vector_type(2)));
d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
double2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.l.wrap.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.l.clamp.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.r.wrap.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned ret;
asm("shf.r.clamp.b32 %0, %1, %2, %3;"
: "=r"(ret)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return ret;
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
#endif // defined(__CLANG_CUDA_INTRINSICS_H__)
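
To show how the __shfl_down wrapper above is typically used, here is a hedged CUDA sketch: a hypothetical warp_sum kernel (not part of this commit) that reduces one value per lane across a single 32-lane warp. It assumes a launch of exactly one warp and at least warpSize input elements.

// Sketch only: classic shuffle-down tree reduction within one warp.
// Launch as warp_sum<<<1, 32>>>(in, out).
__global__ void warp_sum(const int *in, int *out) {
  int v = in[threadIdx.x];
  // Each step folds the value held by the lane `offset` positions above onto
  // the current lane, halving the number of partial sums per step.
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    v += __shfl_down(v, offset);
  if (threadIdx.x == 0)
    *out = v; // lane 0 holds the warp-wide sum
}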

c_headers/__clang_cuda_math_forward_declares.h

@@ -0,0 +1,286 @@
/*===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
// This file forward-declares some math functions we (or the CUDA headers)
// will define later. We need to do this, and do it before cmath is included,
// because the standard library may have constexpr math functions. In the
// absence of a prior __device__ decl, those constexpr functions may become
// implicitly host+device. host+device functions can't be overloaded, so that
// would preclude the use of our own __device__ overloads for these functions.
#pragma push_macro("__DEVICE__")
#define __DEVICE__ \
static __inline__ __attribute__((always_inline)) __attribute__((device))
__DEVICE__ double abs(double);
__DEVICE__ float abs(float);
__DEVICE__ int abs(int);
__DEVICE__ long abs(long);
__DEVICE__ long long abs(long long);
__DEVICE__ double acos(double);
__DEVICE__ float acos(float);
__DEVICE__ double acosh(double);
__DEVICE__ float acosh(float);
__DEVICE__ double asin(double);
__DEVICE__ float asin(float);
__DEVICE__ double asinh(double);
__DEVICE__ float asinh(float);
__DEVICE__ double atan2(double, double);
__DEVICE__ float atan2(float, float);
__DEVICE__ double atan(double);
__DEVICE__ float atan(float);
__DEVICE__ double atanh(double);
__DEVICE__ float atanh(float);
__DEVICE__ double cbrt(double);
__DEVICE__ float cbrt(float);
__DEVICE__ double ceil(double);
__DEVICE__ float ceil(float);
__DEVICE__ double copysign(double, double);
__DEVICE__ float copysign(float, float);
__DEVICE__ double cos(double);
__DEVICE__ float cos(float);
__DEVICE__ double cosh(double);
__DEVICE__ float cosh(float);
__DEVICE__ double erfc(double);
__DEVICE__ float erfc(float);
__DEVICE__ double erf(double);
__DEVICE__ float erf(float);
__DEVICE__ double exp2(double);
__DEVICE__ float exp2(float);
__DEVICE__ double exp(double);
__DEVICE__ float exp(float);
__DEVICE__ double expm1(double);
__DEVICE__ float expm1(float);
__DEVICE__ double fabs(double);
__DEVICE__ float fabs(float);
__DEVICE__ double fdim(double, double);
__DEVICE__ float fdim(float, float);
__DEVICE__ double floor(double);
__DEVICE__ float floor(float);
__DEVICE__ double fma(double, double, double);
__DEVICE__ float fma(float, float, float);
__DEVICE__ double fmax(double, double);
__DEVICE__ float fmax(float, float);
__DEVICE__ double fmin(double, double);
__DEVICE__ float fmin(float, float);
__DEVICE__ double fmod(double, double);
__DEVICE__ float fmod(float, float);
__DEVICE__ int fpclassify(double);
__DEVICE__ int fpclassify(float);
__DEVICE__ double frexp(double, int *);
__DEVICE__ float frexp(float, int *);
__DEVICE__ double hypot(double, double);
__DEVICE__ float hypot(float, float);
__DEVICE__ int ilogb(double);
__DEVICE__ int ilogb(float);
__DEVICE__ bool isfinite(double);
__DEVICE__ bool isfinite(float);
__DEVICE__ bool isgreater(double, double);
__DEVICE__ bool isgreaterequal(double, double);
__DEVICE__ bool isgreaterequal(float, float);
__DEVICE__ bool isgreater(float, float);
__DEVICE__ bool isinf(double);
__DEVICE__ bool isinf(float);
__DEVICE__ bool isless(double, double);
__DEVICE__ bool islessequal(double, double);
__DEVICE__ bool islessequal(float, float);
__DEVICE__ bool isless(float, float);
__DEVICE__ bool islessgreater(double, double);
__DEVICE__ bool islessgreater(float, float);
__DEVICE__ bool isnan(double);
__DEVICE__ bool isnan(float);
__DEVICE__ bool isnormal(double);
__DEVICE__ bool isnormal(float);
__DEVICE__ bool isunordered(double, double);
__DEVICE__ bool isunordered(float, float);
__DEVICE__ long labs(long);
__DEVICE__ double ldexp(double, int);
__DEVICE__ float ldexp(float, int);
__DEVICE__ double lgamma(double);
__DEVICE__ float lgamma(float);
__DEVICE__ long long llabs(long long);
__DEVICE__ long long llrint(double);
__DEVICE__ long long llrint(float);
__DEVICE__ double log10(double);
__DEVICE__ float log10(float);
__DEVICE__ double log1p(double);
__DEVICE__ float log1p(float);
__DEVICE__ double log2(double);
__DEVICE__ float log2(float);
__DEVICE__ double logb(double);
__DEVICE__ float logb(float);
__DEVICE__ double log(double);
__DEVICE__ float log(float);
__DEVICE__ long lrint(double);
__DEVICE__ long lrint(float);
__DEVICE__ long lround(double);
__DEVICE__ long lround(float);
__DEVICE__ long long llround(float); // No llround(double).
__DEVICE__ double modf(double, double *);
__DEVICE__ float modf(float, float *);
__DEVICE__ double nan(const char *);
__DEVICE__ float nanf(const char *);
__DEVICE__ double nearbyint(double);
__DEVICE__ float nearbyint(float);
__DEVICE__ double nextafter(double, double);
__DEVICE__ float nextafter(float, float);
__DEVICE__ double nexttoward(double, double);
__DEVICE__ float nexttoward(float, double);
__DEVICE__ float nexttowardf(float, double);
__DEVICE__ double pow(double, double);
__DEVICE__ double pow(double, int);
__DEVICE__ float pow(float, float);
__DEVICE__ float pow(float, int);
__DEVICE__ double remainder(double, double);
__DEVICE__ float remainder(float, float);
__DEVICE__ double remquo(double, double, int *);
__DEVICE__ float remquo(float, float, int *);
__DEVICE__ double rint(double);
__DEVICE__ float rint(float);
__DEVICE__ double round(double);
__DEVICE__ float round(float);
__DEVICE__ double scalbln(double, long);
__DEVICE__ float scalbln(float, long);
__DEVICE__ double scalbn(double, int);
__DEVICE__ float scalbn(float, int);
__DEVICE__ bool signbit(double);
__DEVICE__ bool signbit(float);
__DEVICE__ double sin(double);
__DEVICE__ float sin(float);
__DEVICE__ double sinh(double);
__DEVICE__ float sinh(float);
__DEVICE__ double sqrt(double);
__DEVICE__ float sqrt(float);
__DEVICE__ double tan(double);
__DEVICE__ float tan(float);
__DEVICE__ double tanh(double);
__DEVICE__ float tanh(float);
__DEVICE__ double tgamma(double);
__DEVICE__ float tgamma(float);
__DEVICE__ double trunc(double);
__DEVICE__ float trunc(float);
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
using ::abs;
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isinf;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnan;
using ::isnormal;
using ::isunordered;
using ::labs;
using ::ldexp;
using ::lgamma;
using ::llabs;
using ::llrint;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::llround;
using ::modf;
using ::nan;
using ::nanf;
using ::nearbyint;
using ::nextafter;
using ::nexttoward;
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#pragma pop_macro("__DEVICE__")
#endif
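(Illustrative aside, not part of the vendored header: the forward declarations and using-declarations above are what let device code reach these math overloads through namespace std. A minimal sketch, assuming clang CUDA compilation; the function name is hypothetical.)
// Sketch only: relies on the __device__ overloads forward-declared above.
#include <cmath>
__device__ float device_norm(float x, float y) {
  float r = std::sqrt(x * x + y * y); // resolves to __DEVICE__ float sqrt(float)
  return std::isnan(r) ? 0.0f : r;    // resolves to __DEVICE__ bool isnan(float)
}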

View File

@ -0,0 +1,347 @@
/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
/*
* WARNING: This header is intended to be directly -include'd by
* the compiler and is not supposed to be included by users.
*
* CUDA headers are implemented in a way that currently makes it
* impossible for user code to #include directly when compiling with
* Clang. They present a different view of CUDA-supplied functions
* depending on where in NVCC's compilation pipeline the headers are
* included. Neither of these modes provides function definitions with
* correct attributes, so we use the preprocessor to force the headers
* into a form that Clang can use.
*
* Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
* this file during every CUDA compilation.
*/
#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
#define __CLANG_CUDA_RUNTIME_WRAPPER_H__
#if defined(__CUDA__) && defined(__clang__)
// Include some forward declares that must come before cmath.
#include <__clang_cuda_math_forward_declares.h>
// Include some standard headers to avoid CUDA headers including them
// while some required macros (like __THROW) are in a weird state.
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
// Preserve common macros that will be changed below by us or by CUDA
// headers.
#pragma push_macro("__THROW")
#pragma push_macro("__CUDA_ARCH__")
// WARNING: Preprocessor hacks below are based on specific details of
// CUDA-7.x headers and are not expected to work with any other
// version of CUDA headers.
#include "cuda.h"
#if !defined(CUDA_VERSION)
#error "cuda.h did not define CUDA_VERSION"
#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000
#error "Unsupported CUDA version!"
#endif
// Make largest subset of device functions available during host
// compilation -- SM_35 for the time being.
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 350
#endif
#include "__clang_cuda_builtin_vars.h"
// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
// has taken care of builtin variables declared in the file.
#define __DEVICE_LAUNCH_PARAMETERS_H__
// {math,device}_functions.h only have declarations of the
// functions. We don't need them as we're going to pull in their
// definitions from .hpp files.
#define __DEVICE_FUNCTIONS_H__
#define __MATH_FUNCTIONS_H__
#define __COMMON_FUNCTIONS_H__
#undef __CUDACC__
#define __CUDABE__
// Disables definitions of device-side runtime support stubs in
// cuda_device_runtime_api.h
#include "driver_types.h"
#include "host_config.h"
#include "host_defines.h"
#undef __CUDABE__
#define __CUDACC__
#include "cuda_runtime.h"
#undef __CUDACC__
#define __CUDABE__
// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
// not have at the moment. Emulate them with a builtin memcpy/memset.
#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
#include "crt/device_runtime.h"
#include "crt/host_runtime.h"
// device_runtime.h defines __cxa_* macros that will conflict with
// cxxabi.h.
// FIXME: redefine these as __device__ functions.
#undef __cxa_vec_ctor
#undef __cxa_vec_cctor
#undef __cxa_vec_dtor
#undef __cxa_vec_new
#undef __cxa_vec_new2
#undef __cxa_vec_new3
#undef __cxa_vec_delete2
#undef __cxa_vec_delete
#undef __cxa_vec_delete3
#undef __cxa_pure_virtual
// math_functions.hpp expects this host function to be defined on MacOS, but it
// ends up not being there because of the games we play here. Just define it
// ourselves; it's simple enough.
#ifdef __APPLE__
inline __host__ double __signbitd(double x) {
return std::signbit(x);
}
#endif
// We need decls for functions in CUDA's libdevice with __device__
// attribute only. Alas they come either as __host__ __device__ or
// with no attributes at all. To work around that, define __CUDACC_RTC__,
// which produces the HD variant, and undef __host__, which gives us the
// desired decls with the __device__ attribute.
#pragma push_macro("__host__")
#define __host__
#define __CUDACC_RTC__
#include "device_functions_decls.h"
#undef __CUDACC_RTC__
// Temporarily poison __host__ macro to ensure it's not used by any of
// the headers we're about to include.
#define __host__ UNEXPECTED_HOST_ATTRIBUTE
// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
// Previous versions used to check whether they are defined or not.
// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
// here to detect the switch.
#if defined(CU_DEVICE_INVALID)
#if !defined(__USE_FAST_MATH__)
#define __USE_FAST_MATH__ 0
#endif
#if !defined(__CUDA_PREC_DIV)
#define __CUDA_PREC_DIV 0
#endif
#endif
// device_functions.hpp and math_functions*.hpp use 'static
// __forceinline__' (with no __device__) for definitions of device
// functions. Temporarily redefine __forceinline__ to include
// __device__.
#pragma push_macro("__forceinline__")
#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
#include "device_functions.hpp"
// math_functions.hpp uses the __USE_FAST_MATH__ macro to determine whether we
// get the slow-but-accurate or fast-but-inaccurate versions of functions like
// sin and exp. This is controlled in clang by -fcuda-approx-transcendentals.
//
// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
// slow divides), so we need to scope our define carefully here.
#pragma push_macro("__USE_FAST_MATH__")
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
#define __USE_FAST_MATH__ 1
#endif
#include "math_functions.hpp"
#pragma pop_macro("__USE_FAST_MATH__")
#include "math_functions_dbl_ptx3.hpp"
#pragma pop_macro("__forceinline__")
// Pull in host-only functions that are only available when neither
// __CUDACC__ nor __CUDABE__ is defined.
#undef __MATH_FUNCTIONS_HPP__
#undef __CUDABE__
#include "math_functions.hpp"
// Alas, additional overloads for these functions are hard to get to.
// Considering that we only need these overloads for a few functions,
// we can provide them here.
static inline float rsqrt(float __a) { return rsqrtf(__a); }
static inline float rcbrt(float __a) { return rcbrtf(__a); }
static inline float sinpi(float __a) { return sinpif(__a); }
static inline float cospi(float __a) { return cospif(__a); }
static inline void sincospi(float __a, float *__b, float *__c) {
return sincospif(__a, __b, __c);
}
static inline float erfcinv(float __a) { return erfcinvf(__a); }
static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
static inline float normcdf(float __a) { return normcdff(__a); }
static inline float erfcx(float __a) { return erfcxf(__a); }
// For some reason the single-argument variant is not always declared by
// the CUDA headers. Alas, device_functions.hpp, included below, needs it.
static inline __device__ void __brkpt(int __c) { __brkpt(); }
// Now include *.hpp with definitions of various GPU functions. Alas,
// a lot of things get declared/defined with the __host__ attribute, which
// we don't want, so we have to define it out. We also have to include
// {device,math}_functions.hpp again in order to extract the other
// branch of #if/else inside.
#define __host__
#undef __CUDABE__
#define __CUDACC__
#undef __DEVICE_FUNCTIONS_HPP__
#include "device_atomic_functions.hpp"
#include "device_functions.hpp"
#include "sm_20_atomic_functions.hpp"
#include "sm_20_intrinsics.hpp"
#include "sm_32_atomic_functions.hpp"
// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
// define them using builtins so that the optimizer can reason about and across
// these instructions. In particular, using intrinsics for ldg gets us the
// [addr+imm] addressing mode, which, although it doesn't actually exist in the
// hardware, seems to generate faster machine code because ptxas can more easily
// reason about our code.
#if CUDA_VERSION >= 8000
#include "sm_60_atomic_functions.hpp"
#include "sm_61_intrinsics.hpp"
#endif
#undef __MATH_FUNCTIONS_HPP__
// math_functions.hpp defines ::signbit as a __host__ __device__ function. This
// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
// math_functions.hpp's ::signbit. It's guarded by #undef signbit, but that's
// conditional on __GNUC__. :)
#pragma push_macro("signbit")
#pragma push_macro("__GNUC__")
#undef __GNUC__
#define signbit __ignored_cuda_signbit
#include "math_functions.hpp"
#pragma pop_macro("__GNUC__")
#pragma pop_macro("signbit")
#pragma pop_macro("__host__")
#include "texture_indirect_functions.h"
// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop_macro("__THROW")
// Set up compiler macros expected to be seen during compilation.
#undef __CUDABE__
#define __CUDACC__
extern "C" {
// Device-side CUDA system calls.
// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
// We need these declarations and wrappers for device-side
// malloc/free/printf calls to work without relying on
// -fcuda-disable-target-call-checks option.
__device__ int vprintf(const char *, const char *);
__device__ void free(void *) __attribute((nothrow));
__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
__device__ void __assertfail(const char *__message, const char *__file,
unsigned __line, const char *__function,
size_t __charSize) __attribute__((noreturn));
// In order for the standard assert() macro on Linux to work, we need to
// provide a device-side __assert_fail().
__device__ static inline void __assert_fail(const char *__message,
const char *__file, unsigned __line,
const char *__function) {
__assertfail(__message, __file, __line, __function, sizeof(char));
}
// Clang will convert printf into vprintf, but we still need
// device-side declaration for it.
__device__ int printf(const char *, ...);
} // extern "C"
// We also need device-side std::malloc and std::free.
namespace std {
__device__ static inline void free(void *__ptr) { ::free(__ptr); }
__device__ static inline void *malloc(size_t __size) {
return ::malloc(__size);
}
} // namespace std
// Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to
// come after we've pulled in the definition of uint3 and dim3.
__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
uint3 ret;
ret.x = x;
ret.y = y;
ret.z = z;
return ret;
}
__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
uint3 ret;
ret.x = x;
ret.y = y;
ret.z = z;
return ret;
}
__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
return dim3(x, y, z);
}
__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
return dim3(x, y, z);
}
#include <__clang_cuda_cmath.h>
#include <__clang_cuda_intrinsics.h>
#include <__clang_cuda_complex_builtins.h>
// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
// mode, giving them their "proper" types of dim3 and uint3. This is
// incompatible with the types we give in __clang_cuda_builtin_vars.h. As a
// hack, force-include the header (nvcc doesn't include it by default) but
// redefine dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are
// only used here for the redeclarations of blockDim and threadIdx.)
#pragma push_macro("dim3")
#pragma push_macro("uint3")
#define dim3 __cuda_builtin_blockDim_t
#define uint3 __cuda_builtin_threadIdx_t
#include "curand_mtgp32_kernel.h"
#pragma pop_macro("dim3")
#pragma pop_macro("uint3")
#pragma pop_macro("__USE_FAST_MATH__")
#endif // __CUDA__
#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
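(Illustrative aside, not part of the wrapper: the extern "C" declarations above are what make device-side printf and assert usable. A minimal sketch, assuming this wrapper has been -include'd by clang during CUDA compilation; the kernel name is hypothetical.)
// Sketch only: exercises the device-side printf and __assert_fail declared above.
#include <cassert>
__global__ void check_positive(const float *data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    assert(data[i] >= 0.0f);           // expands to the device-side __assert_fail
    if (data[i] == 0.0f)
      printf("zero at index %d\n", i); // lowered to vprintf by clang
  }
}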

View File

@ -25,48 +25,127 @@
#include <emmintrin.h>
#if !defined (__AES__)
# error "AES instructions not enabled"
#else
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
/// \brief Performs a single round of AES encryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenc_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesenc128(__V, __R);
return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
}
/// \brief Performs the final round of AES encryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenclast_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesenclast128(__V, __R);
return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
}
/// \brief Performs a single round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdec_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesdec128(__V, __R);
return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
}
/// \brief Performs the final round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdeclast_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesdeclast128(__V, __R);
return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
}
/// \brief Applies the AES InvMixColumns() transformation to an expanded key
/// contained in the source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the expanded key.
/// \returns A 128-bit integer vector containing the transformed value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesimc_si128(__m128i __V)
{
return (__m128i)__builtin_ia32_aesimc128(__V);
return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
}
/// \brief Generates a round key for AES encryption, operating on 128-bit data
/// specified in the first source operand and using an 8-bit round constant
/// specified by the second source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
/// \endcode
///
/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
///
/// \param C
/// A 128-bit integer vector that is used to generate the AES encryption key.
/// \param R
/// An 8-bit round constant used to generate the AES encryption key.
/// \returns A 128-bit round key for AES encryption.
#define _mm_aeskeygenassist_si128(C, R) \
__builtin_ia32_aeskeygenassist128((C), (R))
(__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
#undef __DEFAULT_FN_ATTRS
#endif
#endif /* _WMMINTRIN_AES_H */
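(Illustrative aside, not part of the header: one AES-128 block encryption built from the intrinsics documented above. A minimal sketch, assuming the eleven round keys were expanded elsewhere and that AES support is enabled, e.g. with -maes.)
/* Sketch only: encrypt one 128-bit block with pre-expanded AES-128 round keys. */
#include <wmmintrin.h>
static __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11])
{
  block = _mm_xor_si128(block, rk[0]);        /* initial AddRoundKey */
  for (int i = 1; i < 10; ++i)
    block = _mm_aesenc_si128(block, rk[i]);   /* rounds 1..9 */
  return _mm_aesenclast_si128(block, rk[10]); /* final round, no MixColumns */
}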

View File

@ -1,4 +1,4 @@
/*===---- __wmmintrin_pclmul.h - AES intrinsics ----------------------------===
/*===---- __wmmintrin_pclmul.h - PCLMUL intrinsics --------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@ -23,12 +23,35 @@
#ifndef _WMMINTRIN_PCLMUL_H
#define _WMMINTRIN_PCLMUL_H
#if !defined (__PCLMUL__)
# error "PCLMUL instruction is not enabled"
#else
/// \brief Multiplies two 64-bit integer values, which are selected from source
/// operands using the immediate-value operand. The multiplication is a
/// carry-less multiplication, and the 128-bit integer product is stored in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
///
/// \param __X
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param __Y
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param __I
/// An immediate value specifying which 64-bit values to select from the
/// operands. Bit 0 is used to select a value from operand \a __X, and bit
/// 4 is used to select a value from operand \a __Y: \n
/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
/// multiplication of the selected 64-bit values.
#define _mm_clmulepi64_si128(__X, __Y, __I) \
((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
(__v2di)(__m128i)(__Y), (char)(__I)))
#endif
#endif /* _WMMINTRIN_PCLMUL_H */
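(Illustrative aside, not part of the header: bit 0 and bit 4 of the immediate select the 64-bit halves of each operand, so the four partial products of two vectors can be formed as below. A minimal sketch, assuming PCLMUL support, e.g. -mpclmul.)
/* Sketch only: all four 64x64 -> 128-bit carry-less products of a and b. */
#include <wmmintrin.h>
static void clmul_all(__m128i a, __m128i b, __m128i out[4])
{
  out[0] = _mm_clmulepi64_si128(a, b, 0x00); /* a[63:0]   * b[63:0]   */
  out[1] = _mm_clmulepi64_si128(a, b, 0x01); /* a[127:64] * b[63:0]   */
  out[2] = _mm_clmulepi64_si128(a, b, 0x10); /* a[63:0]   * b[127:64] */
  out[3] = _mm_clmulepi64_si128(a, b, 0x11); /* a[127:64] * b[127:64] */
}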

View File

@ -32,8 +32,7 @@
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
/* Intrinsics that are available only if __ADX__ defined */
#ifdef __ADX__
static __inline unsigned char __DEFAULT_FN_ATTRS
static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
unsigned int *__p)
{
@ -41,14 +40,13 @@ _addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
}
#ifdef __x86_64__
static __inline unsigned char __DEFAULT_FN_ATTRS
static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
_addcarryx_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p)
{
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}
#endif
#endif
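(Illustrative aside, not part of the header: _addcarryx_u32/_addcarryx_u64 thread the carry flag through multi-word additions. A minimal sketch, assuming x86-64 with ADX enabled, e.g. -madx; the function name is hypothetical.)
/* Sketch only: 128-bit addition from two 64-bit limbs using the ADX carry chain. */
#include <immintrin.h>
static void add128(unsigned long long a[2], const unsigned long long b[2])
{
  unsigned long long lo, hi;
  unsigned char carry = _addcarryx_u64(0, a[0], b[0], &lo);
  (void)_addcarryx_u64(carry, a[1], b[1], &hi);
  a[0] = lo;
  a[1] = hi;
}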
/* Intrinsics that are also available if __ADX__ undefined */
static __inline unsigned char __DEFAULT_FN_ATTRS

16733
c_headers/altivec.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -24,27 +24,21 @@
#ifndef __AMMINTRIN_H
#define __AMMINTRIN_H
#ifndef __SSE4A__
#error "SSE4A instruction set not enabled"
#else
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index idx and of the length len.
/// integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
/// \endcode
///
/// \code
/// This intrinsic corresponds to the \c EXTRQ instruction.
/// \endcode
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
/// The value from which bits are extracted.
@ -52,11 +46,11 @@
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than
/// 64, the result is undefined. If the length and index are both zero,
/// bits [63:0] of parameter x are extracted. If the length is zero
/// but the index is non-zero, the result is undefined.
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a x are extracted. If the length is zero but the
/// index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
/// extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
@ -64,25 +58,23 @@
(char)(len), (char)(idx)))
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index and of the length specified by __y.
/// integer vector operand at the index and of the length specified by
/// \a __y.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// This intrinsic corresponds to the \c EXTRQ instruction.
/// \endcode
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param __x
/// The value from which bits are extracted.
/// \param __y
/// Specifies the index of the least significant bit at [13:8]
/// and the length at [5:0]; all other bits are ignored.
/// If bits [5:0] are zero, the length is interpreted as 64.
/// If the sum of the index and length is greater than 64, the result is
/// undefined. If the length and index are both zero, bits [63:0] of
/// parameter __x are extracted. If the length is zero but the index is
/// non-zero, the result is undefined.
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
/// Specifies the index of the least significant bit at [13:8] and the
/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
/// length is interpreted as 64. If the sum of the index and length is
/// greater than 64, the result is undefined. If the length and index are
/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
/// from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_extract_si64(__m128i __x, __m128i __y)
@ -90,97 +82,88 @@ _mm_extract_si64(__m128i __x, __m128i __y)
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}
/// \brief Inserts bits of a specified length from the source integer vector
/// y into the lower 64 bits of the destination integer vector x at the
/// index idx and of the length len.
/// \brief Inserts bits of a specified length from the source integer vector
/// \a y into the lower 64 bits of the destination integer vector \a x at
/// the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
/// const int idx);
/// \endcode
/// \endcode
///
/// \code
/// This intrinsic corresponds to the \c INSERTQ instruction.
/// \endcode
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length len and by the index idx specifying the least
/// significant bit.
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length \a len and by the index \a idx specifying the
/// least significant bit.
/// \param y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand y of length len.
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a y of length \a len.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than
/// 64, the result is undefined. If the length and index are both zero,
/// bits [63:0] of parameter y are inserted into parameter x. If the
/// length is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits
/// of destination operand x with the specified bitfields replaced by the
/// lower bits of source operand y. The upper 64 bits of the return value
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a x with the specified bitfields replaced by the
/// lower bits of source operand \a y. The upper 64 bits of the return value
/// are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
(__v2di)(__m128i)(y), \
(char)(len), (char)(idx)))
/// \brief Inserts bits of a specified length from the source integer vector
/// __y into the lower 64 bits of the destination integer vector __x at
/// the index and of the length specified by __y.
/// \brief Inserts bits of a specified length from the source integer vector
/// \a __y into the lower 64 bits of the destination integer vector \a __x
/// at the index and of the length specified by \a __y.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// This intrinsic corresponds to the \c INSERTQ instruction.
/// \endcode
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param __x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length and by the index of the least significant bit
/// specified by operand __y.
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length and by the index of the least significant bit
/// specified by operand \a __y.
/// \param __y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand __y with length specified
/// by bits [69:64]. These are inserted into the destination at the index
/// specified by bits [77:72]; all other bits are ignored.
/// If bits [69:64] are zero, the length is interpreted as 64.
/// If the sum of the index and length is greater than 64, the result is
/// undefined. If the length and index are both zero, bits [63:0] of
/// parameter __y are inserted into parameter __x. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits
/// of destination operand __x with the specified bitfields replaced by the
/// lower bits of source operand __y. The upper 64 bits of the return value
/// are undefined.
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a __y with length
/// specified by bits [69:64]. These are inserted into the destination at the
/// index specified by bits [77:72]; all other bits are ignored. If bits
/// [69:64] are zero, the length is interpreted as 64. If the sum of the
/// index and length is greater than 64, the result is undefined. If the
/// length and index are both zero, bits [63:0] of parameter \a __y are
/// inserted into parameter \a __x. If the length is zero but the index is
/// non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a __x with the specified bitfields replaced by the
/// lower bits of source operand \a __y. The upper 64 bits of the return
/// value are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x, __m128i __y)
{
return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
}
/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// \headerfile <x86intrin.h>
///
/// \code
/// This intrinsic corresponds to the \c MOVNTSD instruction.
/// \endcode
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
///
/// \param __p
/// The 64-bit memory location used to store the register value.
/// \param __a
/// The 64-bit double-precision floating-point register value to
/// be stored.
/// The 64-bit double-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_sd(double *__p, __m128d __a)
{
@ -193,15 +176,12 @@ _mm_stream_sd(double *__p, __m128d __a)
///
/// \headerfile <x86intrin.h>
///
/// \code
/// This intrinsic corresponds to the \c MOVNTSS instruction.
/// \endcode
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
///
/// \param __p
/// The 32-bit memory location used to store the register value.
/// \param __a
/// The 32-bit single-precision floating-point register value to
/// be stored.
/// The 32-bit single-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ss(float *__p, __m128 __a)
{
@ -210,6 +190,4 @@ _mm_stream_ss(float *__p, __m128 __a)
#undef __DEFAULT_FN_ATTRS
#endif /* __SSE4A__ */
#endif /* __AMMINTRIN_H */
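(Illustrative aside, not part of the header: a concrete use of the SSE4A bit-field forms documented above, extracting an 8-bit field at bit 4 and re-inserting it at bit 16. A minimal sketch, assuming SSE4A support, e.g. -msse4a; the values are arbitrary.)
/* Sketch only: bit-field extract/insert on the low 64 bits of a vector. */
#include <ammintrin.h>
static __m128i bitfield_demo(__m128i v)
{
  __m128i field = _mm_extracti_si64(v, 8, 4); /* len=8 bits starting at idx=4 */
  return _mm_inserti_si64(v, field, 8, 16);   /* put those 8 bits back at bit 16 */
}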

View File

@ -72,9 +72,11 @@ static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(v
/* 8.5 Swap */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__swp(uint32_t x, volatile uint32_t *p) {
__swp(uint32_t __x, volatile uint32_t *__p) {
uint32_t v;
do v = __builtin_arm_ldrex(p); while (__builtin_arm_strex(x, p));
do
v = __builtin_arm_ldrex(__p);
while (__builtin_arm_strex(__x, __p));
return v;
}
@ -110,109 +112,115 @@ static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(voi
/* 9.2 Miscellaneous data-processing intrinsics */
/* ROR */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__ror(uint32_t x, uint32_t y) {
y %= 32;
if (y == 0) return x;
return (x >> y) | (x << (32 - y));
__ror(uint32_t __x, uint32_t __y) {
__y %= 32;
if (__y == 0)
return __x;
return (__x >> __y) | (__x << (32 - __y));
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rorll(uint64_t x, uint32_t y) {
y %= 64;
if (y == 0) return x;
return (x >> y) | (x << (64 - y));
__rorll(uint64_t __x, uint32_t __y) {
__y %= 64;
if (__y == 0)
return __x;
return (__x >> __y) | (__x << (64 - __y));
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long x, uint32_t y) {
__rorl(unsigned long __x, uint32_t __y) {
#if __SIZEOF_LONG__ == 4
return __ror(x, y);
return __ror(__x, __y);
#else
return __rorll(x, y);
return __rorll(__x, __y);
#endif
}
/* CLZ */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t t) {
return __builtin_clz(t);
__clz(uint32_t __t) {
return __builtin_clz(__t);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long t) {
return __builtin_clzl(t);
__clzl(unsigned long __t) {
return __builtin_clzl(__t);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t t) {
return __builtin_clzll(t);
__clzll(uint64_t __t) {
return __builtin_clzll(__t);
}
/* REV */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t t) {
return __builtin_bswap32(t);
__rev(uint32_t __t) {
return __builtin_bswap32(__t);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long t) {
__revl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __builtin_bswap32(t);
return __builtin_bswap32(__t);
#else
return __builtin_bswap64(t);
return __builtin_bswap64(__t);
#endif
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t t) {
return __builtin_bswap64(t);
__revll(uint64_t __t) {
return __builtin_bswap64(__t);
}
/* REV16 */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t t) {
return __ror(__rev(t), 16);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long t) {
return __rorl(__revl(t), sizeof(long) / 2);
__rev16(uint32_t __t) {
return __ror(__rev(__t), 16);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t t) {
return __rorll(__revll(t), 32);
__rev16ll(uint64_t __t) {
return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16(__t);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __rev16(__t);
#else
return __rev16ll(__t);
#endif
}
/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t t) {
return __builtin_bswap16(t);
__revsh(int16_t __t) {
return __builtin_bswap16(__t);
}
/* RBIT */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t t) {
return __builtin_arm_rbit(t);
__rbit(uint32_t __t) {
return __builtin_arm_rbit(__t);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t t) {
__rbitll(uint64_t __t) {
#if __ARM_32BIT_STATE
return (((uint64_t) __builtin_arm_rbit(t)) << 32) |
__builtin_arm_rbit(t >> 32);
return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
__builtin_arm_rbit(__t >> 32);
#else
return __builtin_arm_rbit64(t);
return __builtin_arm_rbit64(__t);
#endif
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long t) {
__rbitl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __rbit(t);
return __rbit(__t);
#else
return __rbitll(t);
return __rbitll(__t);
#endif
}
@ -231,61 +239,61 @@ static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
/* 9.4.2 Saturating addition and subtraction intrinsics */
#if __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t t, int32_t v) {
return __builtin_arm_qadd(t, v);
__qadd(int32_t __t, int32_t __v) {
return __builtin_arm_qadd(__t, __v);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qsub(int32_t t, int32_t v) {
return __builtin_arm_qsub(t, v);
__qsub(int32_t __t, int32_t __v) {
return __builtin_arm_qsub(__t, __v);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qdbl(int32_t t) {
return __builtin_arm_qadd(t, t);
__qdbl(int32_t __t) {
return __builtin_arm_qadd(__t, __t);
}
#endif
/* 9.7 CRC32 intrinsics */
#if __ARM_FEATURE_CRC32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32b(uint32_t a, uint8_t b) {
return __builtin_arm_crc32b(a, b);
__crc32b(uint32_t __a, uint8_t __b) {
return __builtin_arm_crc32b(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32h(uint32_t a, uint16_t b) {
return __builtin_arm_crc32h(a, b);
__crc32h(uint32_t __a, uint16_t __b) {
return __builtin_arm_crc32h(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32w(uint32_t a, uint32_t b) {
return __builtin_arm_crc32w(a, b);
__crc32w(uint32_t __a, uint32_t __b) {
return __builtin_arm_crc32w(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32d(uint32_t a, uint64_t b) {
return __builtin_arm_crc32d(a, b);
__crc32d(uint32_t __a, uint64_t __b) {
return __builtin_arm_crc32d(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32cb(uint32_t a, uint8_t b) {
return __builtin_arm_crc32cb(a, b);
__crc32cb(uint32_t __a, uint8_t __b) {
return __builtin_arm_crc32cb(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32ch(uint32_t a, uint16_t b) {
return __builtin_arm_crc32ch(a, b);
__crc32ch(uint32_t __a, uint16_t __b) {
return __builtin_arm_crc32ch(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32cw(uint32_t a, uint32_t b) {
return __builtin_arm_crc32cw(a, b);
__crc32cw(uint32_t __a, uint32_t __b) {
return __builtin_arm_crc32cw(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__crc32cd(uint32_t a, uint64_t b) {
return __builtin_arm_crc32cd(a, b);
__crc32cd(uint32_t __a, uint64_t __b) {
return __builtin_arm_crc32cd(__a, __b);
}
#endif
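(Illustrative aside, not part of the header: the CRC32 intrinsics above are normally chained across a buffer. A minimal sketch of a zlib-style CRC-32, assuming a target with __ARM_FEATURE_CRC32; the helper name is hypothetical.)
/* Sketch only: byte-at-a-time CRC-32 using the ACLE intrinsic declared above. */
#include <arm_acle.h>
#include <stddef.h>
#include <stdint.h>
#if __ARM_FEATURE_CRC32
static uint32_t crc32_bytes(uint32_t crc, const uint8_t *p, size_t n)
{
  crc = ~crc;                    /* zlib-style pre-inversion */
  for (size_t i = 0; i < n; ++i)
    crc = __crc32b(crc, p[i]);
  return ~crc;                   /* zlib-style post-inversion */
}
#endif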

69231
c_headers/arm_neon.h Normal file

File diff suppressed because it is too large Load Diff

45
c_headers/armintr.h Normal file
View File

@ -0,0 +1,45 @@
/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
/* Only include this if we're compiling for the windows platform. */
#ifndef _MSC_VER
#include_next <armintr.h>
#else
#ifndef __ARMINTR_H
#define __ARMINTR_H
typedef enum
{
_ARM_BARRIER_SY = 0xF,
_ARM_BARRIER_ST = 0xE,
_ARM_BARRIER_ISH = 0xB,
_ARM_BARRIER_ISHST = 0xA,
_ARM_BARRIER_NSH = 0x7,
_ARM_BARRIER_NSHST = 0x6,
_ARM_BARRIER_OSH = 0x3,
_ARM_BARRIER_OSHST = 0x2
} _ARMINTR_BARRIER_TYPE;
#endif /* __ARMINTR_H */
#endif /* _MSC_VER */
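(Illustrative aside, not part of the header: these enumerators name the barrier domains accepted by the MSVC-style barrier intrinsics. A minimal sketch, assuming clang's MSVC ARM mode where a __dmb intrinsic is available; the function name is hypothetical.)
/* Sketch only: publish data before a flag using an inner-shareable data barrier. */
#ifdef _MSC_VER
#include <armintr.h>
static void publish_flag(volatile int *flag)
{
  __dmb(_ARM_BARRIER_ISH); /* order prior stores before the flag store */
  *flag = 1;
}
#endif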

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,131 +1,144 @@
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512CDINTRIN_H
#define __AVX512CDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) __W,
(__mmask8) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) __W,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) __W,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) __W,
(__mmask8) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) __U);
}
#undef __DEFAULT_FN_ATTRS
#endif
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512CDINTRIN_H
#define __AVX512CDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) __W,
(__mmask8) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) __W,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) __W,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) __W,
(__mmask8) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmb_epi64 (__mmask8 __A)
{
return (__m512i) __builtin_ia32_broadcastmb512 (__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmw_epi32 (__mmask16 __A)
{
return (__m512i) __builtin_ia32_broadcastmw512 (__A);
}
#undef __DEFAULT_FN_ATTRS
#endif
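(Illustrative aside, not part of the header: the usual reason to call _mm512_conflict_epi32 is to detect duplicate gather/scatter indices within a vector. A minimal sketch, assuming AVX-512F and AVX-512CD support, e.g. -mavx512f -mavx512cd.)
/* Sketch only: nonzero if any of the 16 32-bit indices collide with an earlier lane. */
#include <immintrin.h>
static int has_duplicate_indices(__m512i idx)
{
  __m512i conflicts = _mm512_conflict_epi32(idx); /* per-lane mask of earlier equal lanes */
  return _mm512_test_epi32_mask(conflicts, conflicts) != 0;
}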

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
/*===---- avx512fintrin.h - AVX2 intrinsics -----------------------------------===
/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@ -31,66 +31,66 @@
#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (R)); })
(__mmask8)-1, (int)(R)); })
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), \
(__mmask8)(M), (R)); })
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)); })
#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (R)); })
(__mmask8)(M), (int)(R)); })
#define _mm512_exp2a23_pd(A) \
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_exp2a23_pd(S, M, A) \
_mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
_mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_exp2a23_pd(M, A) \
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask8)-1, (R)); })
(__mmask16)-1, (int)(R)); })
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), \
(__mmask8)(M), (R)); })
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)); })
#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask8)(M), (R)); })
(__mmask16)(M), (int)(R)); })
#define _mm512_exp2a23_ps(A) \
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_exp2a23_ps(S, M, A) \
_mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
_mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_exp2a23_ps(M, A) \
_mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
_mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
// rsqrt28
#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (R)); })
(__mmask8)-1, (int)(R)); })
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), \
(__mmask8)(M), (R)); })
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)); })
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (R)); })
(__mmask8)(M), (int)(R)); })
#define _mm512_rsqrt28_pd(A) \
_mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@ -104,17 +104,17 @@
#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (R)); })
(__mmask16)-1, (int)(R)); })
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), \
(__mmask16)(M), (R)); })
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)); })
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (R)); })
(__mmask16)(M), (int)(R)); })
#define _mm512_rsqrt28_ps(A) \
_mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@ -126,22 +126,22 @@
_mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (R)); })
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)); })
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (R)); })
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)); })
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (R)); })
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)); })
#define _mm_rsqrt28_ss(A, B) \
_mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -153,22 +153,22 @@
_mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (R)); })
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)); })
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (R)); })
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)); })
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (R)); })
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)); })
#define _mm_rsqrt28_sd(A, B) \
_mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -177,23 +177,23 @@
_mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rsqrt28_sd(M, A, B) \
_mm_mask_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
_mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
// rcp28
#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (R)); })
(__mmask8)-1, (int)(R)); })
#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), \
(__mmask8)(M), (R)); })
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)); })
#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (R)); })
(__mmask8)(M), (int)(R)); })
#define _mm512_rcp28_pd(A) \
_mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@ -207,17 +207,17 @@
#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (R)); })
(__mmask16)-1, (int)(R)); })
#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), \
(__mmask16)(M), (R)); })
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)); })
#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (R)); })
(__mmask16)(M), (int)(R)); })
#define _mm512_rcp28_ps(A) \
_mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@ -229,22 +229,22 @@
_mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (R)); })
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)); })
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (R)); })
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)); })
#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
(__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (R)); })
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)); })
#define _mm_rcp28_ss(A, B) \
_mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -256,22 +256,22 @@
_mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (R)); })
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)); })
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (R)); })
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)); })
#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
(__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (R)); })
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)); })
#define _mm_rcp28_sd(A, B) \
_mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
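Illustrative usage sketch, not part of the diff above: one way the rsqrt28/rcp28 wrappers defined in this header might be called. The function names are hypothetical, and a target compiled with -mavx512er is assumed.
#include <immintrin.h>
/* Hypothetical example: fast approximate 1/sqrt(x) and 1/x for 16 packed floats
   using the AVX-512ER approximations above (relative error on the order of 2^-28).
   Requires -mavx512er. */
static __m512 approx_rsqrt(__m512 x) { return _mm512_rsqrt28_ps(x); }
static __m512 approx_recip(__m512 x) { return _mm512_rcp28_ps(x); }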

File diff suppressed because it is too large

View File

@ -0,0 +1,92 @@
/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __IFMAINTRIN_H
#define __IFMAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
(__v8di) __Y,
(__v8di) __Z,
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
__m512i __Y)
{
return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
(__v8di) __X,
(__v8di) __Y,
(__mmask8) __M);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
(__v8di) __Y,
(__v8di) __Z,
(__mmask8) __M);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
(__v8di) __Y,
(__v8di) __Z,
(__mmask8) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
__m512i __Y)
{
return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
(__v8di) __X,
(__v8di) __Y,
(__mmask8) __M);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
(__v8di) __Y,
(__v8di) __Z,
(__mmask8) __M);
}
#undef __DEFAULT_FN_ATTRS
#endif
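Illustrative usage sketch, not part of the header above: how the 512-bit IFMA intrinsics it declares might be used. The function and variable names are hypothetical; -mavx512ifma is assumed.
#include <immintrin.h>
/* Hypothetical example: one limb step of a 52-bit multi-precision multiply.
   madd52lo adds the low 52 bits of each 52x52-bit product into the accumulator;
   madd52hi adds the high 52 bits. Requires -mavx512ifma. */
static void madd52_step(__m512i *lo_acc, __m512i *hi_acc, __m512i a, __m512i b)
{
    *lo_acc = _mm512_madd52lo_epu64(*lo_acc, a, b);
    *hi_acc = _mm512_madd52hi_epu64(*hi_acc, a, b);
}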

View File

@ -0,0 +1,149 @@
/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __IFMAVLINTRIN_H
#define __IFMAVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl")))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
(__v2di) __Y,
(__v2di) __Z,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
(__v2di) __X,
(__v2di) __Y,
(__mmask8) __M);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
(__v2di) __Y,
(__v2di) __Z,
(__mmask8) __M);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
(__v4di) __Y,
(__v4di) __Z,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
__m256i __Y)
{
return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
(__v4di) __X,
(__v4di) __Y,
(__mmask8) __M);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
(__v4di) __Y,
(__v4di) __Z,
(__mmask8) __M);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
(__v2di) __Y,
(__v2di) __Z,
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
(__v2di) __X,
(__v2di) __Y,
(__mmask8) __M);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
(__v2di) __Y,
(__v2di) __Z,
(__mmask8) __M);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
(__v4di) __Y,
(__v4di) __Z,
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
__m256i __Y)
{
return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
(__v4di) __X,
(__v4di) __Y,
(__mmask8) __M);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
(__v4di) __Y,
(__v4di) __Z,
(__mmask8) __M);
}
#undef __DEFAULT_FN_ATTRS
#endif
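Illustrative usage sketch, not part of the header above: the same IFMA accumulation on a 256-bit vector via the AVX512VL variants. Names are hypothetical; -mavx512ifma -mavx512vl is assumed.
#include <immintrin.h>
/* Hypothetical example: 256-bit IFMA accumulation (four 52-bit product lanes).
   Requires -mavx512ifma -mavx512vl. */
static __m256i madd52lo_256(__m256i acc, __m256i a, __m256i b)
{
    return _mm256_madd52lo_epu64(acc, a, b);
}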

111
c_headers/avx512pfintrin.h Normal file
View File

@ -0,0 +1,111 @@
/*===------------- avx512pfintrin.h - PF intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512PFINTRIN_H
#define __AVX512PFINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\
__builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(long long const *)(addr), (int)(scale), \
(int)(hint)); })
#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\
__builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
(long long const *)(addr), (int)(scale), \
(int)(hint)); })
#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) ({\
__builtin_ia32_gatherpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (int const *)(addr), \
(int)(scale), (int)(hint)); })
#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) ({\
__builtin_ia32_gatherpfdps((__mmask16) -1, \
(__v16si)(__m512i)(index), (int const *)(addr), \
(int)(scale), (int)(hint)); })
#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\
__builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(long long const *)(addr), (int)(scale), \
(int)(hint)); })
#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\
__builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
(long long const *)(addr), (int)(scale), \
(int)(hint)); })
#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) ({\
__builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(int const *)(addr), (int)(scale), (int)(hint)); })
#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) ({\
__builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
(int const *)(addr), (int)(scale), (int)(hint)); })
#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\
__builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
(long long *)(addr), (int)(scale), \
(int)(hint)); })
#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
__builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(long long *)(addr), (int)(scale), \
(int)(hint)); })
#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\
__builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
(int *)(addr), (int)(scale), (int)(hint)); })
#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
__builtin_ia32_scatterpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (int *)(addr), \
(int)(scale), (int)(hint)); })
#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\
__builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
(long long *)(addr), (int)(scale), \
(int)(hint)); })
#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
__builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(long long *)(addr), (int)(scale), \
(int)(hint)); })
#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\
__builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
(int *)(addr), (int)(scale), (int)(hint)); })
#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
__builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(int *)(addr), (int)(scale), (int)(hint)); })
#undef __DEFAULT_FN_ATTRS
#endif
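Illustrative usage sketch, not part of the header above: issuing a gather prefetch with the AVX-512PF macros it defines. The function name is hypothetical; -mavx512pf is assumed, and the scale and hint arguments must be compile-time constants.
#include <immintrin.h>
/* Hypothetical example: prefetch into L1 the 16 floats addressed by 32-bit
   indices, ahead of a later gather. Scale must be 1, 2, 4 or 8; the hint is
   one of _MM_HINT_T0 / _MM_HINT_T1 (from xmmintrin.h, pulled in by immintrin.h).
   Requires -mavx512pf. */
static void prefetch_gather(const float *base, __m512i idx)
{
    _mm512_prefetch_i32gather_ps(idx, base, 4, _MM_HINT_T0);
}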

View File

@ -0,0 +1,137 @@
/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VBMIINTRIN_H
#define __VBMIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi")))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
__mmask64 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
(__v64qi) __I
/* idx */ ,
(__v64qi) __B,
(__mmask64) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
{
return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
/* idx */ ,
(__v64qi) __A,
(__v64qi) __B,
(__mmask64) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
__m512i __I, __m512i __B)
{
return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
/* idx */ ,
(__v64qi) __A,
(__v64qi) __B,
(__mmask64) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
__m512i __I, __m512i __B)
{
return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
/* idx */ ,
(__v64qi) __A,
(__v64qi) __B,
(__mmask64) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
(__v64qi) __A,
(__v64qi) _mm512_undefined_epi32 (),
(__mmask64) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
__m512i __B)
{
return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
(__v64qi) __A,
(__v64qi) _mm512_setzero_si512(),
(__mmask64) __M);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
__m512i __B)
{
return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
(__v64qi) __A,
(__v64qi) __W,
(__mmask64) __M);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y)
{
return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
(__v64qi) __Y,
(__v64qi) __W,
(__mmask64) __M);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y)
{
return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
(__v64qi) __Y,
(__v64qi) _mm512_setzero_si512 (),
(__mmask64) __M);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y)
{
return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
(__v64qi) __Y,
(__v64qi) _mm512_undefined_epi32 (),
(__mmask64) -1);
}
#undef __DEFAULT_FN_ATTRS
#endif
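Illustrative usage sketch, not part of the header above: a full 64-byte table lookup with the VBMI byte-permute intrinsic it declares. Names are hypothetical; -mavx512vbmi is assumed.
#include <immintrin.h>
/* Hypothetical example: shuffle the 64 bytes of `table` using the byte indices
   in `idx` (only the low 6 bits of each index byte are used).
   Requires -mavx512vbmi. */
static __m512i lookup64(__m512i idx, __m512i table)
{
    return _mm512_permutexvar_epi8(idx, table);
}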

View File

@ -0,0 +1,247 @@
/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VBMIVLINTRIN_H
#define __VBMIVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl")))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
__m128i __B)
{
return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
(__v16qi) __I
/* idx */ ,
(__v16qi) __B,
(__mmask16)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
__mmask32 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
(__v32qi) __I
/* idx */ ,
(__v32qi) __B,
(__mmask32)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
{
return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
/* idx */ ,
(__v16qi) __A,
(__v16qi) __B,
(__mmask16) -
1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
__m128i __B)
{
return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
/* idx */ ,
(__v16qi) __A,
(__v16qi) __B,
(__mmask16)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
__m128i __B)
{
return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
/* idx */ ,
(__v16qi) __A,
(__v16qi) __B,
(__mmask16)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
{
return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
/* idx */ ,
(__v32qi) __A,
(__v32qi) __B,
(__mmask32) -
1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
__m256i __I, __m256i __B)
{
return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
/* idx */ ,
(__v32qi) __A,
(__v32qi) __B,
(__mmask32)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
__m256i __I, __m256i __B)
{
return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
/* idx */ ,
(__v32qi) __A,
(__v32qi) __B,
(__mmask32)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
(__v16qi) __A,
(__v16qi) _mm_undefined_si128 (),
(__mmask16) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
(__v16qi) __A,
(__v16qi) _mm_setzero_si128 (),
(__mmask16) __M);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
__m128i __B)
{
return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
(__v16qi) __A,
(__v16qi) __W,
(__mmask16) __M);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
(__v32qi) __A,
(__v32qi) _mm256_undefined_si256 (),
(__mmask32) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
__m256i __B)
{
return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
(__v32qi) __A,
(__v32qi) _mm256_setzero_si256 (),
(__mmask32) __M);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
__m256i __B)
{
return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
(__v32qi) __A,
(__v32qi) __W,
(__mmask32) __M);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
(__v16qi) __Y,
(__v16qi) __W,
(__mmask16) __M);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
(__v16qi) __Y,
(__v16qi)
_mm_setzero_si128 (),
(__mmask16) __M);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
(__v16qi) __Y,
(__v16qi)
_mm_undefined_si128 (),
(__mmask16) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
{
return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
(__v32qi) __Y,
(__v32qi) __W,
(__mmask32) __M);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
{
return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
(__v32qi) __Y,
(__v32qi)
_mm256_setzero_si256 (),
(__mmask32) __M);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
{
return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
(__v32qi) __Y,
(__v32qi)
_mm256_undefined_si256 (),
(__mmask32) -1);
}
#undef __DEFAULT_FN_ATTRS
#endif
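Illustrative usage sketch, not part of the header above: the 128-bit multishift intrinsic it declares, which extracts an unaligned 8-bit field per byte lane. Names are hypothetical; -mavx512vbmi -mavx512vl is assumed.
#include <immintrin.h>
/* Hypothetical example: for each byte of `ctrl`, select 8 bits from the
   corresponding 64-bit lane of `data`, starting at the bit offset named by
   that control byte. Requires -mavx512vbmi -mavx512vl. */
static __m128i extract_fields(__m128i ctrl, __m128i data)
{
    return _mm_multishift_epi64_epi8(ctrl, data);
}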

File diff suppressed because it is too large

View File

@ -0,0 +1,263 @@
/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLCDINTRIN_H
#define __AVX512VLCDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd")))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastmb_epi64 (__mmask8 __A)
{
return (__m128i) __builtin_ia32_broadcastmb128 (__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastmb_epi64 (__mmask8 __A)
{
return (__m256i) __builtin_ia32_broadcastmb256 (__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastmw_epi32 (__mmask16 __A)
{
return (__m128i) __builtin_ia32_broadcastmw128 (__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastmw_epi32 (__mmask16 __A)
{
return (__m256i) __builtin_ia32_broadcastmw256 (__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_conflict_epi64 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
(__v2di) _mm_undefined_si128 (),
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
(__v2di) __W,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
(__v2di)
_mm_setzero_di (),
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_conflict_epi64 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
(__v4di) _mm256_undefined_si256 (),
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
(__v4di) __W,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
(__v4di) _mm256_setzero_si256 (),
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_conflict_epi32 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
(__v4si) _mm_undefined_si128 (),
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
(__v4si) __W,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
(__v4si) _mm_setzero_si128 (),
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_conflict_epi32 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
(__v8si) _mm256_undefined_si256 (),
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
(__v8si) __W,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
(__v8si)
_mm256_setzero_si256 (),
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lzcnt_epi32 (__m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
(__v4si)
_mm_setzero_si128 (),
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
(__v4si) __W,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
(__v4si)
_mm_setzero_si128 (),
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_lzcnt_epi32 (__m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
(__v8si)
_mm256_setzero_si256 (),
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
(__v8si) __W,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
(__v8si)
_mm256_setzero_si256 (),
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lzcnt_epi64 (__m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
(__v2di)
_mm_setzero_di (),
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
(__v2di) __W,
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
(__v2di)
_mm_setzero_di (),
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_lzcnt_epi64 (__m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
(__v4di)
_mm256_setzero_si256 (),
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
(__v4di) __W,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
(__v4di)
_mm256_setzero_si256 (),
(__mmask8) __U);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __AVX512VLCDINTRIN_H */
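Illustrative usage sketch, not part of the header above: the 256-bit conflict-detection and leading-zero-count intrinsics it declares. Names are hypothetical; -mavx512cd -mavx512vl is assumed.
#include <immintrin.h>
/* Hypothetical example: for 8 packed 32-bit indices, compute a per-lane bitmask
   of earlier lanes holding the same value (vpconflictd) and the leading-zero
   count of each lane (vplzcntd). Requires -mavx512cd -mavx512vl. */
static void conflict_and_lzcnt(__m256i idx, __m256i *conflicts, __m256i *lz)
{
    *conflicts = _mm256_conflict_epi32(idx);
    *lz        = _mm256_lzcnt_epi32(idx);
}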

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -25,15 +25,11 @@
#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __BMI2__
# error "BMI2 instruction set not enabled"
#endif /* __BMI2__ */
#ifndef __BMI2INTRIN_H
#define __BMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bzhi_u32(unsigned int __X, unsigned int __Y)

View File

@ -25,30 +25,149 @@
#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __BMI__
# error "BMI instruction set not enabled"
#endif /* __BMI__ */
#ifndef __BMIINTRIN_H
#define __BMIINTRIN_H
/// \brief Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _tzcnt_u16(unsigned short a);
/// \endcode
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 16-bit integer containing the number of trailing zero
/// bits in the operand.
#define _tzcnt_u16(a) (__tzcnt_u16((a)))
/// \brief Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _andn_u32(unsigned int a, unsigned int b);
/// \endcode
///
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param a
/// An unsigned integer containing one of the operands.
/// \param b
/// An unsigned integer containing one of the operands.
/// \returns An unsigned integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
#define _andn_u32(a, b) (__andn_u32((a), (b)))
/* _bextr_u32 != __bextr_u32 */
/// \brief Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsi_u32(unsigned int a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param a
/// An unsigned integer whose bits are to be cleared.
/// \returns An unsigned integer containing the result of clearing the bits from
/// the source operand.
#define _blsi_u32(a) (__blsi_u32((a)))
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsmsk_u32(unsigned int a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param a
/// An unsigned integer used to create the mask.
/// \returns An unsigned integer containing the newly created mask.
#define _blsmsk_u32(a) (__blsmsk_u32((a)))
/// \brief Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsr_u32(unsigned int a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param a
/// An unsigned integer containing the operand to be cleared.
/// \returns An unsigned integer containing the result of clearing the source
/// operand.
#define _blsr_u32(a) (__blsr_u32((a)))
/// \brief Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _tzcnt_u32(unsigned int a);
/// \endcode
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of trailing zero
/// bits in the operand.
#define _tzcnt_u32(a) (__tzcnt_u32((a)))
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
static __inline__ unsigned short __DEFAULT_FN_ATTRS
/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
instruction behaves as BSF on non-BMI targets, there is code that expects
to use it as a potentially faster version of BSF. */
#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
/// \brief Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 16-bit integer containing the number of trailing zero
/// bits in the operand.
static __inline__ unsigned short __RELAXED_FN_ATTRS
__tzcnt_u16(unsigned short __X)
{
return __X ? __builtin_ctzs(__X) : 16;
}
/// \brief Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param __X
/// An unsigned integer containing one of the operands.
/// \param __Y
/// An unsigned integer containing one of the operands.
/// \returns An unsigned integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__andn_u32(unsigned int __X, unsigned int __Y)
{
@ -56,6 +175,21 @@ __andn_u32(unsigned int __X, unsigned int __Y)
}
/* AMD-specified, double-leading-underscore version of BEXTR */
/// \brief Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
/// \param __Y
/// An unsigned integer used to specify which bits are extracted. Bits [7:0]
/// specify the index of the least significant bit. Bits [15:8] specify the
/// number of bits to be extracted.
/// \returns An unsigned integer whose least significant bits contain the
/// extracted bits.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__bextr_u32(unsigned int __X, unsigned int __Y)
{
@ -63,45 +197,214 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
}
/* Intel-specified, single-leading-underscore version of BEXTR */
/// \brief Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
/// \param __Y
/// An unsigned integer used to specify the index of the least significant
/// bit for the bits to be extracted. Bits [7:0] specify the index.
/// \param __Z
/// An unsigned integer used to specify the number of bits to be extracted.
/// Bits [7:0] specify the number of bits.
/// \returns An unsigned integer whose least significant bits contain the
/// extracted bits.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
{
return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}
/// \brief Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be cleared.
/// \returns An unsigned integer containing the result of clearing the bits from
/// the source operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsi_u32(unsigned int __X)
{
return __X & -__X;
}
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param __X
/// An unsigned integer used to create the mask.
/// \returns An unsigned integer containing the newly created mask.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsmsk_u32(unsigned int __X)
{
return __X ^ (__X - 1);
}
/// \brief Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param __X
/// An unsigned integer containing the operand to be cleared.
/// \returns An unsigned integer containing the result of clearing the source
/// operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsr_u32(unsigned int __X)
{
return __X & (__X - 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
/// \brief Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of trailing zero
/// bits in the operand.
static __inline__ unsigned int __RELAXED_FN_ATTRS
__tzcnt_u32(unsigned int __X)
{
return __X ? __builtin_ctz(__X) : 32;
}
/// \brief Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns A 32-bit integer containing the number of trailing zero bits in
/// the operand.
static __inline__ int __RELAXED_FN_ATTRS
_mm_tzcnt_32(unsigned int __X)
{
return __X ? __builtin_ctz(__X) : 32;
}
#ifdef __x86_64__
/// \brief Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b);
/// \endcode
///
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer containing one of the operands.
/// \param b
/// An unsigned 64-bit integer containing one of the operands.
/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
#define _andn_u64(a, b) (__andn_u64((a), (b)))
/* _bextr_u64 != __bextr_u64 */
/// \brief Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _blsi_u64(unsigned long long a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer whose bits are to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// bits from the source operand.
#define _blsi_u64(a) (__blsi_u64((a)))
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _blsmsk_u64(unsigned long long a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer used to create the mask.
/// \returns An unsigned 64-bit integer containing the newly created mask.
#define _blsmsk_u64(a) (__blsmsk_u64((a)))
/// \brief Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _blsr_u64(unsigned long long a);
/// \endcode
///
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer containing the operand to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// source operand.
#define _blsr_u64(a) (__blsr_u64((a)))
/// \brief Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _tzcnt_u64(unsigned long long a);
/// \endcode
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param a
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of trailing zero
/// bits in the operand.
#define _tzcnt_u64(a) (__tzcnt_u64((a)))
/// \brief Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing one of the operands.
/// \param __Y
/// An unsigned 64-bit integer containing one of the operands.
/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__andn_u64 (unsigned long long __X, unsigned long long __Y)
{
@ -109,6 +412,21 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y)
}
/* AMD-specified, double-leading-underscore version of BEXTR */
/// \brief Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
/// \param __Y
/// An unsigned 64-bit integer used to specify which bits are extracted. Bits
/// [7:0] specify the index of the least significant bit. Bits [15:8] specify
/// the number of bits to be extracted.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
/// extracted bits.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__bextr_u64(unsigned long long __X, unsigned long long __Y)
{
@ -116,38 +434,115 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
}
/* Intel-specified, single-leading-underscore version of BEXTR */
/// \brief Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
/// \param __Y
/// An unsigned integer used to specify the index of the least significant
/// bit for the bits to be extracted. Bits [7:0] specify the index.
/// \param __Z
/// An unsigned integer used to specify the number of bits to be extracted.
/// Bits [7:0] specify the number of bits.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
/// extracted bits.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
{
return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}
/// \brief Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// bits from the source operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsi_u64(unsigned long long __X)
{
return __X & -__X;
}
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer used to create the mask.
/// \returns An unsigned 64-bit integer containing the newly created mask.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsmsk_u64(unsigned long long __X)
{
return __X ^ (__X - 1);
}
/// \brief Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing the operand to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// source operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsr_u64(unsigned long long __X)
{
return __X & (__X - 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
/// \brief Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of trailing zero
/// bits in the operand.
static __inline__ unsigned long long __RELAXED_FN_ATTRS
__tzcnt_u64(unsigned long long __X)
{
return __X ? __builtin_ctzll(__X) : 64;
}
/// \brief Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns A 64-bit integer containing the number of trailing zero bits in
/// the operand.
static __inline__ long long __RELAXED_FN_ATTRS
_mm_tzcnt_64(unsigned long long __X)
{
return __X ? __builtin_ctzll(__X) : 64;
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#undef __RELAXED_FN_ATTRS
#endif /* __BMIINTRIN_H */
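Illustrative usage sketch, not part of the header above: a common idiom that combines the TZCNT and BLSR intrinsics documented in this file to visit every set bit of a word. The function name is hypothetical; __blsr_u32 needs -mbmi, while __tzcnt_u32 compiles even without BMI thanks to __RELAXED_FN_ATTRS.
#include <x86intrin.h>
/* Hypothetical example: call fn(i) for each set bit index i in mask, lowest
   bit first. __tzcnt_u32 gives the index of the lowest set bit; __blsr_u32
   clears it. Requires -mbmi for __blsr_u32. */
static void for_each_set_bit(unsigned int mask, void (*fn)(unsigned int))
{
    while (mask != 0) {
        fn(__tzcnt_u32(mask));
        mask = __blsr_u32(mask);
    }
}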

View File

@ -0,0 +1,41 @@
/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __CLFLUSHOPTINTRIN_H
#define __CLFLUSHOPTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt")))
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clflushopt(char * __m) {
__builtin_ia32_clflushopt(__m);
}
#undef __DEFAULT_FN_ATTRS
#endif
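Illustrative usage sketch, not part of the header above: flushing a cache line with the CLFLUSHOPT intrinsic it declares. The function name is hypothetical; -mclflushopt is assumed.
#include <immintrin.h>
/* Hypothetical example: write a byte, then flush its cache line toward memory.
   Ordering against later stores typically still needs a fence.
   Requires -mclflushopt. */
static void flush_line(char *p, char v)
{
    *p = v;
    _mm_clflushopt(p);
}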

View File

@ -82,6 +82,7 @@
/* Features in %ecx for level 1 */
#define bit_SSE3 0x00000001
#define bit_PCLMULQDQ 0x00000002
#define bit_PCLMUL bit_PCLMULQDQ /* for gcc compat */
#define bit_DTES64 0x00000004
#define bit_MONITOR 0x00000008
#define bit_DSCPL 0x00000010
@ -98,15 +99,19 @@
#define bit_PCID 0x00020000
#define bit_DCA 0x00040000
#define bit_SSE41 0x00080000
#define bit_SSE4_1 bit_SSE41 /* for gcc compat */
#define bit_SSE42 0x00100000
#define bit_SSE4_2 bit_SSE42 /* for gcc compat */
#define bit_x2APIC 0x00200000
#define bit_MOVBE 0x00400000
#define bit_POPCNT 0x00800000
#define bit_TSCDeadline 0x01000000
#define bit_AESNI 0x02000000
#define bit_AES bit_AESNI /* for gcc compat */
#define bit_XSAVE 0x04000000
#define bit_OSXSAVE 0x08000000
#define bit_AVX 0x10000000
#define bit_F16C 0x20000000
#define bit_RDRND 0x40000000
/* Features in %edx for level 1 */
@ -119,6 +124,7 @@
#define bit_PAE 0x00000040
#define bit_MCE 0x00000080
#define bit_CX8 0x00000100
#define bit_CMPXCHG8B bit_CX8 /* for gcc compat */
#define bit_APIC 0x00000200
#define bit_SEP 0x00000800
#define bit_MTRR 0x00001000
@ -133,7 +139,7 @@
#define bit_ACPI 0x00400000
#define bit_MMX 0x00800000
#define bit_FXSR 0x01000000
#define bit_FXSAVE bit_FXSR /* for gcc compat */
#define bit_SSE 0x02000000
#define bit_SSE2 0x04000000
#define bit_SS 0x08000000
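The gcc-compat aliases added above (bit_SSE4_2, bit_AES, bit_CMPXCHG8B, ...) are meant to be tested against the registers filled in by __get_cpuid; a small sketch, assuming an x86 target:
#include <cpuid.h>
#include <stdio.h>
int main(void)
{
    unsigned int eax, ebx, ecx, edx;
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
        printf("SSE4.2: %s\n", (ecx & bit_SSE4_2)    ? "yes" : "no");
        printf("AES:    %s\n", (ecx & bit_AES)       ? "yes" : "no");
        printf("CX8:    %s\n", (edx & bit_CMPXCHG8B) ? "yes" : "no");
    }
    return 0;
}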

View File

@ -0,0 +1,96 @@
/*===---- algorithm - CUDA wrapper for <algorithm> ----------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
#define __CLANG_CUDA_WRAPPERS_ALGORITHM
// This header defines __device__ overloads of std::min/max, but only if we're
// <= C++11. In C++14, these functions are constexpr, and so are implicitly
// __host__ __device__.
//
// We don't support the initializer_list overloads because
// initializer_list::begin() and end() are not __host__ __device__ functions.
//
// When compiling in C++14 mode, we could force std::min/max to have different
// implementations for host and device, by declaring the device overloads
// before the constexpr overloads appear. We choose not to do this because
// a) why write our own implementation when we can use one from the standard
// library? and
// b) libstdc++ is evil and declares min/max inside a header that is included
// *before* we include <algorithm>. So we'd have to unconditionally
// declare our __device__ overloads of min/max, but that would pollute
// things for people who choose not to include <algorithm>.
#include_next <algorithm>
#if __cplusplus <= 201103L
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
template <class __T, class __Cmp>
inline __device__ const __T &
max(const __T &__a, const __T &__b, __Cmp __cmp) {
return __cmp(__a, __b) ? __b : __a;
}
template <class __T>
inline __device__ const __T &
max(const __T &__a, const __T &__b) {
return __a < __b ? __b : __a;
}
template <class __T, class __Cmp>
inline __device__ const __T &
min(const __T &__a, const __T &__b, __Cmp __cmp) {
return __cmp(__b, __a) ? __b : __a;
}
template <class __T>
inline __device__ const __T &
min(const __T &__a, const __T &__b) {
return __a < __b ? __b : __a;
}
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#endif // __cplusplus <= 201103L
#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM

View File

@ -0,0 +1,82 @@
/*===---- complex - CUDA wrapper for <complex> ------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
#define __CLANG_CUDA_WRAPPERS_COMPLEX
// Wrapper around <complex> that forces its functions to be __host__
// __device__.
// First, include host-only headers we think are likely to be included by
// <complex>, so that the pragma below only applies to <complex> itself.
#if __cplusplus >= 201103L
#include <type_traits>
#endif
#include <stdexcept>
#include <cmath>
#include <sstream>
// Next, include our <algorithm> wrapper, to ensure that device overloads of
// std::min/max are available.
#include <algorithm>
#pragma clang force_cuda_host_device begin
// When compiling for device, ask libstdc++ to use its own implementations of
// complex functions, rather than calling builtins (which resolve to library
// functions that don't exist when compiling CUDA device code).
//
// This is a little dicey, because it causes libstdc++ to define a different
// set of overloads on host and device.
//
// // Present only when compiling for host.
// __host__ __device__ complex<float> sin(const complex<float>& x) {
// return __builtin_csinf(x);
// }
//
// // Present when compiling for host and for device.
// template <typename T>
// __host__ __device__ complex<T> sin(const complex<T>& x) {
// return complex<T>(sin(x.real()) * cosh(x.imag()),
// cos(x.real()) * sinh(x.imag()));
// }
//
// This is safe because when compiling for device, all function calls in
// __host__ code to sin() will still resolve to *something*, even if they don't
// resolve to the same function as they resolve to when compiling for host. We
// don't care that they don't resolve to the right function because we won't
// codegen this host code when compiling for device.
#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
#define _GLIBCXX_USE_C99_COMPLEX 0
#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
#include_next <complex>
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
#pragma clang force_cuda_host_device end
#endif // include guard

View File

@ -0,0 +1,47 @@
/*===---- new - CUDA wrapper for <new> ------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_NEW
#define __CLANG_CUDA_WRAPPERS_NEW
#include_next <new>
// Device overrides for placement new and delete.
#pragma push_macro("CUDA_NOEXCEPT")
#if __cplusplus >= 201103L
#define CUDA_NOEXCEPT noexcept
#else
#define CUDA_NOEXCEPT
#endif
__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
return __ptr;
}
__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
return __ptr;
}
__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
#pragma pop_macro("CUDA_NOEXCEPT")
#endif // include guard

File diff suppressed because it is too large Load Diff

View File

@ -21,43 +21,104 @@
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <f16cintrin.h> directly; include <x86intrin.h> instead."
#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
#endif
#ifndef __F16C__
# error "F16C instruction is not enabled"
#endif /* __F16C__ */
#ifndef __F16CINTRIN_H
#define __F16CINTRIN_H
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef float __m256 __attribute__ ((__vector_size__ (32)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("f16c")))
#define _mm_cvtps_ph(a, imm) __extension__ ({ \
__m128 __a = (a); \
(__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__a, (imm)); })
/// \brief Converts a 16-bit half-precision float value into a 32-bit float
/// value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 16-bit half-precision float value.
/// \returns The converted 32-bit float value.
static __inline float __DEFAULT_FN_ATTRS
_cvtsh_ss(unsigned short __a)
{
__v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
__v4sf r = __builtin_ia32_vcvtph2ps(v);
return r[0];
}
#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
__m256 __a = (a); \
(__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__a, (imm)); })
/// \brief Converts a 32-bit single-precision float value to a 16-bit
/// half-precision float value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _cvtss_sh(float a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 32-bit single-precision float value to be converted to a 16-bit
/// half-precision float value.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) \
((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]))
/// \brief Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 128-bit vector containing 32-bit float values.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing converted 16-bit half-precision float
/// values. The lower 64 bits are used to store the converted 16-bit
/// half-precision floating-point values.
#define _mm_cvtps_ph(a, imm) \
((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
/// \brief Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 128-bit vector containing 32-bit float values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values. The lower
/// 64 bits are used in the conversion.
/// \returns A 128-bit vector of [4 x float] containing converted float values.
static __inline __m128 __DEFAULT_FN_ATTRS
_mm_cvtph_ps(__m128i __a)
{
return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
}
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_cvtph_ps(__m128i __a)
{
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __F16CINTRIN_H */
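A round-trip sketch of the conversions documented above (assuming compilation with -mf16c): narrow a float to a 16-bit half with _cvtss_sh, then widen it back with _cvtsh_ss.
#include <immintrin.h>
#include <stdio.h>
int main(void)
{
    float x = 3.14159f;
    unsigned short h = _cvtss_sh(x, 0); /* immediate 0: round to nearest */
    float back = _cvtsh_ss(h);          /* widen back to 32-bit float */
    printf("%f -> 0x%04x -> %f\n", x, (unsigned)h, back);
    return 0;
}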

View File

@ -27,9 +27,12 @@
/* If we're on MinGW, fall back to the system's float.h, which might have
* additional definitions provided for Windows.
* For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
*
* Also fall back on Darwin to allow additional definitions and
* implementation-defined values.
*/
#if (defined(__MINGW32__) || defined(_MSC_VER)) && __STDC_HOSTED__ && \
__has_include_next(<float.h>)
#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
__STDC_HOSTED__ && __has_include_next(<float.h>)
# include_next <float.h>
/* Undefine anything that we'll be redefining below. */
@ -39,7 +42,9 @@
# undef FLT_MANT_DIG
# undef DBL_MANT_DIG
# undef LDBL_MANT_DIG
# undef DECIMAL_DIG
# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
# undef DECIMAL_DIG
# endif
# undef FLT_DIG
# undef DBL_DIG
# undef LDBL_DIG
@ -68,6 +73,9 @@
# undef FLT_TRUE_MIN
# undef DBL_TRUE_MIN
# undef LDBL_TRUE_MIN
# undef FLT_DECIMAL_DIG
# undef DBL_DECIMAL_DIG
# undef LDBL_DECIMAL_DIG
# endif
#endif
@ -81,7 +89,9 @@
#define DBL_MANT_DIG __DBL_MANT_DIG__
#define LDBL_MANT_DIG __LDBL_MANT_DIG__
#define DECIMAL_DIG __DECIMAL_DIG__
#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
# define DECIMAL_DIG __DECIMAL_DIG__
#endif
#define FLT_DIG __FLT_DIG__
#define DBL_DIG __DBL_DIG__
@ -119,6 +129,9 @@
# define FLT_TRUE_MIN __FLT_DENORM_MIN__
# define DBL_TRUE_MIN __DBL_DENORM_MIN__
# define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
# define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
# define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__
# define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
#endif
#endif /* __FLOAT_H */
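The *_DECIMAL_DIG macros newly exposed above give the number of decimal digits needed to round-trip a value of the corresponding type through text; a small sketch, assuming a C11 compiler where the macros are defined:
#include <float.h>
#include <stdio.h>
int main(void)
{
    float f = 0.1f;
    /* FLT_DECIMAL_DIG digits are enough to recover f exactly from its
     * decimal representation. */
    printf("%.*g\n", FLT_DECIMAL_DIG, f);
    return 0;
}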

View File

@ -28,209 +28,203 @@
#ifndef __FMA4INTRIN_H
#define __FMA4INTRIN_H
#ifndef __FMA4__
# error "FMA4 instruction set is not enabled"
#else
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __FMA4__ */
#endif /* __FMA4INTRIN_H */

View File

@ -28,207 +28,201 @@
#ifndef __FMAINTRIN_H
#define __FMAINTRIN_H
#ifndef __FMA__
# error "FMA instruction set is not enabled"
#else
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __FMA__ */
#endif /* __FMAINTRIN_H */

View File

@ -28,27 +28,77 @@
#ifndef __FXSRINTRIN_H
#define __FXSRINTRIN_H
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr")))
/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
/// memory region pointed to by the input parameter \a __p.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxsave(void *__p) {
_fxsave(void *__p)
{
return __builtin_ia32_fxsave(__p);
}
/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
/// memory region pointed to by the input parameter \a __p. The contents of
/// this memory region should have been written to by a previous \c _fxsave
/// or \c _fxsave64 intrinsic.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxsave64(void *__p) {
return __builtin_ia32_fxsave64(__p);
}
static __inline__ void __DEFAULT_FN_ATTRS
_fxrstor(void *__p) {
_fxrstor(void *__p)
{
return __builtin_ia32_fxrstor(__p);
}
#ifdef __x86_64__
/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
/// memory region pointed to by the input parameter \a __p.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxrstor64(void *__p) {
_fxsave64(void *__p)
{
return __builtin_ia32_fxsave64(__p);
}
/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
/// memory region pointed to by the input parameter \a __p. The contents of
/// this memory region should have been written to by a previous \c _fxsave
/// or \c _fxsave64 intrinsic.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxrstor64(void *__p)
{
return __builtin_ia32_fxrstor64(__p);
}
#endif
#undef __DEFAULT_FN_ATTRS
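A sketch of the save/restore pairing described in the comments above (assuming a C11 compiler and an x86 target with FXSR enabled): the region must be 512 bytes and 16-byte aligned.
#include <immintrin.h>
/* Hypothetical helper: preserve the x87/MMX/XMM state around a callback
 * that may clobber it. */
static void with_saved_fpu_state(void (*fn)(void))
{
    _Alignas(16) unsigned char area[512];
    _fxsave(area);  /* save state into the 512-byte region */
    fn();
    _fxrstor(area); /* restore it afterwards */
}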

View File

@ -164,24 +164,24 @@ struct __htm_tdb {
/* Helper intrinsics to retry tbegin in case of transient failure. */
static __inline int __attribute__((__always_inline__, __nodebug__))
__builtin_tbegin_retry_null (int retry)
__builtin_tbegin_retry_null (int __retry)
{
int cc, i = 0;
while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT
&& i++ < retry)
&& i++ < __retry)
__builtin_tx_assist(i);
return cc;
}
static __inline int __attribute__((__always_inline__, __nodebug__))
__builtin_tbegin_retry_tdb (void *tdb, int retry)
__builtin_tbegin_retry_tdb (void *__tdb, int __retry)
{
int cc, i = 0;
while ((cc = __builtin_tbegin(tdb)) == _HTM_TBEGIN_TRANSIENT
&& i++ < retry)
while ((cc = __builtin_tbegin(__tdb)) == _HTM_TBEGIN_TRANSIENT
&& i++ < __retry)
__builtin_tx_assist(i);
return cc;
@ -193,24 +193,24 @@ __builtin_tbegin_retry_tdb (void *tdb, int retry)
__builtin_tbegin_retry_tdb(tdb, retry))
static __inline int __attribute__((__always_inline__, __nodebug__))
__builtin_tbegin_retry_nofloat_null (int retry)
__builtin_tbegin_retry_nofloat_null (int __retry)
{
int cc, i = 0;
while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT
&& i++ < retry)
&& i++ < __retry)
__builtin_tx_assist(i);
return cc;
}
static __inline int __attribute__((__always_inline__, __nodebug__))
__builtin_tbegin_retry_nofloat_tdb (void *tdb, int retry)
__builtin_tbegin_retry_nofloat_tdb (void *__tdb, int __retry)
{
int cc, i = 0;
while ((cc = __builtin_tbegin_nofloat(tdb)) == _HTM_TBEGIN_TRANSIENT
&& i++ < retry)
while ((cc = __builtin_tbegin_nofloat(__tdb)) == _HTM_TBEGIN_TRANSIENT
&& i++ < __retry)
__builtin_tx_assist(i);
return cc;

View File

@ -46,7 +46,7 @@ extern "C" {
typedef char TM_buff_type[16];
/* This macro can be used to determine whether a transaction was successfully
started from the __TM_begin() and __TM_simple_begin() intrinsic functions
below. */
#define _HTM_TBEGIN_STARTED 1
@ -62,18 +62,18 @@ __TM_simple_begin (void)
extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_begin (void* const TM_buff)
__TM_begin (void* const __TM_buff)
{
*_TEXASRL_PTR (TM_buff) = 0;
*_TEXASRL_PTR (__TM_buff) = 0;
if (__builtin_expect (__builtin_tbegin (0), 1))
return _HTM_TBEGIN_STARTED;
#ifdef __powerpc64__
*_TEXASR_PTR (TM_buff) = __builtin_get_texasr ();
*_TEXASR_PTR (__TM_buff) = __builtin_get_texasr ();
#else
*_TEXASRU_PTR (TM_buff) = __builtin_get_texasru ();
*_TEXASRL_PTR (TM_buff) = __builtin_get_texasr ();
*_TEXASRU_PTR (__TM_buff) = __builtin_get_texasru ();
*_TEXASRL_PTR (__TM_buff) = __builtin_get_texasr ();
#endif
*_TFIAR_PTR (TM_buff) = __builtin_get_tfiar ();
*_TFIAR_PTR (__TM_buff) = __builtin_get_tfiar ();
return 0;
}
@ -95,9 +95,9 @@ __TM_abort (void)
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_named_abort (unsigned char const code)
__TM_named_abort (unsigned char const __code)
{
__builtin_tabort (code);
__builtin_tabort (__code);
}
extern __inline void
@ -116,47 +116,47 @@ __TM_suspend (void)
extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_is_user_abort (void* const TM_buff)
__TM_is_user_abort (void* const __TM_buff)
{
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
return _TEXASRU_ABORT (texasru);
}
extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_is_named_user_abort (void* const TM_buff, unsigned char *code)
__TM_is_named_user_abort (void* const __TM_buff, unsigned char *__code)
{
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
*code = _TEXASRU_FAILURE_CODE (texasru);
*__code = _TEXASRU_FAILURE_CODE (texasru);
return _TEXASRU_ABORT (texasru);
}
extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_is_illegal (void* const TM_buff)
__TM_is_illegal (void* const __TM_buff)
{
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
return _TEXASRU_DISALLOWED (texasru);
}
extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_is_footprint_exceeded (void* const TM_buff)
__TM_is_footprint_exceeded (void* const __TM_buff)
{
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
return _TEXASRU_FOOTPRINT_OVERFLOW (texasru);
}
extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_nesting_depth (void* const TM_buff)
__TM_nesting_depth (void* const __TM_buff)
{
texasrl_t texasrl;
if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL)
{
texasrl = *_TEXASRL_PTR (TM_buff);
texasrl = *_TEXASRL_PTR (__TM_buff);
if (!_TEXASR_FAILURE_SUMMARY (texasrl))
texasrl = 0;
}
@ -168,15 +168,15 @@ __TM_nesting_depth (void* const TM_buff)
extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_is_nested_too_deep(void* const TM_buff)
__TM_is_nested_too_deep(void* const __TM_buff)
{
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
return _TEXASRU_NESTING_OVERFLOW (texasru);
}
extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_is_conflict(void* const TM_buff)
__TM_is_conflict(void* const __TM_buff)
{
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
/* Return TEXASR bits 11 (Self-Induced Conflict) through
@ -186,24 +186,24 @@ __TM_is_conflict(void* const TM_buff)
extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_is_failure_persistent(void* const TM_buff)
__TM_is_failure_persistent(void* const __TM_buff)
{
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
return _TEXASRU_FAILURE_PERSISTENT (texasru);
}
extern __inline long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_failure_address(void* const TM_buff)
__TM_failure_address(void* const __TM_buff)
{
return *_TFIAR_PTR (TM_buff);
return *_TFIAR_PTR (__TM_buff);
}
extern __inline long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__TM_failure_code(void* const TM_buff)
__TM_failure_code(void* const __TM_buff)
{
return *_TEXASR_PTR (TM_buff);
return *_TEXASR_PTR (__TM_buff);
}
#ifdef __cplusplus
@ -227,9 +227,9 @@ __TM_simple_begin ()
}
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_begin (void* const tdb)
__TM_begin (void* const __tdb)
{
return __builtin_tbegin_nofloat (tdb);
return __builtin_tbegin_nofloat (__tdb);
}
static __inline long __attribute__((__always_inline__, __nodebug__))
@ -245,22 +245,22 @@ __TM_abort ()
}
static __inline void __attribute__((__always_inline__, __nodebug__))
__TM_named_abort (unsigned char const code)
__TM_named_abort (unsigned char const __code)
{
return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + code);
return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + __code);
}
static __inline void __attribute__((__always_inline__, __nodebug__))
__TM_non_transactional_store (void* const addr, long long const value)
__TM_non_transactional_store (void* const __addr, long long const __value)
{
__builtin_non_tx_store ((uint64_t*)addr, (uint64_t)value);
__builtin_non_tx_store ((uint64_t*)__addr, (uint64_t)__value);
}
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_nesting_depth (void* const tdb_ptr)
__TM_nesting_depth (void* const __tdb_ptr)
{
int depth = __builtin_tx_nesting_depth ();
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
if (depth != 0)
return depth;
@ -273,9 +273,9 @@ __TM_nesting_depth (void* const tdb_ptr)
/* Transaction failure diagnostics */
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_is_user_abort (void* const tdb_ptr)
__TM_is_user_abort (void* const __tdb_ptr)
{
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
if (tdb->format != 1)
return 0;
@ -284,25 +284,25 @@ __TM_is_user_abort (void* const tdb_ptr)
}
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_is_named_user_abort (void* const tdb_ptr, unsigned char* code)
__TM_is_named_user_abort (void* const __tdb_ptr, unsigned char* __code)
{
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
if (tdb->format != 1)
return 0;
if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE)
{
*code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
*__code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
return 1;
}
return 0;
}
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_is_illegal (void* const tdb_ptr)
__TM_is_illegal (void* const __tdb_ptr)
{
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
return (tdb->format == 1
&& (tdb->abort_code == 4 /* unfiltered program interruption */
@ -310,9 +310,9 @@ __TM_is_illegal (void* const tdb_ptr)
}
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_is_footprint_exceeded (void* const tdb_ptr)
__TM_is_footprint_exceeded (void* const __tdb_ptr)
{
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
return (tdb->format == 1
&& (tdb->abort_code == 7 /* fetch overflow */
@ -320,17 +320,17 @@ __TM_is_footprint_exceeded (void* const tdb_ptr)
}
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_is_nested_too_deep (void* const tdb_ptr)
__TM_is_nested_too_deep (void* const __tdb_ptr)
{
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */
}
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_is_conflict (void* const tdb_ptr)
__TM_is_conflict (void* const __tdb_ptr)
{
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
return (tdb->format == 1
&& (tdb->abort_code == 9 /* fetch conflict */
@ -338,22 +338,22 @@ __TM_is_conflict (void* const tdb_ptr)
}
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_is_failure_persistent (long const result)
__TM_is_failure_persistent (long const __result)
{
return result == _HTM_TBEGIN_PERSISTENT;
return __result == _HTM_TBEGIN_PERSISTENT;
}
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_failure_address (void* const tdb_ptr)
__TM_failure_address (void* const __tdb_ptr)
{
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
return tdb->atia;
}
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_failure_code (void* const tdb_ptr)
__TM_failure_code (void* const __tdb_ptr)
{
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
return tdb->abort_code;
}

View File

@ -32,50 +32,26 @@
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
__readeflags(void)
{
unsigned long long __res = 0;
__asm__ __volatile__ ("pushf\n\t"
"popq %0\n"
:"=r"(__res)
:
:
);
return __res;
return __builtin_ia32_readeflags_u64();
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))
__writeeflags(unsigned long long __f)
{
__asm__ __volatile__ ("pushq %0\n\t"
"popf\n"
:
:"r"(__f)
:"flags"
);
__builtin_ia32_writeeflags_u64(__f);
}
#else /* !__x86_64__ */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__readeflags(void)
{
unsigned int __res = 0;
__asm__ __volatile__ ("pushf\n\t"
"popl %0\n"
:"=r"(__res)
:
:
);
return __res;
return __builtin_ia32_readeflags_u32();
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))
__writeeflags(unsigned int __f)
{
__asm__ __volatile__ ("pushl %0\n\t"
"popf\n"
:
:"r"(__f)
:"flags"
);
__builtin_ia32_writeeflags_u32(__f);
}
#endif /* !__x86_64__ */
@ -84,12 +60,6 @@ __rdpmc(int __A) {
return __builtin_ia32_rdpmc(__A);
}
/* __rdtsc */
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
__rdtsc(void) {
return __builtin_ia32_rdtsc();
}
/* __rdtscp */
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
__rdtscp(unsigned int *__A) {
@ -98,4 +68,6 @@ __rdtscp(unsigned int *__A) {
#define _rdtsc() __rdtsc()
#define _rdpmc(A) __rdpmc(A)
#endif /* __IA32INTRIN_H */
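With the inline assembly replaced by builtins, reading the flags register looks the same from the caller's side; a short sketch, assuming an x86-64 target where __readeflags returns unsigned long long:
#include <x86intrin.h>
#include <stdio.h>
int main(void)
{
    unsigned long long flags = __readeflags();
    printf("carry flag: %llu\n", flags & 1);        /* CF is bit 0 */
    printf("zero flag:  %llu\n", (flags >> 6) & 1); /* ZF is bit 6 */
    return 0;
}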

View File

@ -24,105 +24,204 @@
#ifndef __IMMINTRIN_H
#define __IMMINTRIN_H
#ifdef __MMX__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MMX__)
#include <mmintrin.h>
#endif
#ifdef __SSE__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE__)
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE2__)
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE3__)
#include <pmmintrin.h>
#endif
#ifdef __SSSE3__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSSE3__)
#include <tmmintrin.h>
#endif
#if defined (__SSE4_2__) || defined (__SSE4_1__)
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__SSE4_2__) || defined(__SSE4_1__))
#include <smmintrin.h>
#endif
#if defined (__AES__) || defined (__PCLMUL__)
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AES__) || defined(__PCLMUL__))
#include <wmmintrin.h>
#endif
#ifdef __AVX__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLFLUSHOPT__)
#include <clflushoptintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__)
#include <avxintrin.h>
#endif
#ifdef __AVX2__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
#include <avx2intrin.h>
#endif
#ifdef __BMI__
/* The 256-bit versions of functions in f16cintrin.h.
Intel documents these as being in immintrin.h, and
they depend on typedefs from avxintrin.h. */
/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector
/// containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 256-bit vector containing 32-bit single-precision float values to be
/// converted to 16-bit half-precision float values.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing the converted 16-bit half-precision
/// float values.
#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
(__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
/// \brief Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 256-bit vector of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values to be
/// converted to 32-bit single-precision float values.
/// \returns A vector of [8 x float] containing the converted 32-bit
/// single-precision float values.
static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
_mm256_cvtph_ps(__m128i __a)
{
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
}
#endif /* __AVX2__ */
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
#include <bmiintrin.h>
#endif
#ifdef __BMI2__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
#include <bmi2intrin.h>
#endif
#ifdef __LZCNT__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
#include <lzcntintrin.h>
#endif
#ifdef __FMA__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
#include <fmaintrin.h>
#endif
#ifdef __AVX512F__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512F__)
#include <avx512fintrin.h>
#endif
#ifdef __AVX512VL__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VL__)
#include <avx512vlintrin.h>
#endif
#ifdef __AVX512BW__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BW__)
#include <avx512bwintrin.h>
#endif
#ifdef __AVX512CD__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
#include <avx512cdintrin.h>
#endif
#ifdef __AVX512DQ__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
#include <avx512dqintrin.h>
#endif
#if defined (__AVX512VL__) && defined (__AVX512BW__)
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512BW__))
#include <avx512vlbwintrin.h>
#endif
#if defined (__AVX512VL__) && defined (__AVX512DQ__)
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512CD__))
#include <avx512vlcdintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512DQ__))
#include <avx512vldqintrin.h>
#endif
#ifdef __AVX512ER__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512ER__)
#include <avx512erintrin.h>
#endif
#ifdef __RDRND__
static __inline__ int __attribute__((__always_inline__, __nodebug__))
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512IFMA__)
#include <avx512ifmaintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512IFMA__) && defined(__AVX512VL__))
#include <avx512ifmavlintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI__)
#include <avx512vbmiintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
(defined(__AVX512VBMI__) && defined(__AVX512VL__))
#include <avx512vbmivlintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
#include <avx512pfintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__)
#include <pkuintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand16_step(unsigned short *__p)
{
return __builtin_ia32_rdrand16_step(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__))
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand32_step(unsigned int *__p)
{
return __builtin_ia32_rdrand32_step(__p);
}
#ifdef __x86_64__
/* __bit_scan_forward */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_bit_scan_forward(int __A) {
return __builtin_ctz(__A);
}
/* __bit_scan_reverse */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_bit_scan_reverse(int __A) {
return 31 - __builtin_clz(__A);
}
#ifdef __x86_64__
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
return __builtin_ia32_rdrand64_step(__p);
@ -130,71 +229,87 @@ _rdrand64_step(unsigned long long *__p)
#endif
#endif /* __RDRND__ */
#ifdef __FSGSBASE__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
#ifdef __x86_64__
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readfsbase_u32(void)
{
return __builtin_ia32_rdfsbase32();
}
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readfsbase_u64(void)
{
return __builtin_ia32_rdfsbase64();
}
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readgsbase_u32(void)
{
return __builtin_ia32_rdgsbase32();
}
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readgsbase_u64(void)
{
return __builtin_ia32_rdgsbase64();
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u32(unsigned int __V)
{
return __builtin_ia32_wrfsbase32(__V);
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u64(unsigned long long __V)
{
return __builtin_ia32_wrfsbase64(__V);
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u32(unsigned int __V)
{
return __builtin_ia32_wrgsbase32(__V);
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u64(unsigned long long __V)
{
return __builtin_ia32_wrgsbase64(__V);
}
#endif
#endif /* __FSGSBASE__ */
#ifdef __RTM__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RTM__)
#include <rtmintrin.h>
#endif
#ifdef __RTM__
#include <xtestintrin.h>
#endif
#ifdef __SHA__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHA__)
#include <shaintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FXSR__)
#include <fxsrintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVE__)
#include <xsaveintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__)
#include <xsaveoptintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEC__)
#include <xsavecintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVES__)
#include <xsavesintrin.h>
#endif
/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
* whereas others are also available at all times. */
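The practical effect of replacing the old #ifdef guards with per-function __target__ attributes is that these headers can be included unconditionally and a single function can opt in to an extension. A hedged sketch (hypothetical function name):

#include <x86intrin.h>

/* Compiles even without -mlzcnt on the whole translation unit, because the
   intrinsic carries __target__("lzcnt") and this caller opts in explicitly. */
__attribute__((target("lzcnt")))
static unsigned count_leading_zeros(unsigned x)
{
    return _lzcnt_u32(x);
}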

File diff suppressed because it is too large

c_headers/inttypes.h
View File

@ -23,6 +23,10 @@
#ifndef __CLANG_INTTYPES_H
#define __CLANG_INTTYPES_H
#if defined(_MSC_VER) && _MSC_VER < 1800
#error MSVC does not have inttypes.h prior to Visual Studio 2013
#endif
#include_next <inttypes.h>
#if defined(_MSC_VER) && _MSC_VER < 1900

c_headers/lzcntintrin.h
View File

@ -25,28 +25,54 @@
#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __LZCNT__
# error "LZCNT instruction is not enabled"
#endif /* __LZCNT__ */
#ifndef __LZCNTINTRIN_H
#define __LZCNTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
/// \brief Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
/// An unsigned 16-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 16-bit integer containing the number of leading zero
/// bits in the operand.
static __inline__ unsigned short __DEFAULT_FN_ATTRS
__lzcnt16(unsigned short __X)
{
return __X ? __builtin_clzs(__X) : 16;
}
/// \brief Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of leading zero
/// bits in the operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__lzcnt32(unsigned int __X)
{
return __X ? __builtin_clz(__X) : 32;
}
/// \brief Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of leading zero
/// bits in the operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_lzcnt_u32(unsigned int __X)
{
@ -54,12 +80,32 @@ _lzcnt_u32(unsigned int __X)
}
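For a concrete feel for the values documented above, a small sketch (hypothetical helper; assumes LZCNT is enabled for the caller):

__attribute__((target("lzcnt")))
static unsigned lzcnt_demo(void)
{
    unsigned short a = __lzcnt16(0x0010);   /* 11 leading zeros in 16 bits */
    unsigned int   b = __lzcnt32(1);        /* 31 */
    unsigned int   c = __lzcnt32(0);        /* 32: zero input is well defined */
    return a + b + c;
}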
#ifdef __x86_64__
/// \brief Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of leading zero
/// bits in the operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__lzcnt64(unsigned long long __X)
{
return __X ? __builtin_clzll(__X) : 64;
}
/// \brief Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of leading zero
/// bits in the operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_lzcnt_u64(unsigned long long __X)
{

c_headers/mm3dnow.h
View File

@ -30,10 +30,10 @@
typedef float __v2sf __attribute__((__vector_size__(8)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
static __inline__ void __DEFAULT_FN_ATTRS
_m_femms() {
_m_femms(void) {
__builtin_ia32_femms();
}
@ -132,6 +132,10 @@ _m_pmulhrw(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
}
/* Handle the 3dnowa instructions here. */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
static __inline__ __m64 __DEFAULT_FN_ATTRS
_m_pf2iw(__m64 __m) {
return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);

File diff suppressed because it is too large

166
c_headers/module.modulemap Normal file
View File

@ -0,0 +1,166 @@
/*===---- module.modulemap - intrinsics module map -------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
module _Builtin_intrinsics [system] [extern_c] {
explicit module altivec {
requires altivec
header "altivec.h"
}
explicit module arm {
requires arm
explicit module acle {
header "arm_acle.h"
export *
}
explicit module neon {
requires neon
header "arm_neon.h"
export *
}
}
explicit module intel {
requires x86
export *
header "immintrin.h"
textual header "f16cintrin.h"
textual header "avxintrin.h"
textual header "avx2intrin.h"
textual header "avx512fintrin.h"
textual header "avx512erintrin.h"
textual header "fmaintrin.h"
header "x86intrin.h"
textual header "bmiintrin.h"
textual header "bmi2intrin.h"
textual header "lzcntintrin.h"
textual header "xopintrin.h"
textual header "fma4intrin.h"
textual header "mwaitxintrin.h"
explicit module mm_malloc {
requires !freestanding
header "mm_malloc.h"
export * // note: for <stdlib.h> dependency
}
explicit module cpuid {
requires gnuinlineasm
header "cpuid.h"
}
explicit module mmx {
header "mmintrin.h"
}
explicit module sse {
export mm_malloc
export mmx
export sse2 // note: for hackish <emmintrin.h> dependency
header "xmmintrin.h"
}
explicit module sse2 {
export sse
header "emmintrin.h"
}
explicit module sse3 {
export sse2
header "pmmintrin.h"
}
explicit module ssse3 {
export sse3
header "tmmintrin.h"
}
explicit module sse4_1 {
export ssse3
header "smmintrin.h"
}
explicit module sse4_2 {
export sse4_1
header "nmmintrin.h"
}
explicit module sse4a {
export sse3
header "ammintrin.h"
}
explicit module popcnt {
header "popcntintrin.h"
}
explicit module mm3dnow {
header "mm3dnow.h"
}
explicit module aes_pclmul {
header "wmmintrin.h"
export aes
export pclmul
}
explicit module aes {
header "__wmmintrin_aes.h"
}
explicit module pclmul {
header "__wmmintrin_pclmul.h"
}
}
explicit module systemz {
requires systemz
export *
header "s390intrin.h"
explicit module htm {
requires htm
header "htmintrin.h"
header "htmxlintrin.h"
}
explicit module zvector {
requires zvector, vx
header "vecintrin.h"
}
}
}
module _Builtin_stddef_max_align_t [system] [extern_c] {
header "__stddef_max_align_t.h"
}
module opencl_c {
requires opencl
header "opencl-c.h"
}
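As a loose illustration of what this module map enables (hypothetical file name; assumes clang is invoked with -fmodules), an ordinary include of one of the listed headers is then satisfied by the corresponding module rather than by textual inclusion:

/* demo.c, built with something like: clang -fmodules -msse4.1 -c demo.c */
#include <smmintrin.h>   /* resolved through _Builtin_intrinsics.intel.sse4_1 */

__m128i make_zero(void)
{
    return _mm_setzero_si128();
}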

583
c_headers/msa.h Normal file
View File

@ -0,0 +1,583 @@
/*===---- msa.h - MIPS MSA intrinsics --------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef _MSA_H
#define _MSA_H 1
#if defined(__mips_msa)
typedef signed char v16i8 __attribute__((vector_size(16), aligned(16)));
typedef signed char v16i8_b __attribute__((vector_size(16), aligned(1)));
typedef unsigned char v16u8 __attribute__((vector_size(16), aligned(16)));
typedef unsigned char v16u8_b __attribute__((vector_size(16), aligned(1)));
typedef short v8i16 __attribute__((vector_size(16), aligned(16)));
typedef short v8i16_h __attribute__((vector_size(16), aligned(2)));
typedef unsigned short v8u16 __attribute__((vector_size(16), aligned(16)));
typedef unsigned short v8u16_h __attribute__((vector_size(16), aligned(2)));
typedef int v4i32 __attribute__((vector_size(16), aligned(16)));
typedef int v4i32_w __attribute__((vector_size(16), aligned(4)));
typedef unsigned int v4u32 __attribute__((vector_size(16), aligned(16)));
typedef unsigned int v4u32_w __attribute__((vector_size(16), aligned(4)));
typedef long long v2i64 __attribute__((vector_size(16), aligned(16)));
typedef long long v2i64_d __attribute__((vector_size(16), aligned(8)));
typedef unsigned long long v2u64 __attribute__((vector_size(16), aligned(16)));
typedef unsigned long long v2u64_d __attribute__((vector_size(16), aligned(8)));
typedef float v4f32 __attribute__((vector_size(16), aligned(16)));
typedef float v4f32_w __attribute__((vector_size(16), aligned(4)));
typedef double v2f64 __attribute__ ((vector_size(16), aligned(16)));
typedef double v2f64_d __attribute__ ((vector_size(16), aligned(8)));
#define __msa_sll_b __builtin_msa_sll_b
#define __msa_sll_h __builtin_msa_sll_h
#define __msa_sll_w __builtin_msa_sll_w
#define __msa_sll_d __builtin_msa_sll_d
#define __msa_slli_b __builtin_msa_slli_b
#define __msa_slli_h __builtin_msa_slli_h
#define __msa_slli_w __builtin_msa_slli_w
#define __msa_slli_d __builtin_msa_slli_d
#define __msa_sra_b __builtin_msa_sra_b
#define __msa_sra_h __builtin_msa_sra_h
#define __msa_sra_w __builtin_msa_sra_w
#define __msa_sra_d __builtin_msa_sra_d
#define __msa_srai_b __builtin_msa_srai_b
#define __msa_srai_h __builtin_msa_srai_h
#define __msa_srai_w __builtin_msa_srai_w
#define __msa_srai_d __builtin_msa_srai_d
#define __msa_srar_b __builtin_msa_srar_b
#define __msa_srar_h __builtin_msa_srar_h
#define __msa_srar_w __builtin_msa_srar_w
#define __msa_srar_d __builtin_msa_srar_d
#define __msa_srari_b __builtin_msa_srari_b
#define __msa_srari_h __builtin_msa_srari_h
#define __msa_srari_w __builtin_msa_srari_w
#define __msa_srari_d __builtin_msa_srari_d
#define __msa_srl_b __builtin_msa_srl_b
#define __msa_srl_h __builtin_msa_srl_h
#define __msa_srl_w __builtin_msa_srl_w
#define __msa_srl_d __builtin_msa_srl_d
#define __msa_srli_b __builtin_msa_srli_b
#define __msa_srli_h __builtin_msa_srli_h
#define __msa_srli_w __builtin_msa_srli_w
#define __msa_srli_d __builtin_msa_srli_d
#define __msa_srlr_b __builtin_msa_srlr_b
#define __msa_srlr_h __builtin_msa_srlr_h
#define __msa_srlr_w __builtin_msa_srlr_w
#define __msa_srlr_d __builtin_msa_srlr_d
#define __msa_srlri_b __builtin_msa_srlri_b
#define __msa_srlri_h __builtin_msa_srlri_h
#define __msa_srlri_w __builtin_msa_srlri_w
#define __msa_srlri_d __builtin_msa_srlri_d
#define __msa_bclr_b __builtin_msa_bclr_b
#define __msa_bclr_h __builtin_msa_bclr_h
#define __msa_bclr_w __builtin_msa_bclr_w
#define __msa_bclr_d __builtin_msa_bclr_d
#define __msa_bclri_b __builtin_msa_bclri_b
#define __msa_bclri_h __builtin_msa_bclri_h
#define __msa_bclri_w __builtin_msa_bclri_w
#define __msa_bclri_d __builtin_msa_bclri_d
#define __msa_bset_b __builtin_msa_bset_b
#define __msa_bset_h __builtin_msa_bset_h
#define __msa_bset_w __builtin_msa_bset_w
#define __msa_bset_d __builtin_msa_bset_d
#define __msa_bseti_b __builtin_msa_bseti_b
#define __msa_bseti_h __builtin_msa_bseti_h
#define __msa_bseti_w __builtin_msa_bseti_w
#define __msa_bseti_d __builtin_msa_bseti_d
#define __msa_bneg_b __builtin_msa_bneg_b
#define __msa_bneg_h __builtin_msa_bneg_h
#define __msa_bneg_w __builtin_msa_bneg_w
#define __msa_bneg_d __builtin_msa_bneg_d
#define __msa_bnegi_b __builtin_msa_bnegi_b
#define __msa_bnegi_h __builtin_msa_bnegi_h
#define __msa_bnegi_w __builtin_msa_bnegi_w
#define __msa_bnegi_d __builtin_msa_bnegi_d
#define __msa_binsl_b __builtin_msa_binsl_b
#define __msa_binsl_h __builtin_msa_binsl_h
#define __msa_binsl_w __builtin_msa_binsl_w
#define __msa_binsl_d __builtin_msa_binsl_d
#define __msa_binsli_b __builtin_msa_binsli_b
#define __msa_binsli_h __builtin_msa_binsli_h
#define __msa_binsli_w __builtin_msa_binsli_w
#define __msa_binsli_d __builtin_msa_binsli_d
#define __msa_binsr_b __builtin_msa_binsr_b
#define __msa_binsr_h __builtin_msa_binsr_h
#define __msa_binsr_w __builtin_msa_binsr_w
#define __msa_binsr_d __builtin_msa_binsr_d
#define __msa_binsri_b __builtin_msa_binsri_b
#define __msa_binsri_h __builtin_msa_binsri_h
#define __msa_binsri_w __builtin_msa_binsri_w
#define __msa_binsri_d __builtin_msa_binsri_d
#define __msa_addv_b __builtin_msa_addv_b
#define __msa_addv_h __builtin_msa_addv_h
#define __msa_addv_w __builtin_msa_addv_w
#define __msa_addv_d __builtin_msa_addv_d
#define __msa_addvi_b __builtin_msa_addvi_b
#define __msa_addvi_h __builtin_msa_addvi_h
#define __msa_addvi_w __builtin_msa_addvi_w
#define __msa_addvi_d __builtin_msa_addvi_d
#define __msa_subv_b __builtin_msa_subv_b
#define __msa_subv_h __builtin_msa_subv_h
#define __msa_subv_w __builtin_msa_subv_w
#define __msa_subv_d __builtin_msa_subv_d
#define __msa_subvi_b __builtin_msa_subvi_b
#define __msa_subvi_h __builtin_msa_subvi_h
#define __msa_subvi_w __builtin_msa_subvi_w
#define __msa_subvi_d __builtin_msa_subvi_d
#define __msa_max_s_b __builtin_msa_max_s_b
#define __msa_max_s_h __builtin_msa_max_s_h
#define __msa_max_s_w __builtin_msa_max_s_w
#define __msa_max_s_d __builtin_msa_max_s_d
#define __msa_maxi_s_b __builtin_msa_maxi_s_b
#define __msa_maxi_s_h __builtin_msa_maxi_s_h
#define __msa_maxi_s_w __builtin_msa_maxi_s_w
#define __msa_maxi_s_d __builtin_msa_maxi_s_d
#define __msa_max_u_b __builtin_msa_max_u_b
#define __msa_max_u_h __builtin_msa_max_u_h
#define __msa_max_u_w __builtin_msa_max_u_w
#define __msa_max_u_d __builtin_msa_max_u_d
#define __msa_maxi_u_b __builtin_msa_maxi_u_b
#define __msa_maxi_u_h __builtin_msa_maxi_u_h
#define __msa_maxi_u_w __builtin_msa_maxi_u_w
#define __msa_maxi_u_d __builtin_msa_maxi_u_d
#define __msa_min_s_b __builtin_msa_min_s_b
#define __msa_min_s_h __builtin_msa_min_s_h
#define __msa_min_s_w __builtin_msa_min_s_w
#define __msa_min_s_d __builtin_msa_min_s_d
#define __msa_mini_s_b __builtin_msa_mini_s_b
#define __msa_mini_s_h __builtin_msa_mini_s_h
#define __msa_mini_s_w __builtin_msa_mini_s_w
#define __msa_mini_s_d __builtin_msa_mini_s_d
#define __msa_min_u_b __builtin_msa_min_u_b
#define __msa_min_u_h __builtin_msa_min_u_h
#define __msa_min_u_w __builtin_msa_min_u_w
#define __msa_min_u_d __builtin_msa_min_u_d
#define __msa_mini_u_b __builtin_msa_mini_u_b
#define __msa_mini_u_h __builtin_msa_mini_u_h
#define __msa_mini_u_w __builtin_msa_mini_u_w
#define __msa_mini_u_d __builtin_msa_mini_u_d
#define __msa_max_a_b __builtin_msa_max_a_b
#define __msa_max_a_h __builtin_msa_max_a_h
#define __msa_max_a_w __builtin_msa_max_a_w
#define __msa_max_a_d __builtin_msa_max_a_d
#define __msa_min_a_b __builtin_msa_min_a_b
#define __msa_min_a_h __builtin_msa_min_a_h
#define __msa_min_a_w __builtin_msa_min_a_w
#define __msa_min_a_d __builtin_msa_min_a_d
#define __msa_ceq_b __builtin_msa_ceq_b
#define __msa_ceq_h __builtin_msa_ceq_h
#define __msa_ceq_w __builtin_msa_ceq_w
#define __msa_ceq_d __builtin_msa_ceq_d
#define __msa_ceqi_b __builtin_msa_ceqi_b
#define __msa_ceqi_h __builtin_msa_ceqi_h
#define __msa_ceqi_w __builtin_msa_ceqi_w
#define __msa_ceqi_d __builtin_msa_ceqi_d
#define __msa_clt_s_b __builtin_msa_clt_s_b
#define __msa_clt_s_h __builtin_msa_clt_s_h
#define __msa_clt_s_w __builtin_msa_clt_s_w
#define __msa_clt_s_d __builtin_msa_clt_s_d
#define __msa_clti_s_b __builtin_msa_clti_s_b
#define __msa_clti_s_h __builtin_msa_clti_s_h
#define __msa_clti_s_w __builtin_msa_clti_s_w
#define __msa_clti_s_d __builtin_msa_clti_s_d
#define __msa_clt_u_b __builtin_msa_clt_u_b
#define __msa_clt_u_h __builtin_msa_clt_u_h
#define __msa_clt_u_w __builtin_msa_clt_u_w
#define __msa_clt_u_d __builtin_msa_clt_u_d
#define __msa_clti_u_b __builtin_msa_clti_u_b
#define __msa_clti_u_h __builtin_msa_clti_u_h
#define __msa_clti_u_w __builtin_msa_clti_u_w
#define __msa_clti_u_d __builtin_msa_clti_u_d
#define __msa_cle_s_b __builtin_msa_cle_s_b
#define __msa_cle_s_h __builtin_msa_cle_s_h
#define __msa_cle_s_w __builtin_msa_cle_s_w
#define __msa_cle_s_d __builtin_msa_cle_s_d
#define __msa_clei_s_b __builtin_msa_clei_s_b
#define __msa_clei_s_h __builtin_msa_clei_s_h
#define __msa_clei_s_w __builtin_msa_clei_s_w
#define __msa_clei_s_d __builtin_msa_clei_s_d
#define __msa_cle_u_b __builtin_msa_cle_u_b
#define __msa_cle_u_h __builtin_msa_cle_u_h
#define __msa_cle_u_w __builtin_msa_cle_u_w
#define __msa_cle_u_d __builtin_msa_cle_u_d
#define __msa_clei_u_b __builtin_msa_clei_u_b
#define __msa_clei_u_h __builtin_msa_clei_u_h
#define __msa_clei_u_w __builtin_msa_clei_u_w
#define __msa_clei_u_d __builtin_msa_clei_u_d
#define __msa_ld_b __builtin_msa_ld_b
#define __msa_ld_h __builtin_msa_ld_h
#define __msa_ld_w __builtin_msa_ld_w
#define __msa_ld_d __builtin_msa_ld_d
#define __msa_st_b __builtin_msa_st_b
#define __msa_st_h __builtin_msa_st_h
#define __msa_st_w __builtin_msa_st_w
#define __msa_st_d __builtin_msa_st_d
#define __msa_sat_s_b __builtin_msa_sat_s_b
#define __msa_sat_s_h __builtin_msa_sat_s_h
#define __msa_sat_s_w __builtin_msa_sat_s_w
#define __msa_sat_s_d __builtin_msa_sat_s_d
#define __msa_sat_u_b __builtin_msa_sat_u_b
#define __msa_sat_u_h __builtin_msa_sat_u_h
#define __msa_sat_u_w __builtin_msa_sat_u_w
#define __msa_sat_u_d __builtin_msa_sat_u_d
#define __msa_add_a_b __builtin_msa_add_a_b
#define __msa_add_a_h __builtin_msa_add_a_h
#define __msa_add_a_w __builtin_msa_add_a_w
#define __msa_add_a_d __builtin_msa_add_a_d
#define __msa_adds_a_b __builtin_msa_adds_a_b
#define __msa_adds_a_h __builtin_msa_adds_a_h
#define __msa_adds_a_w __builtin_msa_adds_a_w
#define __msa_adds_a_d __builtin_msa_adds_a_d
#define __msa_adds_s_b __builtin_msa_adds_s_b
#define __msa_adds_s_h __builtin_msa_adds_s_h
#define __msa_adds_s_w __builtin_msa_adds_s_w
#define __msa_adds_s_d __builtin_msa_adds_s_d
#define __msa_adds_u_b __builtin_msa_adds_u_b
#define __msa_adds_u_h __builtin_msa_adds_u_h
#define __msa_adds_u_w __builtin_msa_adds_u_w
#define __msa_adds_u_d __builtin_msa_adds_u_d
#define __msa_ave_s_b __builtin_msa_ave_s_b
#define __msa_ave_s_h __builtin_msa_ave_s_h
#define __msa_ave_s_w __builtin_msa_ave_s_w
#define __msa_ave_s_d __builtin_msa_ave_s_d
#define __msa_ave_u_b __builtin_msa_ave_u_b
#define __msa_ave_u_h __builtin_msa_ave_u_h
#define __msa_ave_u_w __builtin_msa_ave_u_w
#define __msa_ave_u_d __builtin_msa_ave_u_d
#define __msa_aver_s_b __builtin_msa_aver_s_b
#define __msa_aver_s_h __builtin_msa_aver_s_h
#define __msa_aver_s_w __builtin_msa_aver_s_w
#define __msa_aver_s_d __builtin_msa_aver_s_d
#define __msa_aver_u_b __builtin_msa_aver_u_b
#define __msa_aver_u_h __builtin_msa_aver_u_h
#define __msa_aver_u_w __builtin_msa_aver_u_w
#define __msa_aver_u_d __builtin_msa_aver_u_d
#define __msa_subs_s_b __builtin_msa_subs_s_b
#define __msa_subs_s_h __builtin_msa_subs_s_h
#define __msa_subs_s_w __builtin_msa_subs_s_w
#define __msa_subs_s_d __builtin_msa_subs_s_d
#define __msa_subs_u_b __builtin_msa_subs_u_b
#define __msa_subs_u_h __builtin_msa_subs_u_h
#define __msa_subs_u_w __builtin_msa_subs_u_w
#define __msa_subs_u_d __builtin_msa_subs_u_d
#define __msa_subsuu_s_b __builtin_msa_subsuu_s_b
#define __msa_subsuu_s_h __builtin_msa_subsuu_s_h
#define __msa_subsuu_s_w __builtin_msa_subsuu_s_w
#define __msa_subsuu_s_d __builtin_msa_subsuu_s_d
#define __msa_subsus_u_b __builtin_msa_subsus_u_b
#define __msa_subsus_u_h __builtin_msa_subsus_u_h
#define __msa_subsus_u_w __builtin_msa_subsus_u_w
#define __msa_subsus_u_d __builtin_msa_subsus_u_d
#define __msa_asub_s_b __builtin_msa_asub_s_b
#define __msa_asub_s_h __builtin_msa_asub_s_h
#define __msa_asub_s_w __builtin_msa_asub_s_w
#define __msa_asub_s_d __builtin_msa_asub_s_d
#define __msa_asub_u_b __builtin_msa_asub_u_b
#define __msa_asub_u_h __builtin_msa_asub_u_h
#define __msa_asub_u_w __builtin_msa_asub_u_w
#define __msa_asub_u_d __builtin_msa_asub_u_d
#define __msa_mulv_b __builtin_msa_mulv_b
#define __msa_mulv_h __builtin_msa_mulv_h
#define __msa_mulv_w __builtin_msa_mulv_w
#define __msa_mulv_d __builtin_msa_mulv_d
#define __msa_maddv_b __builtin_msa_maddv_b
#define __msa_maddv_h __builtin_msa_maddv_h
#define __msa_maddv_w __builtin_msa_maddv_w
#define __msa_maddv_d __builtin_msa_maddv_d
#define __msa_msubv_b __builtin_msa_msubv_b
#define __msa_msubv_h __builtin_msa_msubv_h
#define __msa_msubv_w __builtin_msa_msubv_w
#define __msa_msubv_d __builtin_msa_msubv_d
#define __msa_div_s_b __builtin_msa_div_s_b
#define __msa_div_s_h __builtin_msa_div_s_h
#define __msa_div_s_w __builtin_msa_div_s_w
#define __msa_div_s_d __builtin_msa_div_s_d
#define __msa_div_u_b __builtin_msa_div_u_b
#define __msa_div_u_h __builtin_msa_div_u_h
#define __msa_div_u_w __builtin_msa_div_u_w
#define __msa_div_u_d __builtin_msa_div_u_d
#define __msa_hadd_s_h __builtin_msa_hadd_s_h
#define __msa_hadd_s_w __builtin_msa_hadd_s_w
#define __msa_hadd_s_d __builtin_msa_hadd_s_d
#define __msa_hadd_u_h __builtin_msa_hadd_u_h
#define __msa_hadd_u_w __builtin_msa_hadd_u_w
#define __msa_hadd_u_d __builtin_msa_hadd_u_d
#define __msa_hsub_s_h __builtin_msa_hsub_s_h
#define __msa_hsub_s_w __builtin_msa_hsub_s_w
#define __msa_hsub_s_d __builtin_msa_hsub_s_d
#define __msa_hsub_u_h __builtin_msa_hsub_u_h
#define __msa_hsub_u_w __builtin_msa_hsub_u_w
#define __msa_hsub_u_d __builtin_msa_hsub_u_d
#define __msa_mod_s_b __builtin_msa_mod_s_b
#define __msa_mod_s_h __builtin_msa_mod_s_h
#define __msa_mod_s_w __builtin_msa_mod_s_w
#define __msa_mod_s_d __builtin_msa_mod_s_d
#define __msa_mod_u_b __builtin_msa_mod_u_b
#define __msa_mod_u_h __builtin_msa_mod_u_h
#define __msa_mod_u_w __builtin_msa_mod_u_w
#define __msa_mod_u_d __builtin_msa_mod_u_d
#define __msa_dotp_s_h __builtin_msa_dotp_s_h
#define __msa_dotp_s_w __builtin_msa_dotp_s_w
#define __msa_dotp_s_d __builtin_msa_dotp_s_d
#define __msa_dotp_u_h __builtin_msa_dotp_u_h
#define __msa_dotp_u_w __builtin_msa_dotp_u_w
#define __msa_dotp_u_d __builtin_msa_dotp_u_d
#define __msa_dpadd_s_h __builtin_msa_dpadd_s_h
#define __msa_dpadd_s_w __builtin_msa_dpadd_s_w
#define __msa_dpadd_s_d __builtin_msa_dpadd_s_d
#define __msa_dpadd_u_h __builtin_msa_dpadd_u_h
#define __msa_dpadd_u_w __builtin_msa_dpadd_u_w
#define __msa_dpadd_u_d __builtin_msa_dpadd_u_d
#define __msa_dpsub_s_h __builtin_msa_dpsub_s_h
#define __msa_dpsub_s_w __builtin_msa_dpsub_s_w
#define __msa_dpsub_s_d __builtin_msa_dpsub_s_d
#define __msa_dpsub_u_h __builtin_msa_dpsub_u_h
#define __msa_dpsub_u_w __builtin_msa_dpsub_u_w
#define __msa_dpsub_u_d __builtin_msa_dpsub_u_d
#define __msa_sld_b __builtin_msa_sld_b
#define __msa_sld_h __builtin_msa_sld_h
#define __msa_sld_w __builtin_msa_sld_w
#define __msa_sld_d __builtin_msa_sld_d
#define __msa_sldi_b __builtin_msa_sldi_b
#define __msa_sldi_h __builtin_msa_sldi_h
#define __msa_sldi_w __builtin_msa_sldi_w
#define __msa_sldi_d __builtin_msa_sldi_d
#define __msa_splat_b __builtin_msa_splat_b
#define __msa_splat_h __builtin_msa_splat_h
#define __msa_splat_w __builtin_msa_splat_w
#define __msa_splat_d __builtin_msa_splat_d
#define __msa_splati_b __builtin_msa_splati_b
#define __msa_splati_h __builtin_msa_splati_h
#define __msa_splati_w __builtin_msa_splati_w
#define __msa_splati_d __builtin_msa_splati_d
#define __msa_pckev_b __builtin_msa_pckev_b
#define __msa_pckev_h __builtin_msa_pckev_h
#define __msa_pckev_w __builtin_msa_pckev_w
#define __msa_pckev_d __builtin_msa_pckev_d
#define __msa_pckod_b __builtin_msa_pckod_b
#define __msa_pckod_h __builtin_msa_pckod_h
#define __msa_pckod_w __builtin_msa_pckod_w
#define __msa_pckod_d __builtin_msa_pckod_d
#define __msa_ilvl_b __builtin_msa_ilvl_b
#define __msa_ilvl_h __builtin_msa_ilvl_h
#define __msa_ilvl_w __builtin_msa_ilvl_w
#define __msa_ilvl_d __builtin_msa_ilvl_d
#define __msa_ilvr_b __builtin_msa_ilvr_b
#define __msa_ilvr_h __builtin_msa_ilvr_h
#define __msa_ilvr_w __builtin_msa_ilvr_w
#define __msa_ilvr_d __builtin_msa_ilvr_d
#define __msa_ilvev_b __builtin_msa_ilvev_b
#define __msa_ilvev_h __builtin_msa_ilvev_h
#define __msa_ilvev_w __builtin_msa_ilvev_w
#define __msa_ilvev_d __builtin_msa_ilvev_d
#define __msa_ilvod_b __builtin_msa_ilvod_b
#define __msa_ilvod_h __builtin_msa_ilvod_h
#define __msa_ilvod_w __builtin_msa_ilvod_w
#define __msa_ilvod_d __builtin_msa_ilvod_d
#define __msa_vshf_b __builtin_msa_vshf_b
#define __msa_vshf_h __builtin_msa_vshf_h
#define __msa_vshf_w __builtin_msa_vshf_w
#define __msa_vshf_d __builtin_msa_vshf_d
#define __msa_and_v __builtin_msa_and_v
#define __msa_andi_b __builtin_msa_andi_b
#define __msa_or_v __builtin_msa_or_v
#define __msa_ori_b __builtin_msa_ori_b
#define __msa_nor_v __builtin_msa_nor_v
#define __msa_nori_b __builtin_msa_nori_b
#define __msa_xor_v __builtin_msa_xor_v
#define __msa_xori_b __builtin_msa_xori_b
#define __msa_bmnz_v __builtin_msa_bmnz_v
#define __msa_bmnzi_b __builtin_msa_bmnzi_b
#define __msa_bmz_v __builtin_msa_bmz_v
#define __msa_bmzi_b __builtin_msa_bmzi_b
#define __msa_bsel_v __builtin_msa_bsel_v
#define __msa_bseli_b __builtin_msa_bseli_b
#define __msa_shf_b __builtin_msa_shf_b
#define __msa_shf_h __builtin_msa_shf_h
#define __msa_shf_w __builtin_msa_shf_w
#define __msa_test_bnz_v __builtin_msa_bnz_v
#define __msa_test_bz_v __builtin_msa_bz_v
#define __msa_fill_b __builtin_msa_fill_b
#define __msa_fill_h __builtin_msa_fill_h
#define __msa_fill_w __builtin_msa_fill_w
#define __msa_fill_d __builtin_msa_fill_d
#define __msa_pcnt_b __builtin_msa_pcnt_b
#define __msa_pcnt_h __builtin_msa_pcnt_h
#define __msa_pcnt_w __builtin_msa_pcnt_w
#define __msa_pcnt_d __builtin_msa_pcnt_d
#define __msa_nloc_b __builtin_msa_nloc_b
#define __msa_nloc_h __builtin_msa_nloc_h
#define __msa_nloc_w __builtin_msa_nloc_w
#define __msa_nloc_d __builtin_msa_nloc_d
#define __msa_nlzc_b __builtin_msa_nlzc_b
#define __msa_nlzc_h __builtin_msa_nlzc_h
#define __msa_nlzc_w __builtin_msa_nlzc_w
#define __msa_nlzc_d __builtin_msa_nlzc_d
#define __msa_copy_s_b __builtin_msa_copy_s_b
#define __msa_copy_s_h __builtin_msa_copy_s_h
#define __msa_copy_s_w __builtin_msa_copy_s_w
#define __msa_copy_s_d __builtin_msa_copy_s_d
#define __msa_copy_u_b __builtin_msa_copy_u_b
#define __msa_copy_u_h __builtin_msa_copy_u_h
#define __msa_copy_u_w __builtin_msa_copy_u_w
#define __msa_copy_u_d __builtin_msa_copy_u_d
#define __msa_insert_b __builtin_msa_insert_b
#define __msa_insert_h __builtin_msa_insert_h
#define __msa_insert_w __builtin_msa_insert_w
#define __msa_insert_d __builtin_msa_insert_d
#define __msa_insve_b __builtin_msa_insve_b
#define __msa_insve_h __builtin_msa_insve_h
#define __msa_insve_w __builtin_msa_insve_w
#define __msa_insve_d __builtin_msa_insve_d
#define __msa_test_bnz_b __builtin_msa_bnz_b
#define __msa_test_bnz_h __builtin_msa_bnz_h
#define __msa_test_bnz_w __builtin_msa_bnz_w
#define __msa_test_bnz_d __builtin_msa_bnz_d
#define __msa_test_bz_b __builtin_msa_bz_b
#define __msa_test_bz_h __builtin_msa_bz_h
#define __msa_test_bz_w __builtin_msa_bz_w
#define __msa_test_bz_d __builtin_msa_bz_d
#define __msa_ldi_b __builtin_msa_ldi_b
#define __msa_ldi_h __builtin_msa_ldi_h
#define __msa_ldi_w __builtin_msa_ldi_w
#define __msa_ldi_d __builtin_msa_ldi_d
#define __msa_fcaf_w __builtin_msa_fcaf_w
#define __msa_fcaf_d __builtin_msa_fcaf_d
#define __msa_fcor_w __builtin_msa_fcor_w
#define __msa_fcor_d __builtin_msa_fcor_d
#define __msa_fcun_w __builtin_msa_fcun_w
#define __msa_fcun_d __builtin_msa_fcun_d
#define __msa_fcune_w __builtin_msa_fcune_w
#define __msa_fcune_d __builtin_msa_fcune_d
#define __msa_fcueq_w __builtin_msa_fcueq_w
#define __msa_fcueq_d __builtin_msa_fcueq_d
#define __msa_fceq_w __builtin_msa_fceq_w
#define __msa_fceq_d __builtin_msa_fceq_d
#define __msa_fcne_w __builtin_msa_fcne_w
#define __msa_fcne_d __builtin_msa_fcne_d
#define __msa_fclt_w __builtin_msa_fclt_w
#define __msa_fclt_d __builtin_msa_fclt_d
#define __msa_fcult_w __builtin_msa_fcult_w
#define __msa_fcult_d __builtin_msa_fcult_d
#define __msa_fcle_w __builtin_msa_fcle_w
#define __msa_fcle_d __builtin_msa_fcle_d
#define __msa_fcule_w __builtin_msa_fcule_w
#define __msa_fcule_d __builtin_msa_fcule_d
#define __msa_fsaf_w __builtin_msa_fsaf_w
#define __msa_fsaf_d __builtin_msa_fsaf_d
#define __msa_fsor_w __builtin_msa_fsor_w
#define __msa_fsor_d __builtin_msa_fsor_d
#define __msa_fsun_w __builtin_msa_fsun_w
#define __msa_fsun_d __builtin_msa_fsun_d
#define __msa_fsune_w __builtin_msa_fsune_w
#define __msa_fsune_d __builtin_msa_fsune_d
#define __msa_fsueq_w __builtin_msa_fsueq_w
#define __msa_fsueq_d __builtin_msa_fsueq_d
#define __msa_fseq_w __builtin_msa_fseq_w
#define __msa_fseq_d __builtin_msa_fseq_d
#define __msa_fsne_w __builtin_msa_fsne_w
#define __msa_fsne_d __builtin_msa_fsne_d
#define __msa_fslt_w __builtin_msa_fslt_w
#define __msa_fslt_d __builtin_msa_fslt_d
#define __msa_fsult_w __builtin_msa_fsult_w
#define __msa_fsult_d __builtin_msa_fsult_d
#define __msa_fsle_w __builtin_msa_fsle_w
#define __msa_fsle_d __builtin_msa_fsle_d
#define __msa_fsule_w __builtin_msa_fsule_w
#define __msa_fsule_d __builtin_msa_fsule_d
#define __msa_fadd_w __builtin_msa_fadd_w
#define __msa_fadd_d __builtin_msa_fadd_d
#define __msa_fsub_w __builtin_msa_fsub_w
#define __msa_fsub_d __builtin_msa_fsub_d
#define __msa_fmul_w __builtin_msa_fmul_w
#define __msa_fmul_d __builtin_msa_fmul_d
#define __msa_fdiv_w __builtin_msa_fdiv_w
#define __msa_fdiv_d __builtin_msa_fdiv_d
#define __msa_fmadd_w __builtin_msa_fmadd_w
#define __msa_fmadd_d __builtin_msa_fmadd_d
#define __msa_fmsub_w __builtin_msa_fmsub_w
#define __msa_fmsub_d __builtin_msa_fmsub_d
#define __msa_fexp2_w __builtin_msa_fexp2_w
#define __msa_fexp2_d __builtin_msa_fexp2_d
#define __msa_fexdo_h __builtin_msa_fexdo_h
#define __msa_fexdo_w __builtin_msa_fexdo_w
#define __msa_ftq_h __builtin_msa_ftq_h
#define __msa_ftq_w __builtin_msa_ftq_w
#define __msa_fmin_w __builtin_msa_fmin_w
#define __msa_fmin_d __builtin_msa_fmin_d
#define __msa_fmin_a_w __builtin_msa_fmin_a_w
#define __msa_fmin_a_d __builtin_msa_fmin_a_d
#define __msa_fmax_w __builtin_msa_fmax_w
#define __msa_fmax_d __builtin_msa_fmax_d
#define __msa_fmax_a_w __builtin_msa_fmax_a_w
#define __msa_fmax_a_d __builtin_msa_fmax_a_d
#define __msa_mul_q_h __builtin_msa_mul_q_h
#define __msa_mul_q_w __builtin_msa_mul_q_w
#define __msa_mulr_q_h __builtin_msa_mulr_q_h
#define __msa_mulr_q_w __builtin_msa_mulr_q_w
#define __msa_madd_q_h __builtin_msa_madd_q_h
#define __msa_madd_q_w __builtin_msa_madd_q_w
#define __msa_maddr_q_h __builtin_msa_maddr_q_h
#define __msa_maddr_q_w __builtin_msa_maddr_q_w
#define __msa_msub_q_h __builtin_msa_msub_q_h
#define __msa_msub_q_w __builtin_msa_msub_q_w
#define __msa_msubr_q_h __builtin_msa_msubr_q_h
#define __msa_msubr_q_w __builtin_msa_msubr_q_w
#define __msa_fclass_w __builtin_msa_fclass_w
#define __msa_fclass_d __builtin_msa_fclass_d
#define __msa_fsqrt_w __builtin_msa_fsqrt_w
#define __msa_fsqrt_d __builtin_msa_fsqrt_d
#define __msa_frcp_w __builtin_msa_frcp_w
#define __msa_frcp_d __builtin_msa_frcp_d
#define __msa_frint_w __builtin_msa_frint_w
#define __msa_frint_d __builtin_msa_frint_d
#define __msa_frsqrt_w __builtin_msa_frsqrt_w
#define __msa_frsqrt_d __builtin_msa_frsqrt_d
#define __msa_flog2_w __builtin_msa_flog2_w
#define __msa_flog2_d __builtin_msa_flog2_d
#define __msa_fexupl_w __builtin_msa_fexupl_w
#define __msa_fexupl_d __builtin_msa_fexupl_d
#define __msa_fexupr_w __builtin_msa_fexupr_w
#define __msa_fexupr_d __builtin_msa_fexupr_d
#define __msa_ffql_w __builtin_msa_ffql_w
#define __msa_ffql_d __builtin_msa_ffql_d
#define __msa_ffqr_w __builtin_msa_ffqr_w
#define __msa_ffqr_d __builtin_msa_ffqr_d
#define __msa_ftint_s_w __builtin_msa_ftint_s_w
#define __msa_ftint_s_d __builtin_msa_ftint_s_d
#define __msa_ftint_u_w __builtin_msa_ftint_u_w
#define __msa_ftint_u_d __builtin_msa_ftint_u_d
#define __msa_ftrunc_s_w __builtin_msa_ftrunc_s_w
#define __msa_ftrunc_s_d __builtin_msa_ftrunc_s_d
#define __msa_ftrunc_u_w __builtin_msa_ftrunc_u_w
#define __msa_ftrunc_u_d __builtin_msa_ftrunc_u_d
#define __msa_ffint_s_w __builtin_msa_ffint_s_w
#define __msa_ffint_s_d __builtin_msa_ffint_s_d
#define __msa_ffint_u_w __builtin_msa_ffint_u_w
#define __msa_ffint_u_d __builtin_msa_ffint_u_d
#define __msa_cfcmsa __builtin_msa_cfcmsa
#define __msa_move_v __builtin_msa_move_v
#define __msa_cast_to_vector_float __builtin_msa_cast_to_vector_float
#define __msa_cast_to_vector_double __builtin_msa_cast_to_vector_double
#define __msa_cast_to_scalar_float __builtin_msa_cast_to_scalar_float
#define __msa_cast_to_scalar_double __builtin_msa_cast_to_scalar_double
#endif /* defined(__mips_msa) */
#endif /* _MSA_H */
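A minimal usage sketch for these wrappers (hypothetical function name; assumes a MIPS target built with -mmsa):

#include <msa.h>

v4i32 add_words(v4i32 a, v4i32 b)
{
    return __msa_addv_w(a, b);   /* element-wise 32-bit vector add */
}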

47
c_headers/mwaitxintrin.h Normal file
View File

@ -0,0 +1,47 @@
/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _MWAITXINTRIN_H
#define _MWAITXINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mwaitx")))
static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitorx(void const * __p, unsigned __extensions, unsigned __hints)
{
__builtin_ia32_monitorx((void *)__p, __extensions, __hints);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
{
__builtin_ia32_mwaitx(__extensions, __hints, __clock);
}
#undef __DEFAULT_FN_ATTRS
#endif /* _MWAITXINTRIN_H */
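A rough sketch of how these two intrinsics pair up (hypothetical names; assumes an AMD CPU with MONITORX/MWAITX support):

#include <x86intrin.h>

static volatile int ready;

__attribute__((target("mwaitx")))
static void wait_until_ready(void)
{
    while (!ready) {
        _mm_monitorx((void *)&ready, 0, 0);   /* arm the monitor on &ready */
        if (!ready)
            _mm_mwaitx(0, 0, 0);              /* idle until a store or other wakeup */
    }
}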

c_headers/nmmintrin.h
View File

@ -24,12 +24,7 @@
#ifndef _NMMINTRIN_H
#define _NMMINTRIN_H
#ifndef __SSE4_2__
#error "SSE4.2 instruction set not enabled"
#else
/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
just include it now then. */
#include <smmintrin.h>
#endif /* __SSE4_2__ */
#endif /* _NMMINTRIN_H */

17055
c_headers/opencl-c.h Normal file

File diff suppressed because it is too large

48
c_headers/pkuintrin.h Normal file
View File

@ -0,0 +1,48 @@
/*===------------- pkuintrin.h - PKU intrinsics ------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __PKUINTRIN_H
#define __PKUINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku")))
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_rdpkru_u32(void)
{
return __builtin_ia32_rdpkru();
}
static __inline__ void __DEFAULT_FN_ATTRS
_wrpkru(unsigned int __val)
{
return __builtin_ia32_wrpkru(__val);
}
#undef __DEFAULT_FN_ATTRS
#endif
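A hedged sketch of reading and updating PKRU (hypothetical helper; only meaningful where the OS has enabled protection keys, for example via pkey_alloc on Linux):

#include <immintrin.h>

__attribute__((target("pku")))
static void deny_reads_for_pkey2(void)
{
    unsigned int pkru = _rdpkru_u32();
    _wrpkru(pkru | (1u << (2 * 2)));   /* bit 2*key is the access-disable bit */
}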

c_headers/pmmintrin.h
View File

@ -20,79 +20,241 @@
*
*===-----------------------------------------------------------------------===
*/
#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H
#ifndef __SSE3__
#error "SSE3 instruction set not enabled"
#else
#include <emmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse3")))
/// \brief Loads data from an unaligned memory location to elements in a 128-bit
/// vector. If the address of the data is not 16-byte aligned, the
/// instruction may read two adjacent aligned blocks of memory to retrieve
/// the requested data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit integer vector containing integer values.
/// \returns A 128-bit vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lddqu_si128(__m128i const *__p)
{
return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}
/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
/// two 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the left source operand.
/// \param __b
/// A 128-bit vector of [4 x float] containing the right source operand.
/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
/// differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_addsub_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_addsubps(__a, __b);
return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
}
/// \brief Horizontally adds the adjacent pairs of values contained in two
/// 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
/// both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hadd_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_haddps(__a, __b);
return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in two
/// 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal differences between the values are stored in the lower
/// bits of the destination.
/// \param __b
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal differences between the values are stored in the upper
/// bits of the destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal
/// differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hsub_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_hsubps(__a, __b);
return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
}
/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
/// vector of [4 x float] to float values stored in a 128-bit vector of
/// [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. \n
/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
/// the destination. \n
/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehdup_ps(__m128 __a)
{
return __builtin_shufflevector(__a, __a, 1, 1, 3, 3);
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}
/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] \n
/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
/// the destination. \n
/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_moveldup_ps(__m128 __a)
{
return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
}
/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
/// two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the left source operand.
/// \param __b
/// A 128-bit vector of [2 x double] containing the right source operand.
/// \returns A 128-bit vector of [2 x double] containing the alternating sums
/// and differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_addsub_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_addsubpd(__a, __b);
return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
}
/// \brief Horizontally adds the pairs of values contained in two 128-bit
/// vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal sum of the values is stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal sum of the values is stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
/// both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hadd_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_haddpd(__a, __b);
return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
}
/// \brief Horizontally subtracts the pairs of values contained in two 128-bit
/// vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal difference of the values is stored in the lower bits of
/// the destination.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal difference of the values is stored in the upper bits of
/// the destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal
/// differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hsub_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_hsubpd(__a, __b);
return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
}
/// \brief Moves and duplicates one double-precision value to double-precision
/// values stored in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const * dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
/// A pointer to a double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] containing the moved and
/// duplicated values.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
/// \brief Moves and duplicates the double-precision value in the lower bits of
/// a 128-bit vector of [2 x double] to double-precision values stored in a
/// 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
/// [127:64] and [63:0] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the moved and
/// duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_movedup_pd(__m128d __a)
{
return __builtin_shufflevector(__a, __a, 0, 0);
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}
#define _MM_DENORMALS_ZERO_ON (0x0040)
@ -103,12 +265,40 @@ _mm_movedup_pd(__m128d __a)
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
/// \brief Establishes a linear address memory range to be monitored and puts
/// the processor in the monitor event pending state. Data stored in the
/// monitored address range causes the processor to exit the pending state.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
///
/// \param __p
/// The memory range to be monitored. The size of the range is determined by
/// CPUID function 0000_0005h.
/// \param __extensions
/// Optional extensions for the monitoring state.
/// \param __hints
/// Optional hints for the monitoring state.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
{
__builtin_ia32_monitor((void *)__p, __extensions, __hints);
}
/// \brief Used with the MONITOR instruction to wait while the processor is in
/// the monitor event pending state. Data stored in the monitored address
/// range causes the processor to exit the pending state.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
///
/// \param __extensions
/// Optional extensions for the monitoring state, which may vary by
/// processor.
/// \param __hints
/// Optional hints for the monitoring state, which may vary by processor.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwait(unsigned __extensions, unsigned __hints)
{
@ -117,6 +307,4 @@ _mm_mwait(unsigned __extensions, unsigned __hints)
#undef __DEFAULT_FN_ATTRS
#endif /* __SSE3__ */
#endif /* __PMMINTRIN_H */
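To make the horizontal-add documentation above concrete, a small sketch (hypothetical helper; assumes SSE3 is available to the caller):

#include <pmmintrin.h>

__attribute__((target("sse3")))
static float sum4(__m128 v)
{
    __m128 t = _mm_hadd_ps(v, v);   /* {v0+v1, v2+v3, v0+v1, v2+v3} */
    t = _mm_hadd_ps(t, t);          /* every lane now holds v0+v1+v2+v3 */
    return _mm_cvtss_f32(t);
}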

c_headers/popcntintrin.h
View File

@ -21,28 +21,76 @@
*===-----------------------------------------------------------------------===
*/
#ifndef __POPCNT__
#error "POPCNT instruction set not enabled"
#endif
#ifndef _POPCNTINTRIN_H
#define _POPCNTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
/// \brief Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// An unsigned 32-bit integer operand.
/// \returns A 32-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_popcnt_u32(unsigned int __A)
{
return __builtin_popcount(__A);
}
/// \brief Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// A signed 32-bit integer operand.
/// \returns A 32-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ int __DEFAULT_FN_ATTRS
_popcnt32(int __A)
{
return __builtin_popcount(__A);
}
#ifdef __x86_64__
/// \brief Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// An unsigned 64-bit integer operand.
/// \returns A 64-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_popcnt_u64(unsigned long long __A)
{
return __builtin_popcountll(__A);
}
/// \brief Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// A signed 64-bit integer operand.
/// \returns A 64-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ long long __DEFAULT_FN_ATTRS
_popcnt64(long long __A)
{
return __builtin_popcountll(__A);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
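A brief, illustrative use of the population-count intrinsics documented above (hypothetical helper):

#include <popcntintrin.h>

__attribute__((target("popcnt")))
static int popcnt_demo(void)
{
    int bits = _mm_popcnt_u32(0xF0F0u);      /* 8 set bits */
#ifdef __x86_64__
    bits += (int)_mm_popcnt_u64(0x3ULL);     /* plus 2 more on x86-64 */
#endif
    return bits;
}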

c_headers/prfchwintrin.h
View File

@ -29,6 +29,12 @@
#define __PRFCHWINTRIN_H
#if defined(__PRFCHW__) || defined(__3dNOW__)
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetch(void *__P)
{
__builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetchw(void *__P)
{

c_headers/rdseedintrin.h
View File

@ -28,10 +28,8 @@
#ifndef __RDSEEDINTRIN_H
#define __RDSEEDINTRIN_H
#ifdef __RDSEED__
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
static __inline__ int __DEFAULT_FN_ATTRS
_rdseed16_step(unsigned short *__p)
@ -55,5 +53,4 @@ _rdseed64_step(unsigned long long *__p)
#undef __DEFAULT_FN_ATTRS
#endif /* __RDSEED__ */
#endif /* __RDSEEDINTRIN_H */
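A small usage sketch (hypothetical helper; RDSEED reports transient failure, so callers typically retry):

#include <x86intrin.h>

__attribute__((target("rdseed")))
static unsigned short seed16(void)
{
    unsigned short s = 0;
    while (!_rdseed16_step(&s))
        ;   /* failure is transient; simply retry */
    return s;
}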

c_headers/rtmintrin.h
View File

@ -38,7 +38,7 @@
#define _XABORT_CODE(x) (((x) >> 24) & 0xFF)
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_xbegin(void)

c_headers/shaintrin.h
View File

@ -28,15 +28,11 @@
#ifndef __SHAINTRIN_H
#define __SHAINTRIN_H
#if !defined (__SHA__)
# error "SHA instructions not enabled"
#endif
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
__builtin_ia32_sha1rnds4((V1), (V2), (M)); })
__builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)

c_headers/smmintrin.h
View File

@ -24,14 +24,10 @@
#ifndef _SMMINTRIN_H
#define _SMMINTRIN_H
#ifndef __SSE4_1__
#error "SSE4.1 instruction set not enabled"
#else
#include <tmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
/* SSE4 Rounding macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
@ -61,35 +57,28 @@
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
#define _mm_round_ps(X, M) __extension__ ({ \
__m128 __X = (X); \
(__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); })
(__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
#define _mm_round_ss(X, Y, M) __extension__ ({ \
__m128 __X = (X); \
__m128 __Y = (Y); \
(__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); })
(__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)); })
#define _mm_round_pd(X, M) __extension__ ({ \
__m128d __X = (X); \
(__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); })
(__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
#define _mm_round_sd(X, Y, M) __extension__ ({ \
__m128d __X = (X); \
__m128d __Y = (Y); \
(__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); })
(__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)); })
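To make the rounding macros concrete, a quick sketch (hypothetical helper and values; assumes SSE4.1 is available to the caller):

__attribute__((target("sse4.1")))
static __m128 round_demo(void)
{
    __m128 v = _mm_set_ps(3.7f, 2.5f, 1.2f, 0.5f);
    /* round to nearest even: lanes 0..3 become {0.0f, 1.0f, 2.0f, 4.0f} */
    return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}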
/* SSE4 Packed Blending Intrinsics. */
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
__m128d __V1 = (V1); \
__m128d __V2 = (V2); \
(__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), \
(((M) & 0x01) ? 2 : 0), \
(((M) & 0x02) ? 3 : 1)); })
#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
__m128 __V1 = (V1); \
__m128 __V2 = (V2); \
(__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \
(__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
(((M) & 0x01) ? 4 : 0), \
(((M) & 0x02) ? 5 : 1), \
(((M) & 0x04) ? 6 : 2), \
@ -117,9 +106,8 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
}
#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
__m128i __V1 = (V1); \
__m128i __V2 = (V2); \
(__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), \
(((M) & 0x01) ? 8 : 0), \
(((M) & 0x02) ? 9 : 1), \
(((M) & 0x04) ? 10 : 2), \
@ -133,7 +121,7 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@ -144,20 +132,18 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/* SSE4 Floating Point Dot Product Instructions. */
#define _mm_dp_ps(X, Y, M) __extension__ ({ \
__m128 __X = (X); \
__m128 __Y = (Y); \
(__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); })
(__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)); })
#define _mm_dp_pd(X, Y, M) __extension__ ({\
__m128d __X = (X); \
__m128d __Y = (Y); \
(__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); })
(__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)); })
/* SSE4 Streaming Load Hint Instruction. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_stream_load_si128 (__m128i *__V)
_mm_stream_load_si128 (__m128i const *__V)
{
return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
}
/* SSE4 Packed Integer Min/Max Instructions. */
@ -213,7 +199,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
#define _mm_extract_ps(X, N) (__extension__ \
({ union { int __i; float __f; } __t; \
__v4sf __a = (__v4sf)(X); \
__v4sf __a = (__v4sf)(__m128)(X); \
__t.__f = __a[(N) & 3]; \
__t.__i;}))
@ -221,39 +207,44 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/* Extract a single-precision float from X at index N into D. */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
(D) = __a[N]; }))
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
an index suitable for _mm_insert_ps. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
/* Extract a float from X at index N into the first index of the return. */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
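/* Illustrative sketch, not part of the header: extracting lane 2 of a vector
   through the helper macro defined above. */
static inline float extract_lane2_example(__m128 __v)
{
  float __d;
  _MM_EXTRACT_FLOAT(__d, __v, 2); /* __d now holds element 2 of __v */
  return __d;
}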
/* Insert int into packed integer array at index. */
#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
__a[(N) & 15] = (I); \
__a;}))
#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
__a[(N) & 3] = (I); \
__a;}))
#define _mm_insert_epi8(X, I, N) (__extension__ \
({ __v16qi __a = (__v16qi)(__m128i)(X); \
__a[(N) & 15] = (I); \
(__m128i)__a;}))
#define _mm_insert_epi32(X, I, N) (__extension__ \
({ __v4si __a = (__v4si)(__m128i)(X); \
__a[(N) & 3] = (I); \
(__m128i)__a;}))
#ifdef __x86_64__
#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
__a[(N) & 1] = (I); \
__a;}))
#define _mm_insert_epi64(X, I, N) (__extension__ \
({ __v2di __a = (__v2di)(__m128i)(X); \
__a[(N) & 1] = (I); \
(__m128i)__a;}))
#endif /* __x86_64__ */
/* Extract int from packed integer array at index. This returns the element
* as a zero extended value, so it is unsigned.
*/
#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
(int)(unsigned char) \
__a[(N) & 15];}))
#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
__a[(N) & 3];}))
#define _mm_extract_epi8(X, N) (__extension__ \
({ __v16qi __a = (__v16qi)(__m128i)(X); \
(int)(unsigned char) __a[(N) & 15];}))
#define _mm_extract_epi32(X, N) (__extension__ \
({ __v4si __a = (__v4si)(__m128i)(X); \
(int)__a[(N) & 3];}))
#ifdef __x86_64__
#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
__a[(N) & 1];}))
#define _mm_extract_epi64(X, N) (__extension__ \
({ __v2di __a = (__v2di)(__m128i)(X); \
(long long)__a[(N) & 1];}))
#endif /* __x86_64 */
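/* Illustrative sketch, not part of the header: the extract macros zero-extend,
   so a byte of 0xFF reads back as 255 rather than -1. */
static inline int extract_zero_extends_example(void)
{
  __m128i __v = _mm_set1_epi8((char)0xFF);
  __v = _mm_insert_epi32(__v, 42, 1); /* overwrite 32-bit lane 1 only */
  return _mm_extract_epi8(__v, 0);    /* byte 0 is untouched: returns 255 */
}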
/* SSE4 128-bit Packed Integer Comparisons. */
@ -290,74 +281,80 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi16(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
/* This function always performs a signed extension, but __v16qi is a char
which may be signed or unsigned, so use __v16qs. */
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi32(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
/* This function always performs a signed extension, but __v16qi is a char
which may be signed or unsigned, so use __v16qs. */
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi64(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
/* This function always performs a signed extension, but __v16qi is a char
which may be signed or unsigned, so use __v16qs. */
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi32(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi64(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi32_epi64(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
}
/* SSE4 Packed Integer Zero-Extension. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi16(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi32(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi64(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi32(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi64(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu32_epi64(__m128i __V)
{
return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
}
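/* Illustrative sketch, not part of the header: the same bytes widen
   differently under sign- and zero-extension. */
static inline void widen_example(void)
{
  __m128i __bytes = _mm_set1_epi8((char)0xFF);
  __m128i __s = _mm_cvtepi8_epi16(__bytes); /* every 16-bit lane: -1  */
  __m128i __u = _mm_cvtepu8_epi16(__bytes); /* every 16-bit lane: 255 */
  (void)__s; (void)__u;
}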
/* SSE4 Pack with Unsigned Saturation. */
@ -369,9 +366,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
/* SSE4 Multiple Packed Sums of Absolute Difference. */
#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
__m128i __X = (X); \
__m128i __Y = (Y); \
(__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); })
(__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M)); })
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_minpos_epu16(__m128i __V)
@ -379,9 +375,13 @@ _mm_minpos_epu16(__m128i __V)
return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
}
/* Handle the sse4.2 definitions here. */
/* These definitions are normally in nmmintrin.h, but gcc puts them in here
so we'll do the same. */
#ifdef __SSE4_2__
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
/* These specify the type of data that we're comparing. */
#define _SIDD_UBYTE_OPS 0x00
@ -410,36 +410,59 @@ _mm_minpos_epu16(__m128i __V)
#define _SIDD_UNIT_MASK 0x40
/* SSE4.2 Packed Comparison Intrinsics. */
#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))
#define _mm_cmpistrm(A, B, M) \
(__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistri(A, B, M) \
(int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpestrm(A, LA, B, LB, M) \
__builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
(__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
#define _mm_cmpestri(A, LA, B, LB, M) \
__builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))
(int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
#define _mm_cmpistra(A, B, M) \
__builtin_ia32_pcmpistria128((A), (B), (M))
(int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistrc(A, B, M) \
__builtin_ia32_pcmpistric128((A), (B), (M))
(int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistro(A, B, M) \
__builtin_ia32_pcmpistrio128((A), (B), (M))
(int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistrs(A, B, M) \
__builtin_ia32_pcmpistris128((A), (B), (M))
(int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistrz(A, B, M) \
__builtin_ia32_pcmpistriz128((A), (B), (M))
(int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpestra(A, LA, B, LB, M) \
__builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
(int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
#define _mm_cmpestrc(A, LA, B, LB, M) \
__builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
(int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
#define _mm_cmpestro(A, LA, B, LB, M) \
__builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
(int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
#define _mm_cmpestrs(A, LA, B, LB, M) \
__builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
(int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
#define _mm_cmpestrz(A, LA, B, LB, M) \
__builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))
(int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
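/* Illustrative sketch, not part of the header: locating the first byte of a
   16-byte text block that matches any byte of a set. _SIDD_CMP_EQUAL_ANY and
   _SIDD_LEAST_SIGNIFICANT are assumed from the full _SIDD_* list elided in
   the hunk above. */
static inline int find_first_vowel_example(__m128i __text)
{
  const __m128i __set = _mm_setr_epi8('a','e','i','o','u',0,0,0,
                                      0,0,0,0,0,0,0,0);
  /* Returns the index of the first match in __text, or 16 if none. */
  return _mm_cmpistri(__set, __text,
                      _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                      _SIDD_LEAST_SIGNIFICANT);
}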
/* SSE4.2 Compare Packed Data -- Greater Than. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
@ -481,7 +504,4 @@ _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
#include <popcntintrin.h>
#endif
#endif /* __SSE4_2__ */
#endif /* __SSE4_1__ */
#endif /* _SMMINTRIN_H */

c_headers/stdatomic.h

@ -45,11 +45,11 @@ extern "C" {
#define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE
#define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE
#define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE
#define ATOMIC_SHORT_T_LOCK_FREE __GCC_ATOMIC_SHORT_T_LOCK_FREE
#define ATOMIC_INT_T_LOCK_FREE __GCC_ATOMIC_INT_T_LOCK_FREE
#define ATOMIC_LONG_T_LOCK_FREE __GCC_ATOMIC_LONG_T_LOCK_FREE
#define ATOMIC_LLONG_T_LOCK_FREE __GCC_ATOMIC_LLONG_T_LOCK_FREE
#define ATOMIC_POINTER_T_LOCK_FREE __GCC_ATOMIC_POINTER_T_LOCK_FREE
#define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE
#define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE
#define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE
#define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE
#define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE
/* 7.17.2 Initialization */

c_headers/stdint.h

@ -77,14 +77,14 @@
* C99 7.18.1.2 Minimum-width integer types.
* C99 7.18.1.3 Fastest minimum-width integer types.
*
* The standard requires that exact-width type be defined for 8-, 16-, 32-, and
* 64-bit types if they are implemented. Other exact width types are optional.
* This implementation defines an exact-width types for every integer width
* that is represented in the standard integer types.
*
* The standard also requires minimum-width types be defined for 8-, 16-, 32-,
* and 64-bit widths regardless of whether there are corresponding exact-width
* types.
*
* To accommodate targets that are missing types that are exactly 8, 16, 32, or
* 64 bits wide, this implementation takes an approach of cascading
@ -97,7 +97,7 @@
* suboptimal.
*
* In violation of the standard, some targets do not implement a type that is
* wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).
* To accommodate these targets, a required minimum-width type is only
* defined if there exists an exact-width type of equal or greater width.
*/
@ -247,7 +247,7 @@ typedef __uint_least8_t uint_fast8_t;
#endif /* __int_least8_t */
/* prevent glibc sys/types.h from defining conflicting types */
#ifndef __int8_t_defined
# define __int8_t_defined
#endif /* __int8_t_defined */
@ -280,9 +280,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
*
* The standard requires that integer constant macros be defined for all the
* minimum-width types defined above. As 8-, 16-, 32-, and 64-bit minimum-width
* types are required, the corresponding integer constant macros are defined
* here. This implementation also defines minimum-width types for every other
* integer width that the target implements, so corresponding macros are
* defined below, too.
*
* These macros are defined using the same successive-shrinking approach as
@ -452,7 +452,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#endif /* __int_least8_t */
/* C99 7.18.2.1 Limits of exact-width integer types.
* C99 7.18.2.2 Limits of minimum-width integer types.
* C99 7.18.2.3 Limits of fastest minimum-width integer types.
*

c_headers/tbmintrin.h

@ -21,10 +21,6 @@
*===-----------------------------------------------------------------------===
*/
#ifndef __TBM__
#error "TBM instruction set is not enabled"
#endif
#ifndef __X86INTRIN_H
#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
#endif
@ -33,119 +29,123 @@
#define __TBMINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
#define __bextri_u32(a, b) (__builtin_ia32_bextri_u32((a), (b)))
#define __bextri_u32(a, b) \
((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
(unsigned int)(b)))
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcfill_u32(unsigned int a)
__blcfill_u32(unsigned int __a)
{
return a & (a + 1);
return __a & (__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blci_u32(unsigned int a)
__blci_u32(unsigned int __a)
{
return a | ~(a + 1);
return __a | ~(__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcic_u32(unsigned int a)
__blcic_u32(unsigned int __a)
{
return ~a & (a + 1);
return ~__a & (__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcmsk_u32(unsigned int a)
__blcmsk_u32(unsigned int __a)
{
return a ^ (a + 1);
return __a ^ (__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcs_u32(unsigned int a)
__blcs_u32(unsigned int __a)
{
return a | (a + 1);
return __a | (__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsfill_u32(unsigned int a)
__blsfill_u32(unsigned int __a)
{
return a | (a - 1);
return __a | (__a - 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsic_u32(unsigned int a)
__blsic_u32(unsigned int __a)
{
return ~a | (a - 1);
return ~__a | (__a - 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__t1mskc_u32(unsigned int a)
__t1mskc_u32(unsigned int __a)
{
return ~a | (a + 1);
return ~__a | (__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__tzmsk_u32(unsigned int a)
__tzmsk_u32(unsigned int __a)
{
return ~a & (a - 1);
return ~__a & (__a - 1);
}
#ifdef __x86_64__
#define __bextri_u64(a, b) (__builtin_ia32_bextri_u64((a), (int)(b)))
#define __bextri_u64(a, b) \
((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \
(unsigned long long)(b)))
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcfill_u64(unsigned long long a)
__blcfill_u64(unsigned long long __a)
{
return a & (a + 1);
return __a & (__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blci_u64(unsigned long long a)
__blci_u64(unsigned long long __a)
{
return a | ~(a + 1);
return __a | ~(__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcic_u64(unsigned long long a)
__blcic_u64(unsigned long long __a)
{
return ~a & (a + 1);
return ~__a & (__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcmsk_u64(unsigned long long a)
__blcmsk_u64(unsigned long long __a)
{
return a ^ (a + 1);
return __a ^ (__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcs_u64(unsigned long long a)
__blcs_u64(unsigned long long __a)
{
return a | (a + 1);
return __a | (__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsfill_u64(unsigned long long a)
__blsfill_u64(unsigned long long __a)
{
return a | (a - 1);
return __a | (__a - 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsic_u64(unsigned long long a)
__blsic_u64(unsigned long long __a)
{
return ~a | (a - 1);
return ~__a | (__a - 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__t1mskc_u64(unsigned long long a)
__t1mskc_u64(unsigned long long __a)
{
return ~a | (a + 1);
return ~__a | (__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__tzmsk_u64(unsigned long long a)
__tzmsk_u64(unsigned long long __a)
{
return ~a & (a - 1);
return ~__a & (__a - 1);
}
#endif

c_headers/tgmath.h

@ -490,7 +490,7 @@ static double _Complex
static long double _Complex
_TG_ATTRS
__tg_pow(long double _Complex __x, long double _Complex __y)
{return cpowl(__x, __y);}
#undef pow

c_headers/tmmintrin.h

@ -20,203 +20,748 @@
*
*===-----------------------------------------------------------------------===
*/
#ifndef __TMMINTRIN_H
#define __TMMINTRIN_H
#ifndef __SSSE3__
#error "SSSE3 instruction set not enabled"
#else
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
/// \brief Computes the absolute value of each of the packed 8-bit signed
/// integers in the source operand and stores the 8-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PABSB instruction.
///
/// \param __a
/// A 64-bit vector of [8 x i8].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi8(__m64 __a)
{
return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
}
/// \brief Computes the absolute value of each of the packed 8-bit signed
/// integers in the source operand and stores the 8-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPABSB instruction.
///
/// \param __a
/// A 128-bit vector of [16 x i8].
/// \returns A 128-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi8(__m128i __a)
{
return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
}
/// \brief Computes the absolute value of each of the packed 16-bit signed
/// integers in the source operand and stores the 16-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PABSW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi16(__m64 __a)
{
return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
}
/// \brief Computes the absolute value of each of the packed 16-bit signed
/// integers in the source operand and stores the 16-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPABSW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16].
/// \returns A 128-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi16(__m128i __a)
{
return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
}
/// \brief Computes the absolute value of each of the packed 32-bit signed
/// integers in the source operand and stores the 32-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PABSD instruction.
///
/// \param __a
/// A 64-bit vector of [2 x i32].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi32(__m64 __a)
{
return (__m64)__builtin_ia32_pabsd((__v2si)__a);
}
/// \brief Computes the absolute value of each of the packed 32-bit signed
/// integers in the source operand and stores the 32-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPABSD instruction.
///
/// \param __a
/// A 128-bit vector of [4 x i32].
/// \returns A 128-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi32(__m128i __a)
{
return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
}
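/* Illustrative sketch, not part of the header: absolute value is applied
   independently to each lane. */
static inline __m128i abs_example(void)
{
  __m128i __v = _mm_setr_epi32(-5, 6, -7, 8);
  return _mm_abs_epi32(__v); /* {5, 6, 7, 8} */
}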
/// \brief Concatenates the two 128-bit integer vector operands, and
/// right-shifts the result by the number of bytes specified in the immediate
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
/// A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param b
/// A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param n
/// An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 128-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
__m128i __a = (a); \
__m128i __b = (b); \
(__m128i)__builtin_ia32_palignr128((__v16qi)__a, (__v16qi)__b, (n)); })
(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (n)); })
/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
/// A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param b
/// A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param n
/// An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 64-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
__m64 __a = (a); \
__m64 __b = (b); \
(__m64)__builtin_ia32_palignr((__v8qi)__a, (__v8qi)__b, (n)); })
(__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
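/* Illustrative sketch, not part of the header: with a shift of 4 the result
   is bytes 4..15 of the second operand followed by bytes 0..3 of the first,
   which makes _mm_alignr_epi8 handy for sliding windows over byte streams. */
static inline __m128i alignr_window_example(__m128i __hi, __m128i __lo)
{
  return _mm_alignr_epi8(__hi, __lo, 4);
}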
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHADDW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
/// both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadd_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
}
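/* Illustrative sketch, not part of the header: three horizontal adds reduce
   eight 16-bit lanes to a single (wrapping) sum in lane 0. */
static inline short sum8_epi16_example(__m128i __v)
{
  __v = _mm_hadd_epi16(__v, __v); /* 8 lanes -> 4 pair sums (duplicated) */
  __v = _mm_hadd_epi16(__v, __v); /* 4 -> 2 */
  __v = _mm_hadd_epi16(__v, __v); /* 2 -> 1 */
  return (short)_mm_extract_epi16(__v, 0);
}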
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [4 x i32].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHADDD instruction.
///
/// \param __a
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
/// both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadd_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
}
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [4 x i16].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHADDW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
/// operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
}
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [2 x i32].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHADDD instruction.
///
/// \param __a
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
/// operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi32(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
}
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHADDSW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
/// sums of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadds_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
}
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHADDSW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// sums of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadds_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 128-bit vectors of [8 x i16].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHSUBW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
/// of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsub_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 128-bit vectors of [4 x i32].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHSUBD instruction.
///
/// \param __a
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
/// of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsub_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [4 x i16].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHSUBW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
/// of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [2 x i32].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHSUBD instruction.
///
/// \param __a
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
/// of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi32(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 128-bit vectors of [8 x i16]. Positive differences greater than
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
/// saturated to 8000h.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHSUBSW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
/// differences of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsubs_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
/// saturated to 8000h.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHSUBSW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// differences of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsubs_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
}
/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
/// the corresponding bits in the destination. For example, bits [7:0] of
/// both operands are multiplied, bits [15:8] of both operands are
/// multiplied, and the sum of both results is written to bits [15:0] of the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
///
/// \param __a
/// A 128-bit integer vector containing the first source operand.
/// \param __b
/// A 128-bit integer vector containing the second source operand.
/// \returns A 128-bit integer vector containing the sums of products of both
/// operands: \n
/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maddubs_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
}
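/* Illustrative sketch, not part of the header: a typical use is a bytewise
   dot-product step, weighting unsigned samples by signed coefficients. */
static inline __m128i weighted_pairs_example(__m128i __pixels_u8,
                                             __m128i __coeffs_s8)
{
  /* Each 16-bit lane holds u[2i]*s[2i] + u[2i+1]*s[2i+1], with saturation. */
  return _mm_maddubs_epi16(__pixels_u8, __coeffs_s8);
}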
/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
/// the corresponding bits in the destination. For example, bits [7:0] of
/// both operands are multiplied, bits [15:8] of both operands are
/// multiplied, and the sum of both results is written to bits [15:0] of the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PMADDUBSW instruction.
///
/// \param __a
/// A 64-bit integer vector containing the first source operand.
/// \param __b
/// A 64-bit integer vector containing the second source operand.
/// \returns A 64-bit integer vector containing the sums of products of both
/// operands: \n
/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_maddubs_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
}
/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// products to the 18 most significant bits by right-shifting, rounds the
/// truncated value by adding 1, and writes bits [16:1] to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPMULHRSW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [8 x i16] containing one of the source operands.
/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
/// products of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhrs_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
}
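/* Illustrative sketch, not part of the header: this is a rounded Q15
   fixed-point multiply; each lane computes (a*b + 0x4000) >> 15. */
static inline __m128i q15_halve_example(__m128i __samples)
{
  const __m128i __half = _mm_set1_epi16(0x4000); /* 0.5 in Q15 */
  return _mm_mulhrs_epi16(__samples, __half);    /* samples / 2, rounded */
}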
/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// products to the 18 most significant bits by right-shifting, rounds the
/// truncated value by adding 1, and writes bits [16:1] to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PMULHRSW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16] containing one of the source operands.
/// \param __b
/// A 64-bit vector of [4 x i16] containing one of the source operands.
/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
/// products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_mulhrs_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
}
/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
/// destination or clears 8-bit values in the destination, as specified by
/// the second source operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPSHUFB instruction.
///
/// \param __a
/// A 128-bit integer vector containing the values to be copied.
/// \param __b
/// A 128-bit integer vector containing control bytes corresponding to
/// positions in the destination:
/// Bit 7: \n
/// 1: Clear the corresponding byte in the destination. \n
/// 0: Copy the selected source byte to the corresponding byte in the
/// destination. \n
/// Bits [6:4] Reserved. \n
/// Bits [3:0] select the source byte to be copied.
/// \returns A 128-bit integer vector containing the copied or cleared values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shuffle_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
}
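/* Illustrative sketch, not part of the header: reversing the byte order of a
   vector with a constant control mask. */
static inline __m128i byte_reverse_example(__m128i __v)
{
  const __m128i __rev = _mm_setr_epi8(15,14,13,12,11,10,9,8,
                                      7,6,5,4,3,2,1,0);
  return _mm_shuffle_epi8(__v, __rev);
}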
/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
/// destination or clears 8-bit values in the destination, as specified by
/// the second source operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSHUFB instruction.
///
/// \param __a
/// A 64-bit integer vector containing the values to be copied.
/// \param __b
/// A 64-bit integer vector containing control bytes corresponding to
/// positions in the destination:
/// Bit 7: \n
/// 1: Clear the corresponding byte in the destination. \n
/// 0: Copy the selected source byte to the corresponding byte in the
/// destination. \n
/// Bits [3:0] select the source byte to be copied.
/// \returns A 64-bit integer vector containing the copied or cleared values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_shuffle_pi8(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
}
/// \brief For each 8-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// byte in the second source is negative, calculate the two's complement of
/// the corresponding byte in the first source, and write that value to the
/// destination. If the byte in the second source is positive, copy the
/// corresponding byte from the first source to the destination. If the byte
/// in the second source is zero, clear the corresponding byte in the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPSIGNB instruction.
///
/// \param __a
/// A 128-bit integer vector containing the values to be copied.
/// \param __b
/// A 128-bit integer vector containing control bytes corresponding to
/// positions in the destination.
/// \returns A 128-bit integer vector containing the resultant values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
}
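/* Illustrative sketch, not part of the header: conditional negation. Lanes of
   the first operand are negated where the second is negative, zeroed where it
   is zero, and passed through where it is positive. */
static inline __m128i conditional_negate_example(__m128i __magnitudes,
                                                 __m128i __signs)
{
  return _mm_sign_epi8(__magnitudes, __signs);
}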
/// \brief For each 16-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// word in the second source is negative, calculate the two's complement of
/// the corresponding word in the first source, and write that value to the
/// destination. If the word in the second source is positive, copy the
/// corresponding word from the first source to the destination. If the word
/// in the second source is zero, clear the corresponding word in the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPSIGNW instruction.
///
/// \param __a
/// A 128-bit integer vector containing the values to be copied.
/// \param __b
/// A 128-bit integer vector containing control words corresponding to
/// positions in the destination.
/// \returns A 128-bit integer vector containing the resultant values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
}
/// \brief For each 32-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// doubleword in the second source is negative, calculate the two's
/// complement of the corresponding word in the first source, and write that
/// value to the destination. If the doubleword in the second source is
/// positive, copy the corresponding word from the first source to the
/// destination. If the doubleword in the second source is zero, clear the
/// corresponding word in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPSIGND instruction.
///
/// \param __a
/// A 128-bit integer vector containing the values to be copied.
/// \param __b
/// A 128-bit integer vector containing control doublewords corresponding to
/// positions in the destination.
/// \returns A 128-bit integer vector containing the resultant values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
}
/// \brief For each 8-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// byte in the second source is negative, calculate the two's complement of
/// the corresponding byte in the first source, and write that value to the
/// destination. If the byte in the second source is positive, copy the
/// corresponding byte from the first source to the destination. If the byte
/// in the second source is zero, clear the corresponding byte in the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSIGNB instruction.
///
/// \param __a
/// A 64-bit integer vector containing the values to be copied.
/// \param __b
/// A 64-bit integer vector containing control bytes corresponding to
/// positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi8(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
}
/// \brief For each 16-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// word in the second source is negative, calculate the two's complement of
/// the corresponding word in the first source, and write that value to the
/// destination. If the word in the second source is positive, copy the
/// corresponding word from the first source to the destination. If the word
/// in the second source is zero, clear the corresponding word in the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSIGNW instruction.
///
/// \param __a
/// A 64-bit integer vector containing the values to be copied.
/// \param __b
/// A 64-bit integer vector containing control words corresponding to
/// positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
}
/// \brief For each 32-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand: If the
/// doubleword in the second source is negative, calculate the two's
/// complement of the corresponding doubleword in the first source, and
/// write that value to the destination. If the doubleword in the second
/// source is positive, copy the corresponding doubleword from the first
/// source to the destination. If the doubleword in the second source is
/// zero, clear the corresponding doubleword in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSIGND instruction.
///
/// \param __a
/// A 64-bit integer vector containing the values to be copied.
/// \param __b
/// A 64-bit integer vector containing two control doublewords corresponding
/// to positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi32(__m64 __a, __m64 __b)
{
@ -225,6 +770,4 @@ _mm_sign_pi32(__m64 __a, __m64 __b)
#undef __DEFAULT_FN_ATTRS
#endif /* __SSSE3__ */
#endif /* __TMMINTRIN_H */

c_headers/unwind.h

@ -79,6 +79,10 @@ struct _Unwind_Context;
struct _Unwind_Exception;
typedef enum {
_URC_NO_REASON = 0,
#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
!defined(__ARM_DWARF_EH__)
_URC_OK = 0, /* used by ARM EHABI */
#endif
_URC_FOREIGN_EXCEPTION_CAUGHT = 1,
_URC_FATAL_PHASE2_ERROR = 2,
@ -88,7 +92,11 @@ typedef enum {
_URC_END_OF_STACK = 5,
_URC_HANDLER_FOUND = 6,
_URC_INSTALL_CONTEXT = 7,
_URC_CONTINUE_UNWIND = 8
_URC_CONTINUE_UNWIND = 8,
#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
!defined(__ARM_DWARF_EH__)
_URC_FAILURE = 9 /* used by ARM EHABI */
#endif
} _Unwind_Reason_Code;
typedef enum {
@ -150,6 +158,15 @@ typedef enum {
_UVRSR_FAILED = 2
} _Unwind_VRS_Result;
#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__ARM_DWARF_EH__)
typedef uint32_t _Unwind_State;
#define _US_VIRTUAL_UNWIND_FRAME ((_Unwind_State)0)
#define _US_UNWIND_FRAME_STARTING ((_Unwind_State)1)
#define _US_UNWIND_FRAME_RESUME ((_Unwind_State)2)
#define _US_ACTION_MASK ((_Unwind_State)3)
#define _US_FORCE_UNWIND ((_Unwind_State)8)
#endif
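/* Illustrative sketch, not part of the header: on an ARM EHABI target a
   personality routine typically dispatches on the phase encoded in the state
   word. */
static inline int unwind_is_phase1_example(_Unwind_State __state)
{
  /* Phase 1 (handler search) is requested as _US_VIRTUAL_UNWIND_FRAME. */
  return (__state & _US_ACTION_MASK) == _US_VIRTUAL_UNWIND_FRAME;
}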
_Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
_Unwind_VRS_RegClass __regclass,
uint32_t __regno,

c_headers/wmmintrin.h

@ -26,17 +26,8 @@
#include <emmintrin.h>
#if !defined (__AES__) && !defined (__PCLMUL__)
# error "AES/PCLMUL instructions not enabled"
#else
#ifdef __AES__
#include <__wmmintrin_aes.h>
#endif /* __AES__ */
#ifdef __PCLMUL__
#include <__wmmintrin_pclmul.h>
#endif /* __PCLMUL__ */
#endif /* __AES__ || __PCLMUL__ */
#endif /* _WMMINTRIN_H */

c_headers/x86intrin.h

@ -28,54 +28,58 @@
#include <immintrin.h>
#ifdef __3dNOW__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__3dNOW__)
#include <mm3dnow.h>
#endif
#ifdef __BMI__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
#include <bmiintrin.h>
#endif
#ifdef __BMI2__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
#include <bmi2intrin.h>
#endif
#ifdef __LZCNT__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
#include <lzcntintrin.h>
#endif
#ifdef __POPCNT__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
#include <popcntintrin.h>
#endif
#ifdef __RDSEED__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
#include <rdseedintrin.h>
#endif
#ifdef __PRFCHW__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PRFCHW__)
#include <prfchwintrin.h>
#endif
#ifdef __SSE4A__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE4A__)
#include <ammintrin.h>
#endif
#ifdef __FMA4__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA4__)
#include <fma4intrin.h>
#endif
#ifdef __XOP__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XOP__)
#include <xopintrin.h>
#endif
#ifdef __TBM__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__TBM__)
#include <tbmintrin.h>
#endif
#ifdef __F16C__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
#include <f16cintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MWAITX__)
#include <mwaitxintrin.h>
#endif
/* FIXME: LWP */
#endif /* __X86INTRIN_H */

File diff suppressed because it is too large

c_headers/xopintrin.h

@ -28,14 +28,10 @@
#ifndef __XOPINTRIN_H
#define __XOPINTRIN_H
#ifndef __XOP__
# error "XOP instruction set is not enabled"
#else
#include <fma4intrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop")))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
@ -202,13 +198,13 @@ _mm_hsubq_epi32(__m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C);
return (__m128i)__builtin_ia32_vpcmov((__v2di)__A, (__v2di)__B, (__v2di)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C);
return (__m256i)__builtin_ia32_vpcmov_256((__v4di)__A, (__v4di)__B, (__v4di)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@ -242,20 +238,16 @@ _mm_rot_epi64(__m128i __A, __m128i __B)
}
#define _mm_roti_epi8(A, N) __extension__ ({ \
__m128i __A = (A); \
(__m128i)__builtin_ia32_vprotbi((__v16qi)__A, (N)); })
(__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); })
#define _mm_roti_epi16(A, N) __extension__ ({ \
__m128i __A = (A); \
(__m128i)__builtin_ia32_vprotwi((__v8hi)__A, (N)); })
(__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); })
#define _mm_roti_epi32(A, N) __extension__ ({ \
__m128i __A = (A); \
(__m128i)__builtin_ia32_vprotdi((__v4si)__A, (N)); })
(__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); })
#define _mm_roti_epi64(A, N) __extension__ ({ \
__m128i __A = (A); \
(__m128i)__builtin_ia32_vprotqi((__v2di)__A, (N)); })
(__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); })
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shl_epi8(__m128i __A, __m128i __B)
@ -306,44 +298,36 @@ _mm_sha_epi64(__m128i __A, __m128i __B)
}
#define _mm_com_epu8(A, B, N) __extension__ ({ \
__m128i __A = (A); \
__m128i __B = (B); \
(__m128i)__builtin_ia32_vpcomub((__v16qi)__A, (__v16qi)__B, (N)); })
(__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N)); })
#define _mm_com_epu16(A, B, N) __extension__ ({ \
__m128i __A = (A); \
__m128i __B = (B); \
(__m128i)__builtin_ia32_vpcomuw((__v8hi)__A, (__v8hi)__B, (N)); })
(__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N)); })
#define _mm_com_epu32(A, B, N) __extension__ ({ \
__m128i __A = (A); \
__m128i __B = (B); \
(__m128i)__builtin_ia32_vpcomud((__v4si)__A, (__v4si)__B, (N)); })
(__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N)); })
#define _mm_com_epu64(A, B, N) __extension__ ({ \
__m128i __A = (A); \
__m128i __B = (B); \
(__m128i)__builtin_ia32_vpcomuq((__v2di)__A, (__v2di)__B, (N)); })
(__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N)); })
#define _mm_com_epi8(A, B, N) __extension__ ({ \
__m128i __A = (A); \
__m128i __B = (B); \
(__m128i)__builtin_ia32_vpcomb((__v16qi)__A, (__v16qi)__B, (N)); })
(__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N)); })
#define _mm_com_epi16(A, B, N) __extension__ ({ \
__m128i __A = (A); \
__m128i __B = (B); \
(__m128i)__builtin_ia32_vpcomw((__v8hi)__A, (__v8hi)__B, (N)); })
(__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N)); })
#define _mm_com_epi32(A, B, N) __extension__ ({ \
__m128i __A = (A); \
__m128i __B = (B); \
(__m128i)__builtin_ia32_vpcomd((__v4si)__A, (__v4si)__B, (N)); })
(__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N)); })
#define _mm_com_epi64(A, B, N) __extension__ ({ \
__m128i __A = (A); \
__m128i __B = (B); \
(__m128i)__builtin_ia32_vpcomq((__v2di)__A, (__v2di)__B, (N)); })
(__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N)); })
#define _MM_PCOMCTRL_LT 0
#define _MM_PCOMCTRL_LE 1
@ -739,32 +723,23 @@ _mm_comtrue_epi64(__m128i __A, __m128i __B)
}
#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
__m128d __X = (X); \
__m128d __Y = (Y); \
__m128i __C = (C); \
(__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, \
(__v2di)__C, (I)); })
(__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), \
(__v2di)(__m128i)(C), (I)); })
#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
__m256d __X = (X); \
__m256d __Y = (Y); \
__m256i __C = (C); \
(__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, \
(__v4di)__C, (I)); })
(__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
(__v4df)(__m256d)(Y), \
(__v4di)(__m256i)(C), (I)); })
#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
__m128 __X = (X); \
__m128 __Y = (Y); \
__m128i __C = (C); \
(__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, \
(__v4si)__C, (I)); })
(__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
(__v4si)(__m128i)(C), (I)); })
#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
__m256 __X = (X); \
__m256 __Y = (Y); \
__m256i __C = (C); \
(__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, \
(__v8si)__C, (I)); })
(__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
(__v8sf)(__m256)(Y), \
(__v8si)(__m256i)(C), (I)); })
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_frcz_ss(__m128 __A)
@ -804,6 +779,4 @@ _mm256_frcz_pd(__m256d __A)
#undef __DEFAULT_FN_ATTRS
#endif /* __XOP__ */
#endif /* __XOPINTRIN_H */

c_headers/xsavecintrin.h (new file)

@ -0,0 +1,48 @@
/*===---- xsavecintrin.h - XSAVEC intrinsic ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __XSAVECINTRIN_H
#define __XSAVECINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsavec")))
static __inline__ void __DEFAULT_FN_ATTRS
_xsavec(void *__p, unsigned long long __m) {
__builtin_ia32_xsavec(__p, __m);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
_xsavec64(void *__p, unsigned long long __m) {
__builtin_ia32_xsavec64(__p, __m);
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif
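A hedged usage sketch for the _xsavec intrinsic defined above (not part of the diff); it assumes -mxsavec, an OS that has enabled the requested state components, and an illustrative 4096-byte buffer; real code should size the save area from CPUID leaf 0Dh.
#include <immintrin.h>
#include <stdalign.h>
/* XSAVE areas must be 64-byte aligned; 4096 bytes is an assumed size. */
static alignas(64) unsigned char compact_area[4096];
static void save_x87_sse_compacted(void) {
    _xsavec(compact_area, 0x3ULL); /* mask bit 0 = x87 state, bit 1 = SSE state */
}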

58 c_headers/xsaveintrin.h Normal file
@ -0,0 +1,58 @@
/*===---- xsaveintrin.h - XSAVE intrinsic ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __XSAVEINTRIN_H
#define __XSAVEINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsave")))
static __inline__ void __DEFAULT_FN_ATTRS
_xsave(void *__p, unsigned long long __m) {
return __builtin_ia32_xsave(__p, __m);
}
static __inline__ void __DEFAULT_FN_ATTRS
_xrstor(void *__p, unsigned long long __m) {
return __builtin_ia32_xrstor(__p, __m);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
_xsave64(void *__p, unsigned long long __m) {
return __builtin_ia32_xsave64(__p, __m);
}
static __inline__ void __DEFAULT_FN_ATTRS
_xrstor64(void *__p, unsigned long long __m) {
return __builtin_ia32_xrstor64(__p, __m);
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif
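Similarly, a hypothetical save/restore round trip using _xsave and _xrstor from the header above; assumes -mxsave and that the mask names only state components the OS has enabled.
#include <immintrin.h>
#include <stdalign.h>
static alignas(64) unsigned char ctx_area[4096]; /* assumed size; query CPUID leaf 0Dh in real code */
static void checkpoint(void) { _xsave(ctx_area, 0x3ULL); }  /* save x87 + SSE state */
static void rollback(void)   { _xrstor(ctx_area, 0x3ULL); } /* restore the same components */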

48 c_headers/xsaveoptintrin.h Normal file
@ -0,0 +1,48 @@
/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __XSAVEOPTINTRIN_H
#define __XSAVEOPTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaveopt")))
static __inline__ void __DEFAULT_FN_ATTRS
_xsaveopt(void *__p, unsigned long long __m) {
return __builtin_ia32_xsaveopt(__p, __m);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
_xsaveopt64(void *__p, unsigned long long __m) {
return __builtin_ia32_xsaveopt64(__p, __m);
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif
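_xsaveopt behaves like _xsave but may skip components that are in their initial configuration or unmodified since the last restore; a sketch under the same assumptions as above (-mxsaveopt, OS-enabled state, illustrative buffer size):
#include <immintrin.h>
#include <stdalign.h>
static alignas(64) unsigned char opt_area[4096]; /* assumed size */
static void fast_checkpoint(void) {
    _xsaveopt(opt_area, 0x3ULL); /* x87 + SSE; untouched components may be skipped */
}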

58 c_headers/xsavesintrin.h Normal file
@ -0,0 +1,58 @@
/*===---- xsavesintrin.h - XSAVES intrinsic ------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __XSAVESINTRIN_H
#define __XSAVESINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaves")))
static __inline__ void __DEFAULT_FN_ATTRS
_xsaves(void *__p, unsigned long long __m) {
__builtin_ia32_xsaves(__p, __m);
}
static __inline__ void __DEFAULT_FN_ATTRS
_xrstors(void *__p, unsigned long long __m) {
__builtin_ia32_xrstors(__p, __m);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
_xrstors64(void *__p, unsigned long long __m) {
__builtin_ia32_xrstors64(__p, __m);
}
static __inline__ void __DEFAULT_FN_ATTRS
_xsaves64(void *__p, unsigned long long __m) {
__builtin_ia32_xsaves64(__p, __m);
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif
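Note that XSAVES/XRSTORS are supervisor-mode instructions, so the wrappers below are only meaningful inside ring-0 (kernel) code; they are shown purely to illustrate the API surface this header adds and assume -mxsaves.
#include <immintrin.h>
static void kernel_save_state(void *area, unsigned long long mask) {
    _xsaves(area, mask);  /* save the selected supervisor + user state components */
}
static void kernel_restore_state(void *area, unsigned long long mask) {
    _xrstors(area, mask); /* restore the same components */
}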