parent
c0f9012bed
commit
91afdc58d2
|
@ -2,6 +2,7 @@ zig-cache/
|
|||
build/
|
||||
build-release/
|
||||
build-windows/
|
||||
build-llvm-5/
|
||||
/.cproject
|
||||
/.project
|
||||
/.settings/
|
||||
|
|
164
CMakeLists.txt
164
CMakeLists.txt
|
@ -64,76 +64,6 @@ set(ZIG_SOURCES
|
|||
"${CMAKE_SOURCE_DIR}/src/zig_llvm.cpp"
|
||||
)
|
||||
|
||||
set(C_HEADERS
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/Intrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/__stddef_max_align_t.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/__wmmintrin_aes.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/__wmmintrin_pclmul.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/adxintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/ammintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/arm_acle.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/avx2intrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/avx512bwintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/avx512cdintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/avx512dqintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/avx512erintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/avx512fintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/avx512vlbwintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/avx512vldqintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/avx512vlintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/avxintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/bmi2intrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/bmiintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/cpuid.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/cuda_builtin_vars.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/emmintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/f16cintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/float.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/fma4intrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/fmaintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/fxsrintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/htmintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/htmxlintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/ia32intrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/immintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/inttypes.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/iso646.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/limits.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/lzcntintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/mm3dnow.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/mm_malloc.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/mmintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/nmmintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/pmmintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/popcntintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/prfchwintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/rdseedintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/rtmintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/s390intrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/shaintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/smmintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/stdalign.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/stdarg.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/stdatomic.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/stdbool.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/stddef.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/stdint.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/stdnoreturn.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/tbmintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/tgmath.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/tmmintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/unwind.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/vadefs.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/varargs.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/vecintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/wmmintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/x86intrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/xmmintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/xopintrin.h"
|
||||
"${CMAKE_SOURCE_DIR}/c_headers/xtestintrin.h"
|
||||
)
|
||||
|
||||
|
||||
set(ZIG_HOST_LINK_VERSION)
|
||||
if (APPLE)
|
||||
set(LD_V_OUTPUT)
|
||||
|
@ -198,7 +128,99 @@ if(MINGW)
|
|||
endif()
|
||||
install(TARGETS zig DESTINATION bin)
|
||||
|
||||
install(FILES ${C_HEADERS} DESTINATION ${C_HEADERS_DEST})
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_builtin_vars.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_cmath.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_complex_builtins.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_intrinsics.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_math_forward_declares.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__clang_cuda_runtime_wrapper.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__stddef_max_align_t.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__wmmintrin_aes.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/__wmmintrin_pclmul.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/adxintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/altivec.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/ammintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/arm_acle.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/arm_neon.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/armintr.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx2intrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512bwintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512cdintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512dqintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512erintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512fintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512ifmaintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512ifmavlintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512pfintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vbmiintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vbmivlintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlbwintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlcdintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vldqintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avxintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/bmi2intrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/bmiintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/clflushoptintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cpuid.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/algorithm" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/complex" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/new" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/emmintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/f16cintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/float.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/fma4intrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/fmaintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/fxsrintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/htmintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/htmxlintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/ia32intrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/immintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/intrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/inttypes.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/iso646.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/limits.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/lzcntintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mm3dnow.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mm_malloc.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mmintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/module.modulemap" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/msa.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mwaitxintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/nmmintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/opencl-c.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/pkuintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/pmmintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/popcntintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/prfchwintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/rdseedintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/rtmintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/s390intrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/shaintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/smmintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdalign.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdarg.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdatomic.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdbool.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stddef.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdint.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/stdnoreturn.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/tbmintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/tgmath.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/tmmintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/unwind.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/vadefs.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/varargs.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/vecintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/wmmintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/x86intrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xmmintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xopintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xsavecintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xsaveintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xsaveoptintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xsavesintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/c_headers/xtestintrin.h" DESTINATION "${C_HEADERS_DEST}")
|
||||
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/std/array_list.zig" DESTINATION "${ZIG_STD_DEST}")
|
||||
install(FILES "${CMAKE_SOURCE_DIR}/std/base64.zig" DESTINATION "${ZIG_STD_DEST}")
|
||||
|
|
|
@ -24,16 +24,20 @@
|
|||
#ifndef __CUDA_BUILTIN_VARS_H
|
||||
#define __CUDA_BUILTIN_VARS_H
|
||||
|
||||
// Forward declares from vector_types.h.
|
||||
struct uint3;
|
||||
struct dim3;
|
||||
|
||||
// The file implements built-in CUDA variables using __declspec(property).
|
||||
// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
|
||||
// All read accesses of built-in variable fields get converted into calls to a
|
||||
// getter function which in turn would call appropriate builtin to fetch the
|
||||
// getter function which in turn calls the appropriate builtin to fetch the
|
||||
// value.
|
||||
//
|
||||
// Example:
|
||||
// int x = threadIdx.x;
|
||||
// IR output:
|
||||
// %0 = call i32 @llvm.ptx.read.tid.x() #3
|
||||
// %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
|
||||
// PTX output:
|
||||
// mov.u32 %r2, %tid.x;
|
||||
|
||||
|
@ -60,33 +64,45 @@
|
|||
__attribute__((device)) TypeName *operator&() const __DELETE
|
||||
|
||||
struct __cuda_builtin_threadIdx_t {
|
||||
__CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_tid_x());
|
||||
__CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_tid_y());
|
||||
__CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_tid_z());
|
||||
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_tid_x());
|
||||
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_tid_y());
|
||||
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
|
||||
// threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
|
||||
// uint3). This function is defined after we pull in vector_types.h.
|
||||
__attribute__((device)) operator uint3() const;
|
||||
private:
|
||||
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
|
||||
};
|
||||
|
||||
struct __cuda_builtin_blockIdx_t {
|
||||
__CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ctaid_x());
|
||||
__CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ctaid_y());
|
||||
__CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ctaid_z());
|
||||
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ctaid_x());
|
||||
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ctaid_y());
|
||||
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
|
||||
// blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
|
||||
// uint3). This function is defined after we pull in vector_types.h.
|
||||
__attribute__((device)) operator uint3() const;
|
||||
private:
|
||||
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
|
||||
};
|
||||
|
||||
struct __cuda_builtin_blockDim_t {
|
||||
__CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_ntid_x());
|
||||
__CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_ntid_y());
|
||||
__CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_ntid_z());
|
||||
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ntid_x());
|
||||
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ntid_y());
|
||||
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ntid_z());
|
||||
// blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
|
||||
// dim3). This function is defined after we pull in vector_types.h.
|
||||
__attribute__((device)) operator dim3() const;
|
||||
private:
|
||||
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
|
||||
};
|
||||
|
||||
struct __cuda_builtin_gridDim_t {
|
||||
__CUDA_DEVICE_BUILTIN(x,__builtin_ptx_read_nctaid_x());
|
||||
__CUDA_DEVICE_BUILTIN(y,__builtin_ptx_read_nctaid_y());
|
||||
__CUDA_DEVICE_BUILTIN(z,__builtin_ptx_read_nctaid_z());
|
||||
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_nctaid_x());
|
||||
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_nctaid_y());
|
||||
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_nctaid_z());
|
||||
// gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
|
||||
// dim3). This function is defined after we pull in vector_types.h.
|
||||
__attribute__((device)) operator dim3() const;
|
||||
private:
|
||||
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
|
||||
};
|
|
@ -0,0 +1,487 @@
|
|||
/*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __CLANG_CUDA_CMATH_H__
|
||||
#define __CLANG_CUDA_CMATH_H__
|
||||
#ifndef __CUDA__
|
||||
#error "This file is for CUDA compilation only."
|
||||
#endif
|
||||
|
||||
#include <limits>
|
||||
|
||||
// CUDA lets us use various std math functions on the device side. This file
|
||||
// works in concert with __clang_cuda_math_forward_declares.h to make this work.
|
||||
//
|
||||
// Specifically, the forward-declares header declares __device__ overloads for
|
||||
// these functions in the global namespace, then pulls them into namespace std
|
||||
// with 'using' statements. Then this file implements those functions, after
|
||||
// their implementations have been pulled in.
|
||||
//
|
||||
// It's important that we declare the functions in the global namespace and pull
|
||||
// them into namespace std with using statements, as opposed to simply declaring
|
||||
// these functions in namespace std, because our device functions need to
|
||||
// overload the standard library functions, which may be declared in the global
|
||||
// namespace or in std, depending on the degree of conformance of the stdlib
|
||||
// implementation. Declaring in the global namespace and pulling into namespace
|
||||
// std covers all of the known knowns.
|
||||
|
||||
#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
|
||||
|
||||
__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
|
||||
__DEVICE__ long abs(long __n) { return ::labs(__n); }
|
||||
__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
|
||||
__DEVICE__ double abs(double __x) { return ::fabs(__x); }
|
||||
__DEVICE__ float acos(float __x) { return ::acosf(__x); }
|
||||
__DEVICE__ float asin(float __x) { return ::asinf(__x); }
|
||||
__DEVICE__ float atan(float __x) { return ::atanf(__x); }
|
||||
__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
|
||||
__DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
|
||||
__DEVICE__ float cos(float __x) { return ::cosf(__x); }
|
||||
__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
|
||||
__DEVICE__ float exp(float __x) { return ::expf(__x); }
|
||||
__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
|
||||
__DEVICE__ float floor(float __x) { return ::floorf(__x); }
|
||||
__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
|
||||
__DEVICE__ int fpclassify(float __x) {
|
||||
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
|
||||
FP_ZERO, __x);
|
||||
}
|
||||
__DEVICE__ int fpclassify(double __x) {
|
||||
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
|
||||
FP_ZERO, __x);
|
||||
}
|
||||
__DEVICE__ float frexp(float __arg, int *__exp) {
|
||||
return ::frexpf(__arg, __exp);
|
||||
}
|
||||
|
||||
// For inscrutable reasons, the CUDA headers define these functions for us on
|
||||
// Windows.
|
||||
#ifndef _MSC_VER
|
||||
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
|
||||
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
|
||||
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
|
||||
// For inscrutable reasons, __finite(), the double-precision version of
|
||||
// __finitef, does not exist when compiling for MacOS. __isfinited is available
|
||||
// everywhere and is just as good.
|
||||
__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
|
||||
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
|
||||
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
|
||||
#endif
|
||||
|
||||
__DEVICE__ bool isgreater(float __x, float __y) {
|
||||
return __builtin_isgreater(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool isgreater(double __x, double __y) {
|
||||
return __builtin_isgreater(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool isgreaterequal(float __x, float __y) {
|
||||
return __builtin_isgreaterequal(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool isgreaterequal(double __x, double __y) {
|
||||
return __builtin_isgreaterequal(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool isless(float __x, float __y) {
|
||||
return __builtin_isless(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool isless(double __x, double __y) {
|
||||
return __builtin_isless(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool islessequal(float __x, float __y) {
|
||||
return __builtin_islessequal(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool islessequal(double __x, double __y) {
|
||||
return __builtin_islessequal(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool islessgreater(float __x, float __y) {
|
||||
return __builtin_islessgreater(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool islessgreater(double __x, double __y) {
|
||||
return __builtin_islessgreater(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
|
||||
__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
|
||||
__DEVICE__ bool isunordered(float __x, float __y) {
|
||||
return __builtin_isunordered(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool isunordered(double __x, double __y) {
|
||||
return __builtin_isunordered(__x, __y);
|
||||
}
|
||||
__DEVICE__ float ldexp(float __arg, int __exp) {
|
||||
return ::ldexpf(__arg, __exp);
|
||||
}
|
||||
__DEVICE__ float log(float __x) { return ::logf(__x); }
|
||||
__DEVICE__ float log10(float __x) { return ::log10f(__x); }
|
||||
__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
|
||||
__DEVICE__ float nexttoward(float __from, double __to) {
|
||||
return __builtin_nexttowardf(__from, __to);
|
||||
}
|
||||
__DEVICE__ double nexttoward(double __from, double __to) {
|
||||
return __builtin_nexttoward(__from, __to);
|
||||
}
|
||||
__DEVICE__ float nexttowardf(float __from, double __to) {
|
||||
return __builtin_nexttowardf(__from, __to);
|
||||
}
|
||||
__DEVICE__ float pow(float __base, float __exp) {
|
||||
return ::powf(__base, __exp);
|
||||
}
|
||||
__DEVICE__ float pow(float __base, int __iexp) {
|
||||
return ::powif(__base, __iexp);
|
||||
}
|
||||
__DEVICE__ double pow(double __base, int __iexp) {
|
||||
return ::powi(__base, __iexp);
|
||||
}
|
||||
__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
|
||||
__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
|
||||
__DEVICE__ float sin(float __x) { return ::sinf(__x); }
|
||||
__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
|
||||
__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
|
||||
__DEVICE__ float tan(float __x) { return ::tanf(__x); }
|
||||
__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
|
||||
|
||||
// Now we've defined everything we promised we'd define in
|
||||
// __clang_cuda_math_forward_declares.h. We need to do two additional things to
|
||||
// fix up our math functions.
|
||||
//
|
||||
// 1) Define __device__ overloads for e.g. sin(int). The CUDA headers define
|
||||
// only sin(float) and sin(double), which means that e.g. sin(0) is
|
||||
// ambiguous.
|
||||
//
|
||||
// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
|
||||
// std. These are defined in the CUDA headers in the global namespace,
|
||||
// independent of everything else we've done here.
|
||||
|
||||
// We can't use std::enable_if, because we want to be pre-C++11 compatible. But
|
||||
// we go ahead and unconditionally define functions that are only available when
|
||||
// compiling for C++11 to match the behavior of the CUDA headers.
|
||||
template<bool __B, class __T = void>
|
||||
struct __clang_cuda_enable_if {};
|
||||
|
||||
template <class __T> struct __clang_cuda_enable_if<true, __T> {
|
||||
typedef __T type;
|
||||
};
|
||||
|
||||
// Defines an overload of __fn that accepts one integral argument, calls
|
||||
// __fn((double)x), and returns __retty.
|
||||
#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn) \
|
||||
template <typename __T> \
|
||||
__DEVICE__ \
|
||||
typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, \
|
||||
__retty>::type \
|
||||
__fn(__T __x) { \
|
||||
return ::__fn((double)__x); \
|
||||
}
|
||||
|
||||
// Defines an overload of __fn that accepts one two arithmetic arguments, calls
|
||||
// __fn((double)x, (double)y), and returns a double.
|
||||
//
|
||||
// Note this is different from OVERLOAD_1, which generates an overload that
|
||||
// accepts only *integral* arguments.
|
||||
#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn) \
|
||||
template <typename __T1, typename __T2> \
|
||||
__DEVICE__ typename __clang_cuda_enable_if< \
|
||||
std::numeric_limits<__T1>::is_specialized && \
|
||||
std::numeric_limits<__T2>::is_specialized, \
|
||||
__retty>::type \
|
||||
__fn(__T1 __x, __T2 __y) { \
|
||||
return __fn((double)__x, (double)__y); \
|
||||
}
|
||||
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
|
||||
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc);
|
||||
|
||||
#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
|
||||
#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
|
||||
|
||||
// Overloads for functions that don't match the patterns expected by
|
||||
// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
|
||||
template <typename __T1, typename __T2, typename __T3>
|
||||
__DEVICE__ typename __clang_cuda_enable_if<
|
||||
std::numeric_limits<__T1>::is_specialized &&
|
||||
std::numeric_limits<__T2>::is_specialized &&
|
||||
std::numeric_limits<__T3>::is_specialized,
|
||||
double>::type
|
||||
fma(__T1 __x, __T2 __y, __T3 __z) {
|
||||
return std::fma((double)__x, (double)__y, (double)__z);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
|
||||
double>::type
|
||||
frexp(__T __x, int *__exp) {
|
||||
return std::frexp((double)__x, __exp);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
|
||||
double>::type
|
||||
ldexp(__T __x, int __exp) {
|
||||
return std::ldexp((double)__x, __exp);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
|
||||
double>::type
|
||||
nexttoward(__T __from, double __to) {
|
||||
return std::nexttoward((double)__from, __to);
|
||||
}
|
||||
|
||||
template <typename __T1, typename __T2>
|
||||
__DEVICE__ typename __clang_cuda_enable_if<
|
||||
std::numeric_limits<__T1>::is_specialized &&
|
||||
std::numeric_limits<__T2>::is_specialized,
|
||||
double>::type
|
||||
remquo(__T1 __x, __T2 __y, int *__quo) {
|
||||
return std::remquo((double)__x, (double)__y, __quo);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
|
||||
double>::type
|
||||
scalbln(__T __x, long __exp) {
|
||||
return std::scalbln((double)__x, __exp);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
|
||||
double>::type
|
||||
scalbn(__T __x, int __exp) {
|
||||
return std::scalbn((double)__x, __exp);
|
||||
}
|
||||
|
||||
// We need to define these overloads in exactly the namespace our standard
|
||||
// library uses (including the right inline namespace), otherwise they won't be
|
||||
// picked up by other functions in the standard library (e.g. functions in
|
||||
// <complex>). Thus the ugliness below.
|
||||
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
|
||||
_LIBCPP_BEGIN_NAMESPACE_STD
|
||||
#else
|
||||
namespace std {
|
||||
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Pull the new overloads we defined above into namespace std.
|
||||
using ::acos;
|
||||
using ::acosh;
|
||||
using ::asin;
|
||||
using ::asinh;
|
||||
using ::atan;
|
||||
using ::atan2;
|
||||
using ::atanh;
|
||||
using ::cbrt;
|
||||
using ::ceil;
|
||||
using ::copysign;
|
||||
using ::cos;
|
||||
using ::cosh;
|
||||
using ::erf;
|
||||
using ::erfc;
|
||||
using ::exp;
|
||||
using ::exp2;
|
||||
using ::expm1;
|
||||
using ::fabs;
|
||||
using ::fdim;
|
||||
using ::floor;
|
||||
using ::fma;
|
||||
using ::fmax;
|
||||
using ::fmin;
|
||||
using ::fmod;
|
||||
using ::fpclassify;
|
||||
using ::frexp;
|
||||
using ::hypot;
|
||||
using ::ilogb;
|
||||
using ::isfinite;
|
||||
using ::isgreater;
|
||||
using ::isgreaterequal;
|
||||
using ::isless;
|
||||
using ::islessequal;
|
||||
using ::islessgreater;
|
||||
using ::isnormal;
|
||||
using ::isunordered;
|
||||
using ::ldexp;
|
||||
using ::lgamma;
|
||||
using ::llrint;
|
||||
using ::llround;
|
||||
using ::log;
|
||||
using ::log10;
|
||||
using ::log1p;
|
||||
using ::log2;
|
||||
using ::logb;
|
||||
using ::lrint;
|
||||
using ::lround;
|
||||
using ::nearbyint;
|
||||
using ::nextafter;
|
||||
using ::nexttoward;
|
||||
using ::pow;
|
||||
using ::remainder;
|
||||
using ::remquo;
|
||||
using ::rint;
|
||||
using ::round;
|
||||
using ::scalbln;
|
||||
using ::scalbn;
|
||||
using ::signbit;
|
||||
using ::sin;
|
||||
using ::sinh;
|
||||
using ::sqrt;
|
||||
using ::tan;
|
||||
using ::tanh;
|
||||
using ::tgamma;
|
||||
using ::trunc;
|
||||
|
||||
// Well this is fun: We need to pull these symbols in for libc++, but we can't
|
||||
// pull them in with libstdc++, because its ::isinf and ::isnan are different
|
||||
// than its std::isinf and std::isnan.
|
||||
#ifndef __GLIBCXX__
|
||||
using ::isinf;
|
||||
using ::isnan;
|
||||
#endif
|
||||
|
||||
// Finally, pull the "foobarf" functions that CUDA defines in its headers into
|
||||
// namespace std.
|
||||
using ::acosf;
|
||||
using ::acoshf;
|
||||
using ::asinf;
|
||||
using ::asinhf;
|
||||
using ::atan2f;
|
||||
using ::atanf;
|
||||
using ::atanhf;
|
||||
using ::cbrtf;
|
||||
using ::ceilf;
|
||||
using ::copysignf;
|
||||
using ::cosf;
|
||||
using ::coshf;
|
||||
using ::erfcf;
|
||||
using ::erff;
|
||||
using ::exp2f;
|
||||
using ::expf;
|
||||
using ::expm1f;
|
||||
using ::fabsf;
|
||||
using ::fdimf;
|
||||
using ::floorf;
|
||||
using ::fmaf;
|
||||
using ::fmaxf;
|
||||
using ::fminf;
|
||||
using ::fmodf;
|
||||
using ::frexpf;
|
||||
using ::hypotf;
|
||||
using ::ilogbf;
|
||||
using ::ldexpf;
|
||||
using ::lgammaf;
|
||||
using ::llrintf;
|
||||
using ::llroundf;
|
||||
using ::log10f;
|
||||
using ::log1pf;
|
||||
using ::log2f;
|
||||
using ::logbf;
|
||||
using ::logf;
|
||||
using ::lrintf;
|
||||
using ::lroundf;
|
||||
using ::modff;
|
||||
using ::nearbyintf;
|
||||
using ::nextafterf;
|
||||
using ::nexttowardf;
|
||||
using ::nexttowardf;
|
||||
using ::powf;
|
||||
using ::remainderf;
|
||||
using ::remquof;
|
||||
using ::rintf;
|
||||
using ::roundf;
|
||||
using ::scalblnf;
|
||||
using ::scalbnf;
|
||||
using ::sinf;
|
||||
using ::sinhf;
|
||||
using ::sqrtf;
|
||||
using ::tanf;
|
||||
using ::tanhf;
|
||||
using ::tgammaf;
|
||||
using ::truncf;
|
||||
|
||||
#ifdef _LIBCPP_END_NAMESPACE_STD
|
||||
_LIBCPP_END_NAMESPACE_STD
|
||||
#else
|
||||
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_GLIBCXX_END_NAMESPACE_VERSION
|
||||
#endif
|
||||
} // namespace std
|
||||
#endif
|
||||
|
||||
#undef __DEVICE__
|
||||
|
||||
#endif
|
|
@ -0,0 +1,203 @@
|
|||
/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
|
||||
#define __CLANG_CUDA_COMPLEX_BUILTINS
|
||||
|
||||
// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. These are
|
||||
// libgcc functions that clang assumes are available when compiling c99 complex
|
||||
// operations. (These implementations come from libc++, and have been modified
|
||||
// to work with CUDA.)
|
||||
|
||||
// libgcc-compatible complex multiply: (__a + __b*i) * (__c + __d*i), with the
// C99 Annex G NaN/infinity recovery rules.  (Adapted from libc++ for CUDA.)
extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
                                                      double __c, double __d) {
  double __ac = __a * __c;
  double __bd = __b * __d;
  double __ad = __a * __d;
  double __bc = __b * __c;
  double _Complex __z;
  __real__(__z) = __ac - __bd;
  __imag__(__z) = __ad + __bc;
  // If both parts came out NaN, inspect the operands: when an infinity is
  // involved the result must still be an infinity, so zero out NaNs, squash
  // infinities to +/-1, and redo the arithmetic.
  if (std::isnan(__real__(__z)) && std::isnan(__imag__(__z))) {
    bool __recalc = false;
    if (std::isinf(__a) || std::isinf(__b)) {
      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
      if (std::isnan(__c))
        __c = std::copysign(0, __c);
      if (std::isnan(__d))
        __d = std::copysign(0, __d);
      __recalc = true;
    }
    if (std::isinf(__c) || std::isinf(__d)) {
      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
      if (std::isnan(__a))
        __a = std::copysign(0, __a);
      if (std::isnan(__b))
        __b = std::copysign(0, __b);
      __recalc = true;
    }
    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
                      std::isinf(__ad) || std::isinf(__bc))) {
      // No infinite operand, but an intermediate product overflowed.
      if (std::isnan(__a))
        __a = std::copysign(0, __a);
      if (std::isnan(__b))
        __b = std::copysign(0, __b);
      if (std::isnan(__c))
        __c = std::copysign(0, __c);
      if (std::isnan(__d))
        __d = std::copysign(0, __d);
      __recalc = true;
    }
    if (__recalc) {
      // Can't use std::numeric_limits<double>::infinity() -- that doesn't have
      // a device overload (and isn't constexpr before C++11, naturally).
      // float infinity converts exactly to double infinity.
      __real__(__z) = __builtin_huge_valf() * (__a * __c - __b * __d);
      __imag__(__z) = __builtin_huge_valf() * (__a * __d + __b * __c);
    }
  }
  return __z;
}
|
||||
|
||||
// Single-precision counterpart of __muldc3: (__a + __b*i) * (__c + __d*i)
// with C99 Annex G NaN/infinity recovery.  (Adapted from libc++ for CUDA.)
extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
                                                     float __c, float __d) {
  float __ac = __a * __c;
  float __bd = __b * __d;
  float __ad = __a * __d;
  float __bc = __b * __c;
  float _Complex __z;
  __real__(__z) = __ac - __bd;
  __imag__(__z) = __ad + __bc;
  // Both parts NaN: zero out NaN operands, squash infinities to +/-1, and
  // recompute so an infinite operand still yields an infinite result.
  if (std::isnan(__real__(__z)) && std::isnan(__imag__(__z))) {
    bool __recalc = false;
    if (std::isinf(__a) || std::isinf(__b)) {
      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
      if (std::isnan(__c))
        __c = std::copysign(0, __c);
      if (std::isnan(__d))
        __d = std::copysign(0, __d);
      __recalc = true;
    }
    if (std::isinf(__c) || std::isinf(__d)) {
      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
      if (std::isnan(__a))
        __a = std::copysign(0, __a);
      if (std::isnan(__b))
        __b = std::copysign(0, __b);
      __recalc = true;
    }
    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
                      std::isinf(__ad) || std::isinf(__bc))) {
      // Finite operands but an intermediate product overflowed.
      if (std::isnan(__a))
        __a = std::copysign(0, __a);
      if (std::isnan(__b))
        __b = std::copysign(0, __b);
      if (std::isnan(__c))
        __c = std::copysign(0, __c);
      if (std::isnan(__d))
        __d = std::copysign(0, __d);
      __recalc = true;
    }
    if (__recalc) {
      __real__(__z) = __builtin_huge_valf() * (__a * __c - __b * __d);
      __imag__(__z) = __builtin_huge_valf() * (__a * __d + __b * __c);
    }
  }
  return __z;
}
|
||||
|
||||
// libgcc-compatible complex divide: (__a + __b*i) / (__c + __d*i), using
// Smith's algorithm with scaling by logb of the denominator to avoid
// overflow/underflow, plus C99 Annex G special-value recovery.
extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
                                                      double __c, double __d) {
  int __ilogbw = 0;
  // Can't use std::max, because that's defined in <algorithm>, and we don't
  // want to pull that in for every compile.  The CUDA headers define
  // ::max(float, float) and ::max(double, double), which is sufficient for us.
  double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
  if (std::isfinite(__logbw)) {
    __ilogbw = (int)__logbw;
    // Rescale the denominator towards 1 to dodge overflow in c*c + d*d.
    __c = std::scalbn(__c, -__ilogbw);
    __d = std::scalbn(__d, -__ilogbw);
  }
  double __denom = __c * __c + __d * __d;
  double _Complex __z;
  __real__(__z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
  __imag__(__z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
  if (std::isnan(__real__(__z)) && std::isnan(__imag__(__z))) {
    if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
      // Nonzero / zero: infinity signed like the denominator's real part.
      __real__(__z) = std::copysign(__builtin_huge_valf(), __c) * __a;
      __imag__(__z) = std::copysign(__builtin_huge_valf(), __c) * __b;
    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
               std::isfinite(__d)) {
      // Infinite numerator / finite denominator -> infinite result.
      __a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
      __b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
      __real__(__z) = __builtin_huge_valf() * (__a * __c + __b * __d);
      __imag__(__z) = __builtin_huge_valf() * (__b * __c - __a * __d);
    } else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
               std::isfinite(__b)) {
      // Finite numerator / infinite denominator -> signed zero result.
      __c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
      __d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
      __real__(__z) = 0.0 * (__a * __c + __b * __d);
      __imag__(__z) = 0.0 * (__b * __c - __a * __d);
    }
  }
  return __z;
}
|
||||
|
||||
// Single-precision counterpart of __divdc3: (__a + __b*i) / (__c + __d*i)
// with logb-based rescaling and C99 Annex G special-value recovery.
extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
                                                     float __c, float __d) {
  int __ilogbw = 0;
  // Unqualified max: the CUDA headers supply ::max(float, float); we avoid
  // <algorithm>'s std::max on purpose (see __divdc3).
  float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
  if (std::isfinite(__logbw)) {
    __ilogbw = (int)__logbw;
    __c = std::scalbn(__c, -__ilogbw);
    __d = std::scalbn(__d, -__ilogbw);
  }
  float __denom = __c * __c + __d * __d;
  float _Complex __z;
  __real__(__z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
  __imag__(__z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
  if (std::isnan(__real__(__z)) && std::isnan(__imag__(__z))) {
    if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
      // Nonzero / zero: infinity signed like the denominator's real part.
      __real__(__z) = std::copysign(__builtin_huge_valf(), __c) * __a;
      __imag__(__z) = std::copysign(__builtin_huge_valf(), __c) * __b;
    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
               std::isfinite(__d)) {
      // Infinite numerator / finite denominator -> infinite result.
      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
      __real__(__z) = __builtin_huge_valf() * (__a * __c + __b * __d);
      __imag__(__z) = __builtin_huge_valf() * (__b * __c - __a * __d);
    } else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
               std::isfinite(__b)) {
      // Finite numerator / infinite denominator -> signed zero result.
      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
      __real__(__z) = 0 * (__a * __c + __b * __d);
      __imag__(__z) = 0 * (__b * __c - __a * __d);
    }
  }
  return __z;
}
|
||||
|
||||
#endif // __CLANG_CUDA_COMPLEX_BUILTINS
|
|
@ -0,0 +1,322 @@
|
|||
/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __CLANG_CUDA_INTRINSICS_H__
|
||||
#define __CLANG_CUDA_INTRINSICS_H__
|
||||
#ifndef __CUDA__
|
||||
#error "This file is for CUDA compilation only."
|
||||
#endif
|
||||
|
||||
// sm_30 intrinsics: __shfl_{up,down,xor}.
|
||||
|
||||
#define __SM_30_INTRINSICS_H__
|
||||
#define __SM_30_INTRINSICS_HPP__
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
||||
|
||||
#pragma push_macro("__MAKE_SHUFFLES")
|
||||
// Defines a full family of __shfl-style warp shuffle overloads in terms of a
// pair of nvvm builtins (one for int, one for float).  unsigned int forwards
// through the int overload; 64-bit types are shuffled as two 32-bit halves
// (long long) or bit-cast through long long (double).  __Mask forms the low
// bits of the clamp operand: 0x1f for shuffles that clamp at the warp
// boundary, 0 for shfl.up (see the comment at the __shfl_up instantiation).
//
// Two fixes relative to the previous revision of this macro:
//  1. The long long overload called memcpy(&__val, &__tmp, ...), copying the
//     *uninitialized* __Bits struct over the input; the unpack must go the
//     other way: memcpy(&__tmp, &__val, ...).
//  2. The unsigned long long overload cast __val to unsigned long long,
//     re-selecting the same overload and recursing forever; it must cast to
//     long long to dispatch to the 64-bit implementation above it.
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask)    \
  inline __device__ int __FnName(int __val, int __offset,                      \
                                 int __width = warpSize) {                     \
    return __IntIntrinsic(__val, __offset,                                     \
                          ((warpSize - __width) << 8) | (__Mask));             \
  }                                                                            \
  inline __device__ float __FnName(float __val, int __offset,                  \
                                   int __width = warpSize) {                   \
    return __FloatIntrinsic(__val, __offset,                                   \
                            ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                            \
  inline __device__ unsigned int __FnName(unsigned int __val, int __offset,    \
                                          int __width = warpSize) {            \
    return static_cast<unsigned int>(                                          \
        ::__FnName(static_cast<int>(__val), __offset, __width));               \
  }                                                                            \
  inline __device__ long long __FnName(long long __val, int __offset,          \
                                       int __width = warpSize) {               \
    struct __Bits {                                                            \
      int __a, __b;                                                            \
    };                                                                         \
    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
    __Bits __tmp;                                                              \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
    long long __ret;                                                           \
    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
    return __ret;                                                              \
  }                                                                            \
  inline __device__ unsigned long long __FnName(                               \
      unsigned long long __val, int __offset, int __width = warpSize) {        \
    return static_cast<unsigned long long>(::__FnName(                         \
        static_cast<long long>(__val), __offset, __width));                    \
  }                                                                            \
  inline __device__ double __FnName(double __val, int __offset,                \
                                    int __width = warpSize) {                  \
    long long __tmp;                                                           \
    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp = ::__FnName(__tmp, __offset, __width);                              \
    double __ret;                                                              \
    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
    return __ret;                                                              \
  }
|
||||
|
||||
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
|
||||
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
|
||||
// maxLane.
|
||||
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
|
||||
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
|
||||
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
|
||||
|
||||
#pragma pop_macro("__MAKE_SHUFFLES")
|
||||
|
||||
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
|
||||
|
||||
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
|
||||
|
||||
// Prevent the vanilla sm_32 intrinsics header from being included.
|
||||
#define __SM_32_INTRINSICS_H__
|
||||
#define __SM_32_INTRINSICS_HPP__
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
|
||||
|
||||
// Scalar __ldg overloads: each forwards directly to the matching
// __nvvm_ldg_* builtin for its element type.
inline __device__ char __ldg(const char *ptr) {
  return __nvvm_ldg_c(ptr);
}
inline __device__ short __ldg(const short *ptr) {
  return __nvvm_ldg_s(ptr);
}
inline __device__ int __ldg(const int *ptr) {
  return __nvvm_ldg_i(ptr);
}
inline __device__ long __ldg(const long *ptr) {
  return __nvvm_ldg_l(ptr);
}
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) {
  return __nvvm_ldg_f(ptr);
}
inline __device__ double __ldg(const double *ptr) {
  return __nvvm_ldg_d(ptr);
}
|
||||
|
||||
// Vector __ldg overloads.  Each loads the value as a clang ext_vector_type
// via the matching __nvvm_ldg_* builtin, then repacks the lanes into the
// corresponding CUDA vector struct.
inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to char2's alignment.  This is only
  // safe if alignof(c2) <= alignof(char2).
  c2 __v = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  return __ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 __v = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  __ret.z = __v[2];
  __ret.w = __v[3];
  return __ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 __v = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  return __ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 __v = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  __ret.z = __v[2];
  __ret.w = __v[3];
  return __ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 __v = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  return __ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 __v = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  __ret.z = __v[2];
  __ret.w = __v[3];
  return __ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 __v = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  return __ret;
}
|
||||
|
||||
// Unsigned and floating-point vector __ldg overloads, following the same
// pattern: load as an ext_vector_type, repack into the CUDA vector struct.
inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 __v = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  return __ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 __v = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  __ret.z = __v[2];
  __ret.w = __v[3];
  return __ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 __v = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  return __ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 __v = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  __ret.z = __v[2];
  __ret.w = __v[3];
  return __ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 __v = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  return __ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 __v = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  __ret.z = __v[2];
  __ret.w = __v[3];
  return __ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 __v = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  return __ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 __v = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  return __ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 __v = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  __ret.z = __v[2];
  __ret.w = __v[3];
  return __ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 __v = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 __ret;
  __ret.x = __v[0];
  __ret.y = __v[1];
  return __ret;
}
|
||||
|
||||
// TODO: Implement these as intrinsics, so the backend can work its magic on
|
||||
// these. Alternatively, we could implement these as plain C and try to get
|
||||
// llvm to recognize the relevant patterns.
|
||||
// Wrapper around the PTX shf.l.wrap.b32 instruction (funnel shift left,
// shift amount wrapped).  TODO: Implement as an intrinsic so the backend can
// work its magic on it.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned __ret;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return __ret;
}
|
||||
// Wrapper around the PTX shf.l.clamp.b32 instruction (funnel shift left,
// shift amount clamped).
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned __ret;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return __ret;
}
|
||||
// Wrapper around the PTX shf.r.wrap.b32 instruction (funnel shift right,
// shift amount wrapped).
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned __ret;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return __ret;
}
|
||||
// Wrapper around the PTX shf.r.clamp.b32 instruction (funnel shift right,
// shift amount clamped).
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned __ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return __ret;
}
|
||||
|
||||
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
|
||||
|
||||
#endif // defined(__CLANG_CUDA_INTRINSICS_H__)
|
|
@ -0,0 +1,286 @@
|
|||
/*===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
|
||||
#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
|
||||
#ifndef __CUDA__
|
||||
#error "This file is for CUDA compilation only."
|
||||
#endif
|
||||
|
||||
// This file forward-declares of some math functions we (or the CUDA headers)
|
||||
// will define later. We need to do this, and do it before cmath is included,
|
||||
// because the standard library may have constexpr math functions. In the
|
||||
// absence of a prior __device__ decl, those constexpr functions may become
|
||||
// implicitly host+device. host+device functions can't be overloaded, so that
|
||||
// would preclude the use of our own __device__ overloads for these functions.
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#define __DEVICE__ \
|
||||
static __inline__ __attribute__((always_inline)) __attribute__((device))
|
||||
|
||||
__DEVICE__ double abs(double);
|
||||
__DEVICE__ float abs(float);
|
||||
__DEVICE__ int abs(int);
|
||||
__DEVICE__ long abs(long);
|
||||
__DEVICE__ long long abs(long long);
|
||||
__DEVICE__ double acos(double);
|
||||
__DEVICE__ float acos(float);
|
||||
__DEVICE__ double acosh(double);
|
||||
__DEVICE__ float acosh(float);
|
||||
__DEVICE__ double asin(double);
|
||||
__DEVICE__ float asin(float);
|
||||
__DEVICE__ double asinh(double);
|
||||
__DEVICE__ float asinh(float);
|
||||
__DEVICE__ double atan2(double, double);
|
||||
__DEVICE__ float atan2(float, float);
|
||||
__DEVICE__ double atan(double);
|
||||
__DEVICE__ float atan(float);
|
||||
__DEVICE__ double atanh(double);
|
||||
__DEVICE__ float atanh(float);
|
||||
__DEVICE__ double cbrt(double);
|
||||
__DEVICE__ float cbrt(float);
|
||||
__DEVICE__ double ceil(double);
|
||||
__DEVICE__ float ceil(float);
|
||||
__DEVICE__ double copysign(double, double);
|
||||
__DEVICE__ float copysign(float, float);
|
||||
__DEVICE__ double cos(double);
|
||||
__DEVICE__ float cos(float);
|
||||
__DEVICE__ double cosh(double);
|
||||
__DEVICE__ float cosh(float);
|
||||
__DEVICE__ double erfc(double);
|
||||
__DEVICE__ float erfc(float);
|
||||
__DEVICE__ double erf(double);
|
||||
__DEVICE__ float erf(float);
|
||||
__DEVICE__ double exp2(double);
|
||||
__DEVICE__ float exp2(float);
|
||||
__DEVICE__ double exp(double);
|
||||
__DEVICE__ float exp(float);
|
||||
__DEVICE__ double expm1(double);
|
||||
__DEVICE__ float expm1(float);
|
||||
__DEVICE__ double fabs(double);
|
||||
__DEVICE__ float fabs(float);
|
||||
__DEVICE__ double fdim(double, double);
|
||||
__DEVICE__ float fdim(float, float);
|
||||
__DEVICE__ double floor(double);
|
||||
__DEVICE__ float floor(float);
|
||||
__DEVICE__ double fma(double, double, double);
|
||||
__DEVICE__ float fma(float, float, float);
|
||||
__DEVICE__ double fmax(double, double);
|
||||
__DEVICE__ float fmax(float, float);
|
||||
__DEVICE__ double fmin(double, double);
|
||||
__DEVICE__ float fmin(float, float);
|
||||
__DEVICE__ double fmod(double, double);
|
||||
__DEVICE__ float fmod(float, float);
|
||||
__DEVICE__ int fpclassify(double);
|
||||
__DEVICE__ int fpclassify(float);
|
||||
__DEVICE__ double frexp(double, int *);
|
||||
__DEVICE__ float frexp(float, int *);
|
||||
__DEVICE__ double hypot(double, double);
|
||||
__DEVICE__ float hypot(float, float);
|
||||
__DEVICE__ int ilogb(double);
|
||||
__DEVICE__ int ilogb(float);
|
||||
__DEVICE__ bool isfinite(double);
|
||||
__DEVICE__ bool isfinite(float);
|
||||
__DEVICE__ bool isgreater(double, double);
|
||||
__DEVICE__ bool isgreaterequal(double, double);
|
||||
__DEVICE__ bool isgreaterequal(float, float);
|
||||
__DEVICE__ bool isgreater(float, float);
|
||||
__DEVICE__ bool isinf(double);
|
||||
__DEVICE__ bool isinf(float);
|
||||
__DEVICE__ bool isless(double, double);
|
||||
__DEVICE__ bool islessequal(double, double);
|
||||
__DEVICE__ bool islessequal(float, float);
|
||||
__DEVICE__ bool isless(float, float);
|
||||
__DEVICE__ bool islessgreater(double, double);
|
||||
__DEVICE__ bool islessgreater(float, float);
|
||||
__DEVICE__ bool isnan(double);
|
||||
__DEVICE__ bool isnan(float);
|
||||
__DEVICE__ bool isnormal(double);
|
||||
__DEVICE__ bool isnormal(float);
|
||||
__DEVICE__ bool isunordered(double, double);
|
||||
__DEVICE__ bool isunordered(float, float);
|
||||
__DEVICE__ long labs(long);
|
||||
__DEVICE__ double ldexp(double, int);
|
||||
__DEVICE__ float ldexp(float, int);
|
||||
__DEVICE__ double lgamma(double);
|
||||
__DEVICE__ float lgamma(float);
|
||||
__DEVICE__ long long llabs(long long);
|
||||
__DEVICE__ long long llrint(double);
|
||||
__DEVICE__ long long llrint(float);
|
||||
__DEVICE__ double log10(double);
|
||||
__DEVICE__ float log10(float);
|
||||
__DEVICE__ double log1p(double);
|
||||
__DEVICE__ float log1p(float);
|
||||
__DEVICE__ double log2(double);
|
||||
__DEVICE__ float log2(float);
|
||||
__DEVICE__ double logb(double);
|
||||
__DEVICE__ float logb(float);
|
||||
__DEVICE__ double log(double);
|
||||
__DEVICE__ float log(float);
|
||||
__DEVICE__ long lrint(double);
|
||||
__DEVICE__ long lrint(float);
|
||||
__DEVICE__ long lround(double);
|
||||
__DEVICE__ long lround(float);
|
||||
__DEVICE__ long long llround(float); // No llround(double).
|
||||
__DEVICE__ double modf(double, double *);
|
||||
__DEVICE__ float modf(float, float *);
|
||||
__DEVICE__ double nan(const char *);
|
||||
__DEVICE__ float nanf(const char *);
|
||||
__DEVICE__ double nearbyint(double);
|
||||
__DEVICE__ float nearbyint(float);
|
||||
__DEVICE__ double nextafter(double, double);
|
||||
__DEVICE__ float nextafter(float, float);
|
||||
__DEVICE__ double nexttoward(double, double);
|
||||
__DEVICE__ float nexttoward(float, double);
|
||||
__DEVICE__ float nexttowardf(float, double);
|
||||
__DEVICE__ double pow(double, double);
|
||||
__DEVICE__ double pow(double, int);
|
||||
__DEVICE__ float pow(float, float);
|
||||
__DEVICE__ float pow(float, int);
|
||||
__DEVICE__ double remainder(double, double);
|
||||
__DEVICE__ float remainder(float, float);
|
||||
__DEVICE__ double remquo(double, double, int *);
|
||||
__DEVICE__ float remquo(float, float, int *);
|
||||
__DEVICE__ double rint(double);
|
||||
__DEVICE__ float rint(float);
|
||||
__DEVICE__ double round(double);
|
||||
__DEVICE__ float round(float);
|
||||
__DEVICE__ double scalbln(double, long);
|
||||
__DEVICE__ float scalbln(float, long);
|
||||
__DEVICE__ double scalbn(double, int);
|
||||
__DEVICE__ float scalbn(float, int);
|
||||
__DEVICE__ bool signbit(double);
|
||||
__DEVICE__ bool signbit(float);
|
||||
__DEVICE__ double sin(double);
|
||||
__DEVICE__ float sin(float);
|
||||
__DEVICE__ double sinh(double);
|
||||
__DEVICE__ float sinh(float);
|
||||
__DEVICE__ double sqrt(double);
|
||||
__DEVICE__ float sqrt(float);
|
||||
__DEVICE__ double tan(double);
|
||||
__DEVICE__ float tan(float);
|
||||
__DEVICE__ double tanh(double);
|
||||
__DEVICE__ float tanh(float);
|
||||
__DEVICE__ double tgamma(double);
|
||||
__DEVICE__ float tgamma(float);
|
||||
__DEVICE__ double trunc(double);
|
||||
__DEVICE__ float trunc(float);
|
||||
|
||||
// We need to define these overloads in exactly the namespace our standard
|
||||
// library uses (including the right inline namespace), otherwise they won't be
|
||||
// picked up by other functions in the standard library (e.g. functions in
|
||||
// <complex>). Thus the ugliness below.
|
||||
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
|
||||
_LIBCPP_BEGIN_NAMESPACE_STD
|
||||
#else
|
||||
namespace std {
|
||||
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
#endif
|
||||
#endif
|
||||
|
||||
using ::abs;
|
||||
using ::acos;
|
||||
using ::acosh;
|
||||
using ::asin;
|
||||
using ::asinh;
|
||||
using ::atan;
|
||||
using ::atan2;
|
||||
using ::atanh;
|
||||
using ::cbrt;
|
||||
using ::ceil;
|
||||
using ::copysign;
|
||||
using ::cos;
|
||||
using ::cosh;
|
||||
using ::erf;
|
||||
using ::erfc;
|
||||
using ::exp;
|
||||
using ::exp2;
|
||||
using ::expm1;
|
||||
using ::fabs;
|
||||
using ::fdim;
|
||||
using ::floor;
|
||||
using ::fma;
|
||||
using ::fmax;
|
||||
using ::fmin;
|
||||
using ::fmod;
|
||||
using ::fpclassify;
|
||||
using ::frexp;
|
||||
using ::hypot;
|
||||
using ::ilogb;
|
||||
using ::isfinite;
|
||||
using ::isgreater;
|
||||
using ::isgreaterequal;
|
||||
using ::isinf;
|
||||
using ::isless;
|
||||
using ::islessequal;
|
||||
using ::islessgreater;
|
||||
using ::isnan;
|
||||
using ::isnormal;
|
||||
using ::isunordered;
|
||||
using ::labs;
|
||||
using ::ldexp;
|
||||
using ::lgamma;
|
||||
using ::llabs;
|
||||
using ::llrint;
|
||||
using ::log;
|
||||
using ::log10;
|
||||
using ::log1p;
|
||||
using ::log2;
|
||||
using ::logb;
|
||||
using ::lrint;
|
||||
using ::lround;
|
||||
using ::llround;
|
||||
using ::modf;
|
||||
using ::nan;
|
||||
using ::nanf;
|
||||
using ::nearbyint;
|
||||
using ::nextafter;
|
||||
using ::nexttoward;
|
||||
using ::pow;
|
||||
using ::remainder;
|
||||
using ::remquo;
|
||||
using ::rint;
|
||||
using ::round;
|
||||
using ::scalbln;
|
||||
using ::scalbn;
|
||||
using ::signbit;
|
||||
using ::sin;
|
||||
using ::sinh;
|
||||
using ::sqrt;
|
||||
using ::tan;
|
||||
using ::tanh;
|
||||
using ::tgamma;
|
||||
using ::trunc;
|
||||
|
||||
#ifdef _LIBCPP_END_NAMESPACE_STD
|
||||
_LIBCPP_END_NAMESPACE_STD
|
||||
#else
|
||||
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_GLIBCXX_END_NAMESPACE_VERSION
|
||||
#endif
|
||||
} // namespace std
|
||||
#endif
|
||||
|
||||
#pragma pop_macro("__DEVICE__")
|
||||
|
||||
#endif
|
|
@ -0,0 +1,347 @@
|
|||
/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
/*
|
||||
* WARNING: This header is intended to be directly -include'd by
|
||||
* the compiler and is not supposed to be included by users.
|
||||
*
|
||||
* CUDA headers are implemented in a way that currently makes it
|
||||
* impossible for user code to #include directly when compiling with
|
||||
* Clang. They present different view of CUDA-supplied functions
|
||||
* depending on where in NVCC's compilation pipeline the headers are
|
||||
* included. Neither of these modes provides function definitions with
|
||||
* correct attributes, so we use preprocessor to force the headers
|
||||
* into a form that Clang can use.
|
||||
*
|
||||
* Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
|
||||
* this file during every CUDA compilation.
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
|
||||
#define __CLANG_CUDA_RUNTIME_WRAPPER_H__
|
||||
|
||||
#if defined(__CUDA__) && defined(__clang__)
|
||||
|
||||
// Include some forward declares that must come before cmath.
|
||||
#include <__clang_cuda_math_forward_declares.h>
|
||||
|
||||
// Include some standard headers to avoid CUDA headers including them
|
||||
// while some required macros (like __THROW) are in a weird state.
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Preserve common macros that will be changed below by us or by CUDA
|
||||
// headers.
|
||||
#pragma push_macro("__THROW")
|
||||
#pragma push_macro("__CUDA_ARCH__")
|
||||
|
||||
// WARNING: Preprocessor hacks below are based on specific details of
|
||||
// CUDA-7.x headers and are not expected to work with any other
|
||||
// version of CUDA headers.
|
||||
#include "cuda.h"
|
||||
#if !defined(CUDA_VERSION)
|
||||
#error "cuda.h did not define CUDA_VERSION"
|
||||
#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000
|
||||
#error "Unsupported CUDA version!"
|
||||
#endif
|
||||
|
||||
// Make largest subset of device functions available during host
|
||||
// compilation -- SM_35 for the time being.
|
||||
#ifndef __CUDA_ARCH__
|
||||
#define __CUDA_ARCH__ 350
|
||||
#endif
|
||||
|
||||
#include "__clang_cuda_builtin_vars.h"
|
||||
|
||||
// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
|
||||
// has taken care of builtin variables declared in the file.
|
||||
#define __DEVICE_LAUNCH_PARAMETERS_H__
|
||||
|
||||
// {math,device}_functions.h only have declarations of the
|
||||
// functions. We don't need them as we're going to pull in their
|
||||
// definitions from .hpp files.
|
||||
#define __DEVICE_FUNCTIONS_H__
|
||||
#define __MATH_FUNCTIONS_H__
|
||||
#define __COMMON_FUNCTIONS_H__
|
||||
|
||||
#undef __CUDACC__
|
||||
#define __CUDABE__
|
||||
// Disables definitions of device-side runtime support stubs in
|
||||
// cuda_device_runtime_api.h
|
||||
#include "driver_types.h"
|
||||
#include "host_config.h"
|
||||
#include "host_defines.h"
|
||||
|
||||
#undef __CUDABE__
|
||||
#define __CUDACC__
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
#undef __CUDACC__
|
||||
#define __CUDABE__
|
||||
|
||||
// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
|
||||
// not have at the moment. Emulate them with a builtin memcpy/memset.
|
||||
#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
|
||||
#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
|
||||
|
||||
#include "crt/device_runtime.h"
|
||||
#include "crt/host_runtime.h"
|
||||
// device_runtime.h defines __cxa_* macros that will conflict with
|
||||
// cxxabi.h.
|
||||
// FIXME: redefine these as __device__ functions.
|
||||
#undef __cxa_vec_ctor
|
||||
#undef __cxa_vec_cctor
|
||||
#undef __cxa_vec_dtor
|
||||
#undef __cxa_vec_new
|
||||
#undef __cxa_vec_new2
|
||||
#undef __cxa_vec_new3
|
||||
#undef __cxa_vec_delete2
|
||||
#undef __cxa_vec_delete
|
||||
#undef __cxa_vec_delete3
|
||||
#undef __cxa_pure_virtual
|
||||
|
||||
// math_functions.hpp expects this host function be defined on MacOS, but it
|
||||
// ends up not being there because of the games we play here. Just define it
|
||||
// ourselves; it's simple enough.
|
||||
#ifdef __APPLE__
|
||||
inline __host__ double __signbitd(double x) {
|
||||
return std::signbit(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
// We need decls for functions in CUDA's libdevice with __device__
|
||||
// attribute only. Alas they come either as __host__ __device__ or
|
||||
// with no attributes at all. To work around that, define __CUDA_RTC__
|
||||
// which produces HD variant and undef __host__ which gives us desided
|
||||
// decls with __device__ attribute.
|
||||
#pragma push_macro("__host__")
|
||||
#define __host__
|
||||
#define __CUDACC_RTC__
|
||||
#include "device_functions_decls.h"
|
||||
#undef __CUDACC_RTC__
|
||||
|
||||
// Temporarily poison __host__ macro to ensure it's not used by any of
|
||||
// the headers we're about to include.
|
||||
#define __host__ UNEXPECTED_HOST_ATTRIBUTE
|
||||
|
||||
// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
|
||||
// Previous versions used to check whether they are defined or not.
|
||||
// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
|
||||
// here to detect the switch.
|
||||
|
||||
#if defined(CU_DEVICE_INVALID)
|
||||
#if !defined(__USE_FAST_MATH__)
|
||||
#define __USE_FAST_MATH__ 0
|
||||
#endif
|
||||
|
||||
#if !defined(__CUDA_PREC_DIV)
|
||||
#define __CUDA_PREC_DIV 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// device_functions.hpp and math_functions*.hpp use 'static
|
||||
// __forceinline__' (with no __device__) for definitions of device
|
||||
// functions. Temporarily redefine __forceinline__ to include
|
||||
// __device__.
|
||||
#pragma push_macro("__forceinline__")
|
||||
#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
|
||||
#include "device_functions.hpp"
|
||||
|
||||
// math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
|
||||
// get the slow-but-accurate or fast-but-inaccurate versions of functions like
|
||||
// sin and exp. This is controlled in clang by -fcuda-approx-transcendentals.
|
||||
//
|
||||
// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
|
||||
// slow divides), so we need to scope our define carefully here.
|
||||
#pragma push_macro("__USE_FAST_MATH__")
|
||||
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
|
||||
#define __USE_FAST_MATH__ 1
|
||||
#endif
|
||||
#include "math_functions.hpp"
|
||||
#pragma pop_macro("__USE_FAST_MATH__")
|
||||
|
||||
#include "math_functions_dbl_ptx3.hpp"
|
||||
#pragma pop_macro("__forceinline__")
|
||||
|
||||
// Pull in host-only functions that are only available when neither
|
||||
// __CUDACC__ nor __CUDABE__ are defined.
|
||||
#undef __MATH_FUNCTIONS_HPP__
|
||||
#undef __CUDABE__
|
||||
#include "math_functions.hpp"
|
||||
// Alas, additional overloads for these functions are hard to get to.
|
||||
// Considering that we only need these overloads for a few functions,
|
||||
// we can provide them here.
|
||||
static inline float rsqrt(float __a) { return rsqrtf(__a); }
|
||||
static inline float rcbrt(float __a) { return rcbrtf(__a); }
|
||||
static inline float sinpi(float __a) { return sinpif(__a); }
|
||||
static inline float cospi(float __a) { return cospif(__a); }
|
||||
static inline void sincospi(float __a, float *__b, float *__c) {
|
||||
return sincospif(__a, __b, __c);
|
||||
}
|
||||
static inline float erfcinv(float __a) { return erfcinvf(__a); }
|
||||
static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
|
||||
static inline float normcdf(float __a) { return normcdff(__a); }
|
||||
static inline float erfcx(float __a) { return erfcxf(__a); }
|
||||
|
||||
// For some reason single-argument variant is not always declared by
|
||||
// CUDA headers. Alas, device_functions.hpp included below needs it.
|
||||
static inline __device__ void __brkpt(int __c) { __brkpt(); }
|
||||
|
||||
// Now include *.hpp with definitions of various GPU functions. Alas,
|
||||
// a lot of thins get declared/defined with __host__ attribute which
|
||||
// we don't want and we have to define it out. We also have to include
|
||||
// {device,math}_functions.hpp again in order to extract the other
|
||||
// branch of #if/else inside.
|
||||
|
||||
#define __host__
|
||||
#undef __CUDABE__
|
||||
#define __CUDACC__
|
||||
#undef __DEVICE_FUNCTIONS_HPP__
|
||||
#include "device_atomic_functions.hpp"
|
||||
#include "device_functions.hpp"
|
||||
#include "sm_20_atomic_functions.hpp"
|
||||
#include "sm_20_intrinsics.hpp"
|
||||
#include "sm_32_atomic_functions.hpp"
|
||||
|
||||
// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
|
||||
// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
|
||||
// define them using builtins so that the optimizer can reason about and across
|
||||
// these instructions. In particular, using intrinsics for ldg gets us the
|
||||
// [addr+imm] addressing mode, which, although it doesn't actually exist in the
|
||||
// hardware, seems to generate faster machine code because ptxas can more easily
|
||||
// reason about our code.
|
||||
|
||||
#if CUDA_VERSION >= 8000
|
||||
#include "sm_60_atomic_functions.hpp"
|
||||
#include "sm_61_intrinsics.hpp"
|
||||
#endif
|
||||
|
||||
#undef __MATH_FUNCTIONS_HPP__
|
||||
|
||||
// math_functions.hpp defines ::signbit as a __host__ __device__ function. This
|
||||
// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
|
||||
// math_function.hpp's ::signbit. It's guarded by #undef signbit, but that's
|
||||
// conditional on __GNUC__. :)
|
||||
#pragma push_macro("signbit")
|
||||
#pragma push_macro("__GNUC__")
|
||||
#undef __GNUC__
|
||||
#define signbit __ignored_cuda_signbit
|
||||
#include "math_functions.hpp"
|
||||
#pragma pop_macro("__GNUC__")
|
||||
#pragma pop_macro("signbit")
|
||||
|
||||
#pragma pop_macro("__host__")
|
||||
|
||||
#include "texture_indirect_functions.h"
|
||||
|
||||
// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
|
||||
#pragma pop_macro("__CUDA_ARCH__")
|
||||
#pragma pop_macro("__THROW")
|
||||
|
||||
// Set up compiler macros expected to be seen during compilation.
|
||||
#undef __CUDABE__
|
||||
#define __CUDACC__
|
||||
|
||||
extern "C" {
|
||||
// Device-side CUDA system calls.
|
||||
// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
|
||||
// We need these declarations and wrappers for device-side
|
||||
// malloc/free/printf calls to work without relying on
|
||||
// -fcuda-disable-target-call-checks option.
|
||||
__device__ int vprintf(const char *, const char *);
|
||||
__device__ void free(void *) __attribute((nothrow));
|
||||
__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
|
||||
__device__ void __assertfail(const char *__message, const char *__file,
|
||||
unsigned __line, const char *__function,
|
||||
size_t __charSize) __attribute__((noreturn));
|
||||
|
||||
// In order for standard assert() macro on linux to work we need to
|
||||
// provide device-side __assert_fail()
|
||||
__device__ static inline void __assert_fail(const char *__message,
|
||||
const char *__file, unsigned __line,
|
||||
const char *__function) {
|
||||
__assertfail(__message, __file, __line, __function, sizeof(char));
|
||||
}
|
||||
|
||||
// Clang will convert printf into vprintf, but we still need
|
||||
// device-side declaration for it.
|
||||
__device__ int printf(const char *, ...);
|
||||
} // extern "C"
|
||||
|
||||
// We also need device-side std::malloc and std::free.
|
||||
namespace std {
|
||||
__device__ static inline void free(void *__ptr) { ::free(__ptr); }
|
||||
__device__ static inline void *malloc(size_t __size) {
|
||||
return ::malloc(__size);
|
||||
}
|
||||
} // namespace std
|
||||
|
||||
// Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to
|
||||
// come after we've pulled in the definition of uint3 and dim3.
|
||||
|
||||
__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
|
||||
uint3 ret;
|
||||
ret.x = x;
|
||||
ret.y = y;
|
||||
ret.z = z;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
|
||||
uint3 ret;
|
||||
ret.x = x;
|
||||
ret.y = y;
|
||||
ret.z = z;
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
|
||||
return dim3(x, y, z);
|
||||
}
|
||||
|
||||
__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
|
||||
return dim3(x, y, z);
|
||||
}
|
||||
|
||||
#include <__clang_cuda_cmath.h>
|
||||
#include <__clang_cuda_intrinsics.h>
|
||||
#include <__clang_cuda_complex_builtins.h>
|
||||
|
||||
// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
|
||||
// mode, giving them their "proper" types of dim3 and uint3. This is
|
||||
// incompatible with the types we give in __clang_cuda_builtin_vars.h. As as
|
||||
// hack, force-include the header (nvcc doesn't include it by default) but
|
||||
// redefine dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are
|
||||
// only used here for the redeclarations of blockDim and threadIdx.)
|
||||
#pragma push_macro("dim3")
|
||||
#pragma push_macro("uint3")
|
||||
#define dim3 __cuda_builtin_blockDim_t
|
||||
#define uint3 __cuda_builtin_threadIdx_t
|
||||
#include "curand_mtgp32_kernel.h"
|
||||
#pragma pop_macro("dim3")
|
||||
#pragma pop_macro("uint3")
|
||||
#pragma pop_macro("__USE_FAST_MATH__")
|
||||
|
||||
#endif // __CUDA__
|
||||
#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
|
|
@ -25,48 +25,127 @@
|
|||
|
||||
#include <emmintrin.h>
|
||||
|
||||
#if !defined (__AES__)
|
||||
# error "AES instructions not enabled"
|
||||
#else
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
|
||||
|
||||
/// \brief Performs a single round of AES encryption using the Equivalent
|
||||
/// Inverse Cipher, transforming the state value from the first source
|
||||
/// operand using a 128-bit round key value contained in the second source
|
||||
/// operand, and writes the result to the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
|
||||
///
|
||||
/// \param __V
|
||||
/// A 128-bit integer vector containing the state value.
|
||||
/// \param __R
|
||||
/// A 128-bit integer vector containing the round key value.
|
||||
/// \returns A 128-bit integer vector containing the encrypted value.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_aesenc_si128(__m128i __V, __m128i __R)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_aesenc128(__V, __R);
|
||||
return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
|
||||
}
|
||||
|
||||
/// \brief Performs the final round of AES encryption using the Equivalent
|
||||
/// Inverse Cipher, transforming the state value from the first source
|
||||
/// operand using a 128-bit round key value contained in the second source
|
||||
/// operand, and writes the result to the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
|
||||
///
|
||||
/// \param __V
|
||||
/// A 128-bit integer vector containing the state value.
|
||||
/// \param __R
|
||||
/// A 128-bit integer vector containing the round key value.
|
||||
/// \returns A 128-bit integer vector containing the encrypted value.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_aesenclast_si128(__m128i __V, __m128i __R)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_aesenclast128(__V, __R);
|
||||
return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
|
||||
}
|
||||
|
||||
/// \brief Performs a single round of AES decryption using the Equivalent
|
||||
/// Inverse Cipher, transforming the state value from the first source
|
||||
/// operand using a 128-bit round key value contained in the second source
|
||||
/// operand, and writes the result to the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
|
||||
///
|
||||
/// \param __V
|
||||
/// A 128-bit integer vector containing the state value.
|
||||
/// \param __R
|
||||
/// A 128-bit integer vector containing the round key value.
|
||||
/// \returns A 128-bit integer vector containing the decrypted value.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_aesdec_si128(__m128i __V, __m128i __R)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_aesdec128(__V, __R);
|
||||
return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
|
||||
}
|
||||
|
||||
/// \brief Performs the final round of AES decryption using the Equivalent
|
||||
/// Inverse Cipher, transforming the state value from the first source
|
||||
/// operand using a 128-bit round key value contained in the second source
|
||||
/// operand, and writes the result to the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
|
||||
///
|
||||
/// \param __V
|
||||
/// A 128-bit integer vector containing the state value.
|
||||
/// \param __R
|
||||
/// A 128-bit integer vector containing the round key value.
|
||||
/// \returns A 128-bit integer vector containing the decrypted value.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_aesdeclast_si128(__m128i __V, __m128i __R)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_aesdeclast128(__V, __R);
|
||||
return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
|
||||
}
|
||||
|
||||
/// \brief Applies the AES InvMixColumns() transformation to an expanded key
|
||||
/// contained in the source operand, and writes the result to the
|
||||
/// destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
|
||||
///
|
||||
/// \param __V
|
||||
/// A 128-bit integer vector containing the expanded key.
|
||||
/// \returns A 128-bit integer vector containing the transformed value.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_aesimc_si128(__m128i __V)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_aesimc128(__V);
|
||||
return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
|
||||
}
|
||||
|
||||
/// \brief Generates a round key for AES encyption, operating on 128-bit data
|
||||
/// specified in the first source operand and using an 8-bit round constant
|
||||
/// specified by the second source operand, and writes the result to the
|
||||
/// destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
|
||||
///
|
||||
/// \param C
|
||||
/// A 128-bit integer vector that is used to generate the AES encryption key.
|
||||
/// \param R
|
||||
/// An 8-bit round constant used to generate the AES encryption key.
|
||||
/// \returns A 128-bit round key for AES encryption.
|
||||
#define _mm_aeskeygenassist_si128(C, R) \
|
||||
__builtin_ia32_aeskeygenassist128((C), (R))
|
||||
(__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* _WMMINTRIN_AES_H */
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/*===---- __wmmintrin_pclmul.h - AES intrinsics ----------------------------===
|
||||
/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
|
@ -23,12 +23,35 @@
|
|||
#ifndef _WMMINTRIN_PCLMUL_H
|
||||
#define _WMMINTRIN_PCLMUL_H
|
||||
|
||||
#if !defined (__PCLMUL__)
|
||||
# error "PCLMUL instruction is not enabled"
|
||||
#else
|
||||
/// \brief Multiplies two 64-bit integer values, which are selected from source
|
||||
/// operands using the immediate-value operand. The multiplication is a
|
||||
/// carry-less multiplication, and the 128-bit integer product is stored in
|
||||
/// the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// A 128-bit vector of [2 x i64] containing one of the source operands.
|
||||
/// \param __Y
|
||||
/// A 128-bit vector of [2 x i64] containing one of the source operands.
|
||||
/// \param __I
|
||||
/// An immediate value specifying which 64-bit values to select from the
|
||||
/// operands. Bit 0 is used to select a value from operand \a __X, and bit
|
||||
/// 4 is used to select a value from operand \a __Y: \n
|
||||
/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
|
||||
/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
|
||||
/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
|
||||
/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
|
||||
/// \returns The 128-bit integer vector containing the result of the carry-less
|
||||
/// multiplication of the selected 64-bit values.
|
||||
#define _mm_clmulepi64_si128(__X, __Y, __I) \
|
||||
((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
|
||||
(__v2di)(__m128i)(__Y), (char)(__I)))
|
||||
#endif
|
||||
|
||||
#endif /* _WMMINTRIN_PCLMUL_H */
|
||||
|
|
|
@ -32,8 +32,7 @@
|
|||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
|
||||
/* Intrinsics that are available only if __ADX__ defined */
|
||||
#ifdef __ADX__
|
||||
static __inline unsigned char __DEFAULT_FN_ATTRS
|
||||
static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
|
||||
_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
|
||||
unsigned int *__p)
|
||||
{
|
||||
|
@ -41,14 +40,13 @@ _addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
|
|||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline unsigned char __DEFAULT_FN_ATTRS
|
||||
static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
|
||||
_addcarryx_u64(unsigned char __cf, unsigned long long __x,
|
||||
unsigned long long __y, unsigned long long *__p)
|
||||
{
|
||||
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Intrinsics that are also available if __ADX__ undefined */
|
||||
static __inline unsigned char __DEFAULT_FN_ATTRS
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -24,27 +24,21 @@
|
|||
#ifndef __AMMINTRIN_H
|
||||
#define __AMMINTRIN_H
|
||||
|
||||
#ifndef __SSE4A__
|
||||
#error "SSE4A instruction set not enabled"
|
||||
#else
|
||||
|
||||
#include <pmmintrin.h>
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
|
||||
|
||||
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
|
||||
/// integer vector operand at the index idx and of the length len.
|
||||
/// integer vector operand at the index \a idx and of the length \a len.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// \code
|
||||
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
|
||||
/// \endcode
|
||||
/// \endcode
|
||||
///
|
||||
/// \code
|
||||
/// This intrinsic corresponds to the \c EXTRQ instruction.
|
||||
/// \endcode
|
||||
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
|
||||
///
|
||||
/// \param x
|
||||
/// The value from which bits are extracted.
|
||||
|
@ -52,11 +46,11 @@
|
|||
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
|
||||
/// are zero, the length is interpreted as 64.
|
||||
/// \param idx
|
||||
/// Bits [5:0] specify the index of the least significant bit; the other
|
||||
/// bits are ignored. If the sum of the index and length is greater than
|
||||
/// 64, the result is undefined. If the length and index are both zero,
|
||||
/// bits [63:0] of parameter x are extracted. If the length is zero
|
||||
/// but the index is non-zero, the result is undefined.
|
||||
/// Bits [5:0] specify the index of the least significant bit; the other
|
||||
/// bits are ignored. If the sum of the index and length is greater than 64,
|
||||
/// the result is undefined. If the length and index are both zero, bits
|
||||
/// [63:0] of parameter \a x are extracted. If the length is zero but the
|
||||
/// index is non-zero, the result is undefined.
|
||||
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
|
||||
/// extracted from the source operand.
|
||||
#define _mm_extracti_si64(x, len, idx) \
|
||||
|
@ -64,25 +58,23 @@
|
|||
(char)(len), (char)(idx)))
|
||||
|
||||
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
|
||||
/// integer vector operand at the index and of the length specified by __y.
|
||||
/// integer vector operand at the index and of the length specified by
|
||||
/// \a __y.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// This intrinsic corresponds to the \c EXTRQ instruction.
|
||||
/// \endcode
|
||||
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
|
||||
///
|
||||
/// \param __x
|
||||
/// The value from which bits are extracted.
|
||||
/// \param __y
|
||||
/// Specifies the index of the least significant bit at [13:8]
|
||||
/// and the length at [5:0]; all other bits are ignored.
|
||||
/// If bits [5:0] are zero, the length is interpreted as 64.
|
||||
/// If the sum of the index and length is greater than 64, the result is
|
||||
/// undefined. If the length and index are both zero, bits [63:0] of
|
||||
/// parameter __x are extracted. If the length is zero but the index is
|
||||
/// non-zero, the result is undefined.
|
||||
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
|
||||
/// Specifies the index of the least significant bit at [13:8] and the
|
||||
/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
|
||||
/// length is interpreted as 64. If the sum of the index and length is
|
||||
/// greater than 64, the result is undefined. If the length and index are
|
||||
/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
|
||||
/// is zero but the index is non-zero, the result is undefined.
|
||||
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
|
||||
/// from the source operand.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_extract_si64(__m128i __x, __m128i __y)
|
||||
|
@ -90,97 +82,88 @@ _mm_extract_si64(__m128i __x, __m128i __y)
|
|||
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
|
||||
}
|
||||
|
||||
/// \brief Inserts bits of a specified length from the source integer vector
|
||||
/// y into the lower 64 bits of the destination integer vector x at the
|
||||
/// index idx and of the length len.
|
||||
/// \brief Inserts bits of a specified length from the source integer vector
|
||||
/// \a y into the lower 64 bits of the destination integer vector \a x at
|
||||
/// the index \a idx and of the length \a len.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// \code
|
||||
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
|
||||
/// const int idx);
|
||||
/// \endcode
|
||||
/// \endcode
|
||||
///
|
||||
/// \code
|
||||
/// This intrinsic corresponds to the \c INSERTQ instruction.
|
||||
/// \endcode
|
||||
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
|
||||
///
|
||||
/// \param x
|
||||
/// The destination operand where bits will be inserted. The inserted bits
|
||||
/// are defined by the length len and by the index idx specifying the least
|
||||
/// significant bit.
|
||||
/// The destination operand where bits will be inserted. The inserted bits
|
||||
/// are defined by the length \a len and by the index \a idx specifying the
|
||||
/// least significant bit.
|
||||
/// \param y
|
||||
/// The source operand containing the bits to be extracted. The extracted
|
||||
/// bits are the least significant bits of operand y of length len.
|
||||
/// The source operand containing the bits to be extracted. The extracted
|
||||
/// bits are the least significant bits of operand \a y of length \a len.
|
||||
/// \param len
|
||||
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
|
||||
/// are zero, the length is interpreted as 64.
|
||||
/// \param idx
|
||||
/// Bits [5:0] specify the index of the least significant bit; the other
|
||||
/// bits are ignored. If the sum of the index and length is greater than
|
||||
/// 64, the result is undefined. If the length and index are both zero,
|
||||
/// bits [63:0] of parameter y are inserted into parameter x. If the
|
||||
/// length is zero but the index is non-zero, the result is undefined.
|
||||
/// \returns A 128-bit integer vector containing the original lower 64-bits
|
||||
/// of destination operand x with the specified bitfields replaced by the
|
||||
/// lower bits of source operand y. The upper 64 bits of the return value
|
||||
/// Bits [5:0] specify the index of the least significant bit; the other
|
||||
/// bits are ignored. If the sum of the index and length is greater than 64,
|
||||
/// the result is undefined. If the length and index are both zero, bits
|
||||
/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
|
||||
/// is zero but the index is non-zero, the result is undefined.
|
||||
/// \returns A 128-bit integer vector containing the original lower 64-bits of
|
||||
/// destination operand \a x with the specified bitfields replaced by the
|
||||
/// lower bits of source operand \a y. The upper 64 bits of the return value
|
||||
/// are undefined.
|
||||
|
||||
#define _mm_inserti_si64(x, y, len, idx) \
|
||||
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
|
||||
(__v2di)(__m128i)(y), \
|
||||
(char)(len), (char)(idx)))
|
||||
|
||||
/// \brief Inserts bits of a specified length from the source integer vector
|
||||
/// __y into the lower 64 bits of the destination integer vector __x at
|
||||
/// the index and of the length specified by __y.
|
||||
/// \brief Inserts bits of a specified length from the source integer vector
|
||||
/// \a __y into the lower 64 bits of the destination integer vector \a __x
|
||||
/// at the index and of the length specified by \a __y.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// This intrinsic corresponds to the \c INSERTQ instruction.
|
||||
/// \endcode
|
||||
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
|
||||
///
|
||||
/// \param __x
|
||||
/// The destination operand where bits will be inserted. The inserted bits
|
||||
/// are defined by the length and by the index of the least significant bit
|
||||
/// specified by operand __y.
|
||||
/// The destination operand where bits will be inserted. The inserted bits
|
||||
/// are defined by the length and by the index of the least significant bit
|
||||
/// specified by operand \a __y.
|
||||
/// \param __y
|
||||
/// The source operand containing the bits to be extracted. The extracted
|
||||
/// bits are the least significant bits of operand __y with length specified
|
||||
/// by bits [69:64]. These are inserted into the destination at the index
|
||||
/// specified by bits [77:72]; all other bits are ignored.
|
||||
/// If bits [69:64] are zero, the length is interpreted as 64.
|
||||
/// If the sum of the index and length is greater than 64, the result is
|
||||
/// undefined. If the length and index are both zero, bits [63:0] of
|
||||
/// parameter __y are inserted into parameter __x. If the length
|
||||
/// is zero but the index is non-zero, the result is undefined.
|
||||
/// \returns A 128-bit integer vector containing the original lower 64-bits
|
||||
/// of destination operand __x with the specified bitfields replaced by the
|
||||
/// lower bits of source operand __y. The upper 64 bits of the return value
|
||||
/// are undefined.
|
||||
|
||||
/// The source operand containing the bits to be extracted. The extracted
|
||||
/// bits are the least significant bits of operand \a __y with length
|
||||
/// specified by bits [69:64]. These are inserted into the destination at the
|
||||
/// index specified by bits [77:72]; all other bits are ignored. If bits
|
||||
/// [69:64] are zero, the length is interpreted as 64. If the sum of the
|
||||
/// index and length is greater than 64, the result is undefined. If the
|
||||
/// length and index are both zero, bits [63:0] of parameter \a __y are
|
||||
/// inserted into parameter \a __x. If the length is zero but the index is
|
||||
/// non-zero, the result is undefined.
|
||||
/// \returns A 128-bit integer vector containing the original lower 64-bits of
|
||||
/// destination operand \a __x with the specified bitfields replaced by the
|
||||
/// lower bits of source operand \a __y. The upper 64 bits of the return
|
||||
/// value are undefined.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_insert_si64(__m128i __x, __m128i __y)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
|
||||
}
|
||||
|
||||
/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
|
||||
/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
|
||||
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
|
||||
/// used again soon).
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// This intrinsic corresponds to the \c MOVNTSD instruction.
|
||||
/// \endcode
|
||||
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
|
||||
///
|
||||
/// \param __p
|
||||
/// The 64-bit memory location used to store the register value.
|
||||
/// \param __a
|
||||
/// The 64-bit double-precision floating-point register value to
|
||||
/// be stored.
|
||||
/// The 64-bit double-precision floating-point register value to be stored.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_stream_sd(double *__p, __m128d __a)
|
||||
{
|
||||
|
@ -193,15 +176,12 @@ _mm_stream_sd(double *__p, __m128d __a)
|
|||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// This intrinsic corresponds to the \c MOVNTSS instruction.
|
||||
/// \endcode
|
||||
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
|
||||
///
|
||||
/// \param __p
|
||||
/// The 32-bit memory location used to store the register value.
|
||||
/// \param __a
|
||||
/// The 32-bit single-precision floating-point register value to
|
||||
/// be stored.
|
||||
/// The 32-bit single-precision floating-point register value to be stored.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_stream_ss(float *__p, __m128 __a)
|
||||
{
|
||||
|
@ -210,6 +190,4 @@ _mm_stream_ss(float *__p, __m128 __a)
|
|||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __SSE4A__ */
|
||||
|
||||
#endif /* __AMMINTRIN_H */
|
||||
|
|
|
@ -72,9 +72,11 @@ static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(v
|
|||
|
||||
/* 8.5 Swap */
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__swp(uint32_t x, volatile uint32_t *p) {
|
||||
__swp(uint32_t __x, volatile uint32_t *__p) {
|
||||
uint32_t v;
|
||||
do v = __builtin_arm_ldrex(p); while (__builtin_arm_strex(x, p));
|
||||
do
|
||||
v = __builtin_arm_ldrex(__p);
|
||||
while (__builtin_arm_strex(__x, __p));
|
||||
return v;
|
||||
}
|
||||
|
||||
|
@ -110,109 +112,115 @@ static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(voi
|
|||
/* 9.2 Miscellaneous data-processing intrinsics */
|
||||
/* ROR */
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__ror(uint32_t x, uint32_t y) {
|
||||
y %= 32;
|
||||
if (y == 0) return x;
|
||||
return (x >> y) | (x << (32 - y));
|
||||
__ror(uint32_t __x, uint32_t __y) {
|
||||
__y %= 32;
|
||||
if (__y == 0)
|
||||
return __x;
|
||||
return (__x >> __y) | (__x << (32 - __y));
|
||||
}
|
||||
|
||||
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
|
||||
__rorll(uint64_t x, uint32_t y) {
|
||||
y %= 64;
|
||||
if (y == 0) return x;
|
||||
return (x >> y) | (x << (64 - y));
|
||||
__rorll(uint64_t __x, uint32_t __y) {
|
||||
__y %= 64;
|
||||
if (__y == 0)
|
||||
return __x;
|
||||
return (__x >> __y) | (__x << (64 - __y));
|
||||
}
|
||||
|
||||
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
|
||||
__rorl(unsigned long x, uint32_t y) {
|
||||
__rorl(unsigned long __x, uint32_t __y) {
|
||||
#if __SIZEOF_LONG__ == 4
|
||||
return __ror(x, y);
|
||||
return __ror(__x, __y);
|
||||
#else
|
||||
return __rorll(x, y);
|
||||
return __rorll(__x, __y);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/* CLZ */
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__clz(uint32_t t) {
|
||||
return __builtin_clz(t);
|
||||
__clz(uint32_t __t) {
|
||||
return __builtin_clz(__t);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
|
||||
__clzl(unsigned long t) {
|
||||
return __builtin_clzl(t);
|
||||
__clzl(unsigned long __t) {
|
||||
return __builtin_clzl(__t);
|
||||
}
|
||||
|
||||
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
|
||||
__clzll(uint64_t t) {
|
||||
return __builtin_clzll(t);
|
||||
__clzll(uint64_t __t) {
|
||||
return __builtin_clzll(__t);
|
||||
}
|
||||
|
||||
/* REV */
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__rev(uint32_t t) {
|
||||
return __builtin_bswap32(t);
|
||||
__rev(uint32_t __t) {
|
||||
return __builtin_bswap32(__t);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
|
||||
__revl(unsigned long t) {
|
||||
__revl(unsigned long __t) {
|
||||
#if __SIZEOF_LONG__ == 4
|
||||
return __builtin_bswap32(t);
|
||||
return __builtin_bswap32(__t);
|
||||
#else
|
||||
return __builtin_bswap64(t);
|
||||
return __builtin_bswap64(__t);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
|
||||
__revll(uint64_t t) {
|
||||
return __builtin_bswap64(t);
|
||||
__revll(uint64_t __t) {
|
||||
return __builtin_bswap64(__t);
|
||||
}
|
||||
|
||||
/* REV16 */
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__rev16(uint32_t t) {
|
||||
return __ror(__rev(t), 16);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
|
||||
__rev16l(unsigned long t) {
|
||||
return __rorl(__revl(t), sizeof(long) / 2);
|
||||
__rev16(uint32_t __t) {
|
||||
return __ror(__rev(__t), 16);
|
||||
}
|
||||
|
||||
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
|
||||
__rev16ll(uint64_t t) {
|
||||
return __rorll(__revll(t), 32);
|
||||
__rev16ll(uint64_t __t) {
|
||||
return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16(__t);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
|
||||
__rev16l(unsigned long __t) {
|
||||
#if __SIZEOF_LONG__ == 4
|
||||
return __rev16(__t);
|
||||
#else
|
||||
return __rev16ll(__t);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* REVSH */
|
||||
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
|
||||
__revsh(int16_t t) {
|
||||
return __builtin_bswap16(t);
|
||||
__revsh(int16_t __t) {
|
||||
return __builtin_bswap16(__t);
|
||||
}
|
||||
|
||||
/* RBIT */
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__rbit(uint32_t t) {
|
||||
return __builtin_arm_rbit(t);
|
||||
__rbit(uint32_t __t) {
|
||||
return __builtin_arm_rbit(__t);
|
||||
}
|
||||
|
||||
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
|
||||
__rbitll(uint64_t t) {
|
||||
__rbitll(uint64_t __t) {
|
||||
#if __ARM_32BIT_STATE
|
||||
return (((uint64_t) __builtin_arm_rbit(t)) << 32) |
|
||||
__builtin_arm_rbit(t >> 32);
|
||||
return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
|
||||
__builtin_arm_rbit(__t >> 32);
|
||||
#else
|
||||
return __builtin_arm_rbit64(t);
|
||||
return __builtin_arm_rbit64(__t);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
|
||||
__rbitl(unsigned long t) {
|
||||
__rbitl(unsigned long __t) {
|
||||
#if __SIZEOF_LONG__ == 4
|
||||
return __rbit(t);
|
||||
return __rbit(__t);
|
||||
#else
|
||||
return __rbitll(t);
|
||||
return __rbitll(__t);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -231,61 +239,61 @@ static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
|
|||
/* 9.4.2 Saturating addition and subtraction intrinsics */
|
||||
#if __ARM_32BIT_STATE
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__qadd(int32_t t, int32_t v) {
|
||||
return __builtin_arm_qadd(t, v);
|
||||
__qadd(int32_t __t, int32_t __v) {
|
||||
return __builtin_arm_qadd(__t, __v);
|
||||
}
|
||||
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__qsub(int32_t t, int32_t v) {
|
||||
return __builtin_arm_qsub(t, v);
|
||||
__qsub(int32_t __t, int32_t __v) {
|
||||
return __builtin_arm_qsub(__t, __v);
|
||||
}
|
||||
|
||||
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__qdbl(int32_t t) {
|
||||
return __builtin_arm_qadd(t, t);
|
||||
__qdbl(int32_t __t) {
|
||||
return __builtin_arm_qadd(__t, __t);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* 9.7 CRC32 intrinsics */
|
||||
#if __ARM_FEATURE_CRC32
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__crc32b(uint32_t a, uint8_t b) {
|
||||
return __builtin_arm_crc32b(a, b);
|
||||
__crc32b(uint32_t __a, uint8_t __b) {
|
||||
return __builtin_arm_crc32b(__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__crc32h(uint32_t a, uint16_t b) {
|
||||
return __builtin_arm_crc32h(a, b);
|
||||
__crc32h(uint32_t __a, uint16_t __b) {
|
||||
return __builtin_arm_crc32h(__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__crc32w(uint32_t a, uint32_t b) {
|
||||
return __builtin_arm_crc32w(a, b);
|
||||
__crc32w(uint32_t __a, uint32_t __b) {
|
||||
return __builtin_arm_crc32w(__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__crc32d(uint32_t a, uint64_t b) {
|
||||
return __builtin_arm_crc32d(a, b);
|
||||
__crc32d(uint32_t __a, uint64_t __b) {
|
||||
return __builtin_arm_crc32d(__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__crc32cb(uint32_t a, uint8_t b) {
|
||||
return __builtin_arm_crc32cb(a, b);
|
||||
__crc32cb(uint32_t __a, uint8_t __b) {
|
||||
return __builtin_arm_crc32cb(__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__crc32ch(uint32_t a, uint16_t b) {
|
||||
return __builtin_arm_crc32ch(a, b);
|
||||
__crc32ch(uint32_t __a, uint16_t __b) {
|
||||
return __builtin_arm_crc32ch(__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__crc32cw(uint32_t a, uint32_t b) {
|
||||
return __builtin_arm_crc32cw(a, b);
|
||||
__crc32cw(uint32_t __a, uint32_t __b) {
|
||||
return __builtin_arm_crc32cw(__a, __b);
|
||||
}
|
||||
|
||||
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||
__crc32cd(uint32_t a, uint64_t b) {
|
||||
return __builtin_arm_crc32cd(a, b);
|
||||
__crc32cd(uint32_t __a, uint64_t __b) {
|
||||
return __builtin_arm_crc32cd(__a, __b);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,45 @@
|
|||
/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
/* Only include this if we're compiling for the windows platform. */
|
||||
#ifndef _MSC_VER
|
||||
#include_next <armintr.h>
|
||||
#else
|
||||
|
||||
#ifndef __ARMINTR_H
|
||||
#define __ARMINTR_H
|
||||
|
||||
typedef enum
|
||||
{
|
||||
_ARM_BARRIER_SY = 0xF,
|
||||
_ARM_BARRIER_ST = 0xE,
|
||||
_ARM_BARRIER_ISH = 0xB,
|
||||
_ARM_BARRIER_ISHST = 0xA,
|
||||
_ARM_BARRIER_NSH = 0x7,
|
||||
_ARM_BARRIER_NSHST = 0x6,
|
||||
_ARM_BARRIER_OSH = 0x3,
|
||||
_ARM_BARRIER_OSHST = 0x2
|
||||
} _ARMINTR_BARRIER_TYPE;
|
||||
|
||||
#endif /* __ARMINTR_H */
|
||||
#endif /* _MSC_VER */
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,131 +1,144 @@
|
|||
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512CDINTRIN_H
|
||||
#define __AVX512CDINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_conflict_epi64 (__m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
|
||||
(__v8di) _mm512_setzero_si512 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
|
||||
(__v8di) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
|
||||
(__v8di) _mm512_setzero_si512 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_conflict_epi32 (__m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
|
||||
(__v16si) _mm512_setzero_si512 (),
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
|
||||
(__v16si) __W,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
|
||||
(__v16si) _mm512_setzero_si512 (),
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_lzcnt_epi32 (__m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
|
||||
(__v16si) _mm512_setzero_si512 (),
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
|
||||
(__v16si) __W,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
|
||||
(__v16si) _mm512_setzero_si512 (),
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_lzcnt_epi64 (__m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
|
||||
(__v8di) _mm512_setzero_si512 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
|
||||
(__v8di) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
|
||||
(__v8di) _mm512_setzero_si512 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
||||
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512CDINTRIN_H
|
||||
#define __AVX512CDINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_conflict_epi64 (__m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
|
||||
(__v8di) _mm512_setzero_si512 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
|
||||
(__v8di) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
|
||||
(__v8di) _mm512_setzero_si512 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_conflict_epi32 (__m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
|
||||
(__v16si) _mm512_setzero_si512 (),
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
|
||||
(__v16si) __W,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
|
||||
(__v16si) _mm512_setzero_si512 (),
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_lzcnt_epi32 (__m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
|
||||
(__v16si) _mm512_setzero_si512 (),
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
|
||||
(__v16si) __W,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
|
||||
(__v16si) _mm512_setzero_si512 (),
|
||||
(__mmask16) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_lzcnt_epi64 (__m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
|
||||
(__v8di) _mm512_setzero_si512 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
|
||||
(__v8di) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
|
||||
(__v8di) _mm512_setzero_si512 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcastmb_epi64 (__mmask8 __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcastmb512 (__A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_broadcastmw_epi32 (__mmask16 __A)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_broadcastmw512 (__A);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
/*===---- avx512fintrin.h - AVX2 intrinsics -----------------------------------===
|
||||
/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
|
@ -31,66 +31,66 @@
|
|||
#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
|
||||
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
|
||||
(__v8df)_mm512_setzero_pd(), \
|
||||
(__mmask8)-1, (R)); })
|
||||
(__mmask8)-1, (int)(R)); })
|
||||
|
||||
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
|
||||
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
|
||||
(__v8df)(__m512d)(S), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__v8df)(__m512d)(S), (__mmask8)(M), \
|
||||
(int)(R)); })
|
||||
|
||||
#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
|
||||
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
|
||||
(__v8df)_mm512_setzero_pd(), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__mmask8)(M), (int)(R)); })
|
||||
|
||||
#define _mm512_exp2a23_pd(A) \
|
||||
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
|
||||
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
#define _mm512_mask_exp2a23_pd(S, M, A) \
|
||||
_mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||
_mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
#define _mm512_maskz_exp2a23_pd(M, A) \
|
||||
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
|
||||
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
|
||||
(__v16sf)_mm512_setzero_ps(), \
|
||||
(__mmask8)-1, (R)); })
|
||||
(__mmask16)-1, (int)(R)); })
|
||||
|
||||
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
|
||||
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
|
||||
(__v16sf)(__m512)(S), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__v16sf)(__m512)(S), (__mmask16)(M), \
|
||||
(int)(R)); })
|
||||
|
||||
#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
|
||||
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
|
||||
(__v16sf)_mm512_setzero_ps(), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__mmask16)(M), (int)(R)); })
|
||||
|
||||
#define _mm512_exp2a23_ps(A) \
|
||||
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
|
||||
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
#define _mm512_mask_exp2a23_ps(S, M, A) \
|
||||
_mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||
_mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
#define _mm512_maskz_exp2a23_ps(M, A) \
|
||||
_mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||
_mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
// rsqrt28
|
||||
#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
|
||||
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
|
||||
(__v8df)_mm512_setzero_pd(), \
|
||||
(__mmask8)-1, (R)); })
|
||||
(__mmask8)-1, (int)(R)); })
|
||||
|
||||
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
|
||||
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
|
||||
(__v8df)(__m512d)(S), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__v8df)(__m512d)(S), (__mmask8)(M), \
|
||||
(int)(R)); })
|
||||
|
||||
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
|
||||
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
|
||||
(__v8df)_mm512_setzero_pd(), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__mmask8)(M), (int)(R)); })
|
||||
|
||||
#define _mm512_rsqrt28_pd(A) \
|
||||
_mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
@ -104,17 +104,17 @@
|
|||
#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
|
||||
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
|
||||
(__v16sf)_mm512_setzero_ps(), \
|
||||
(__mmask16)-1, (R)); })
|
||||
(__mmask16)-1, (int)(R)); })
|
||||
|
||||
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
|
||||
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
|
||||
(__v16sf)(__m512)(S), \
|
||||
(__mmask16)(M), (R)); })
|
||||
(__v16sf)(__m512)(S), (__mmask16)(M), \
|
||||
(int)(R)); })
|
||||
|
||||
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
|
||||
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
|
||||
(__v16sf)_mm512_setzero_ps(), \
|
||||
(__mmask16)(M), (R)); })
|
||||
(__mmask16)(M), (int)(R)); })
|
||||
|
||||
#define _mm512_rsqrt28_ps(A) \
|
||||
_mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
@ -126,22 +126,22 @@
|
|||
_mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
|
||||
(__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)_mm_setzero_ps(), \
|
||||
(__mmask8)-1, (R)); })
|
||||
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)_mm_setzero_ps(), \
|
||||
(__mmask8)-1, (int)(R)); })
|
||||
|
||||
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
|
||||
(__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)(__m128)(S), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)(__m128)(S), \
|
||||
(__mmask8)(M), (int)(R)); })
|
||||
|
||||
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
|
||||
(__m128)__builtin_ia32_rsqrt28ss_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)_mm_setzero_ps(), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)_mm_setzero_ps(), \
|
||||
(__mmask8)(M), (int)(R)); })
|
||||
|
||||
#define _mm_rsqrt28_ss(A, B) \
|
||||
_mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||
|
@ -153,22 +153,22 @@
|
|||
_mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
|
||||
(__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)_mm_setzero_pd(), \
|
||||
(__mmask8)-1, (R)); })
|
||||
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)_mm_setzero_pd(), \
|
||||
(__mmask8)-1, (int)(R)); })
|
||||
|
||||
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
|
||||
(__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)(__m128d)(S), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)(__m128d)(S), \
|
||||
(__mmask8)(M), (int)(R)); })
|
||||
|
||||
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
|
||||
(__m128d)__builtin_ia32_rsqrt28sd_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)_mm_setzero_pd(), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)_mm_setzero_pd(), \
|
||||
(__mmask8)(M), (int)(R)); })
|
||||
|
||||
#define _mm_rsqrt28_sd(A, B) \
|
||||
_mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||
|
@ -177,23 +177,23 @@
|
|||
_mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
#define _mm_maskz_rsqrt28_sd(M, A, B) \
|
||||
_mm_mask_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||
_mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
// rcp28
|
||||
#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
|
||||
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
|
||||
(__v8df)_mm512_setzero_pd(), \
|
||||
(__mmask8)-1, (R)); })
|
||||
(__mmask8)-1, (int)(R)); })
|
||||
|
||||
#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
|
||||
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
|
||||
(__v8df)(__m512d)(S), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__v8df)(__m512d)(S), (__mmask8)(M), \
|
||||
(int)(R)); })
|
||||
|
||||
#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
|
||||
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
|
||||
(__v8df)_mm512_setzero_pd(), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__mmask8)(M), (int)(R)); })
|
||||
|
||||
#define _mm512_rcp28_pd(A) \
|
||||
_mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
@ -207,17 +207,17 @@
|
|||
#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
|
||||
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
|
||||
(__v16sf)_mm512_setzero_ps(), \
|
||||
(__mmask16)-1, (R)); })
|
||||
(__mmask16)-1, (int)(R)); })
|
||||
|
||||
#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
|
||||
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
|
||||
(__v16sf)(__m512)(S), \
|
||||
(__mmask16)(M), (R)); })
|
||||
(__v16sf)(__m512)(S), (__mmask16)(M), \
|
||||
(int)(R)); })
|
||||
|
||||
#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
|
||||
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
|
||||
(__v16sf)_mm512_setzero_ps(), \
|
||||
(__mmask16)(M), (R)); })
|
||||
(__mmask16)(M), (int)(R)); })
|
||||
|
||||
#define _mm512_rcp28_ps(A) \
|
||||
_mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
@ -229,22 +229,22 @@
|
|||
_mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
|
||||
(__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)_mm_setzero_ps(), \
|
||||
(__mmask8)-1, (R)); })
|
||||
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)_mm_setzero_ps(), \
|
||||
(__mmask8)-1, (int)(R)); })
|
||||
|
||||
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
|
||||
(__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)(__m128)(S), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)(__m128)(S), \
|
||||
(__mmask8)(M), (int)(R)); })
|
||||
|
||||
#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
|
||||
(__m128)__builtin_ia32_rcp28ss_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)_mm_setzero_ps(), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
|
||||
(__v4sf)(__m128)(B), \
|
||||
(__v4sf)_mm_setzero_ps(), \
|
||||
(__mmask8)(M), (int)(R)); })
|
||||
|
||||
#define _mm_rcp28_ss(A, B) \
|
||||
_mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||
|
@ -256,22 +256,22 @@
|
|||
_mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||
|
||||
#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
|
||||
(__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)_mm_setzero_pd(), \
|
||||
(__mmask8)-1, (R)); })
|
||||
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)_mm_setzero_pd(), \
|
||||
(__mmask8)-1, (int)(R)); })
|
||||
|
||||
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
|
||||
(__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)(__m128d)(S), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)(__m128d)(S), \
|
||||
(__mmask8)(M), (int)(R)); })
|
||||
|
||||
#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
|
||||
(__m128d)__builtin_ia32_rcp28sd_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)_mm_setzero_pd(), \
|
||||
(__mmask8)(M), (R)); })
|
||||
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
|
||||
(__v2df)(__m128d)(B), \
|
||||
(__v2df)_mm_setzero_pd(), \
|
||||
(__mmask8)(M), (int)(R)); })
|
||||
|
||||
#define _mm_rcp28_sd(A, B) \
|
||||
_mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,92 @@
|
|||
/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __IFMAINTRIN_H
|
||||
#define __IFMAINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma")))
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
|
||||
(__v8di) __Y,
|
||||
(__v8di) __Z,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
|
||||
__m512i __Y)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
|
||||
(__v8di) __X,
|
||||
(__v8di) __Y,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
|
||||
(__v8di) __Y,
|
||||
(__v8di) __Z,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
|
||||
(__v8di) __Y,
|
||||
(__v8di) __Z,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
|
||||
__m512i __Y)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
|
||||
(__v8di) __X,
|
||||
(__v8di) __Y,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
|
||||
(__v8di) __Y,
|
||||
(__v8di) __Z,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -0,0 +1,149 @@
|
|||
/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __IFMAVLINTRIN_H
|
||||
#define __IFMAVLINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl")))
|
||||
|
||||
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
|
||||
(__v2di) __Y,
|
||||
(__v2di) __Z,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
|
||||
(__v2di) __X,
|
||||
(__v2di) __Y,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
|
||||
(__v2di) __Y,
|
||||
(__v2di) __Z,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
|
||||
(__v4di) __Y,
|
||||
(__v4di) __Z,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
|
||||
__m256i __Y)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
|
||||
(__v4di) __X,
|
||||
(__v4di) __Y,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
|
||||
(__v4di) __Y,
|
||||
(__v4di) __Z,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
|
||||
(__v2di) __Y,
|
||||
(__v2di) __Z,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
|
||||
(__v2di) __X,
|
||||
(__v2di) __Y,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
|
||||
(__v2di) __Y,
|
||||
(__v2di) __Z,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
|
||||
(__v4di) __Y,
|
||||
(__v4di) __Z,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
|
||||
__m256i __Y)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
|
||||
(__v4di) __X,
|
||||
(__v4di) __Y,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
|
||||
(__v4di) __Y,
|
||||
(__v4di) __Z,
|
||||
(__mmask8) __M);
|
||||
}
|
||||
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -0,0 +1,111 @@
|
|||
/*===------------- avx512pfintrin.h - PF intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512PFINTRIN_H
|
||||
#define __AVX512PFINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
|
||||
|
||||
#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
|
||||
(long long const *)(addr), (int)(scale), \
|
||||
(int)(hint)); })
|
||||
|
||||
#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
|
||||
(long long const *)(addr), (int)(scale), \
|
||||
(int)(hint)); })
|
||||
|
||||
#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) ({\
|
||||
__builtin_ia32_gatherpfdps((__mmask16)(mask), \
|
||||
(__v16si)(__m512i)(index), (int const *)(addr), \
|
||||
(int)(scale), (int)(hint)); })
|
||||
|
||||
#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) ({\
|
||||
__builtin_ia32_gatherpfdps((__mmask16) -1, \
|
||||
(__v16si)(__m512i)(index), (int const *)(addr), \
|
||||
(int)(scale), (int)(hint)); })
|
||||
|
||||
#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||
(long long const *)(addr), (int)(scale), \
|
||||
(int)(hint)); })
|
||||
|
||||
#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
|
||||
(long long const *)(addr), (int)(scale), \
|
||||
(int)(hint)); })
|
||||
|
||||
#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) ({\
|
||||
__builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||
(int const *)(addr), (int)(scale), (int)(hint)); })
|
||||
|
||||
#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) ({\
|
||||
__builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
|
||||
(int const *)(addr), (int)(scale), (int)(hint)); })
|
||||
|
||||
#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
|
||||
(long long *)(addr), (int)(scale), \
|
||||
(int)(hint)); })
|
||||
|
||||
#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
|
||||
(long long *)(addr), (int)(scale), \
|
||||
(int)(hint)); })
|
||||
|
||||
#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
|
||||
(int *)(addr), (int)(scale), (int)(hint)); })
|
||||
|
||||
#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_scatterpfdps((__mmask16)(mask), \
|
||||
(__v16si)(__m512i)(index), (int *)(addr), \
|
||||
(int)(scale), (int)(hint)); })
|
||||
|
||||
#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
|
||||
(long long *)(addr), (int)(scale), \
|
||||
(int)(hint)); })
|
||||
|
||||
#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||
(long long *)(addr), (int)(scale), \
|
||||
(int)(hint)); })
|
||||
|
||||
#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
|
||||
(int *)(addr), (int)(scale), (int)(hint)); })
|
||||
|
||||
#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
|
||||
__builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||
(int *)(addr), (int)(scale), (int)(hint)); })
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -0,0 +1,137 @@
|
|||
/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __VBMIINTRIN_H
|
||||
#define __VBMIINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi")))
|
||||
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
|
||||
__mmask64 __U, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
|
||||
(__v64qi) __I
|
||||
/* idx */ ,
|
||||
(__v64qi) __B,
|
||||
(__mmask64) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
|
||||
/* idx */ ,
|
||||
(__v64qi) __A,
|
||||
(__v64qi) __B,
|
||||
(__mmask64) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
|
||||
__m512i __I, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
|
||||
/* idx */ ,
|
||||
(__v64qi) __A,
|
||||
(__v64qi) __B,
|
||||
(__mmask64) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
|
||||
__m512i __I, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
|
||||
/* idx */ ,
|
||||
(__v64qi) __A,
|
||||
(__v64qi) __B,
|
||||
(__mmask64) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
|
||||
(__v64qi) __A,
|
||||
(__v64qi) _mm512_undefined_epi32 (),
|
||||
(__mmask64) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
|
||||
__m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
|
||||
(__v64qi) __A,
|
||||
(__v64qi) _mm512_setzero_si512(),
|
||||
(__mmask64) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
|
||||
__m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
|
||||
(__v64qi) __A,
|
||||
(__v64qi) __W,
|
||||
(__mmask64) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
|
||||
(__v64qi) __Y,
|
||||
(__v64qi) __W,
|
||||
(__mmask64) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
|
||||
(__v64qi) __Y,
|
||||
(__v64qi) _mm512_setzero_si512 (),
|
||||
(__mmask64) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
|
||||
(__v64qi) __Y,
|
||||
(__v64qi) _mm512_undefined_epi32 (),
|
||||
(__mmask64) -1);
|
||||
}
|
||||
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -0,0 +1,247 @@
|
|||
/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __VBMIVLINTRIN_H
|
||||
#define __VBMIVLINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl")))
|
||||
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
|
||||
__m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
|
||||
(__v16qi) __I
|
||||
/* idx */ ,
|
||||
(__v16qi) __B,
|
||||
(__mmask16)
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
|
||||
__mmask32 __U, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
|
||||
(__v32qi) __I
|
||||
/* idx */ ,
|
||||
(__v32qi) __B,
|
||||
(__mmask32)
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
|
||||
/* idx */ ,
|
||||
(__v16qi) __A,
|
||||
(__v16qi) __B,
|
||||
(__mmask16) -
|
||||
1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
|
||||
__m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
|
||||
/* idx */ ,
|
||||
(__v16qi) __A,
|
||||
(__v16qi) __B,
|
||||
(__mmask16)
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
|
||||
__m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
|
||||
/* idx */ ,
|
||||
(__v16qi) __A,
|
||||
(__v16qi) __B,
|
||||
(__mmask16)
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
|
||||
/* idx */ ,
|
||||
(__v32qi) __A,
|
||||
(__v32qi) __B,
|
||||
(__mmask32) -
|
||||
1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
|
||||
__m256i __I, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
|
||||
/* idx */ ,
|
||||
(__v32qi) __A,
|
||||
(__v32qi) __B,
|
||||
(__mmask32)
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
|
||||
__m256i __I, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
|
||||
/* idx */ ,
|
||||
(__v32qi) __A,
|
||||
(__v32qi) __B,
|
||||
(__mmask32)
|
||||
__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
|
||||
(__v16qi) __A,
|
||||
(__v16qi) _mm_undefined_si128 (),
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
|
||||
(__v16qi) __A,
|
||||
(__v16qi) _mm_setzero_si128 (),
|
||||
(__mmask16) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
|
||||
__m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
|
||||
(__v16qi) __A,
|
||||
(__v16qi) __W,
|
||||
(__mmask16) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
|
||||
(__v32qi) __A,
|
||||
(__v32qi) _mm256_undefined_si256 (),
|
||||
(__mmask32) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
|
||||
__m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
|
||||
(__v32qi) __A,
|
||||
(__v32qi) _mm256_setzero_si256 (),
|
||||
(__mmask32) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
|
||||
__m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
|
||||
(__v32qi) __A,
|
||||
(__v32qi) __W,
|
||||
(__mmask32) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
|
||||
(__v16qi) __Y,
|
||||
(__v16qi) __W,
|
||||
(__mmask16) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
|
||||
(__v16qi) __Y,
|
||||
(__v16qi)
|
||||
_mm_setzero_si128 (),
|
||||
(__mmask16) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
|
||||
(__v16qi) __Y,
|
||||
(__v16qi)
|
||||
_mm_undefined_si128 (),
|
||||
(__mmask16) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
|
||||
(__v32qi) __Y,
|
||||
(__v32qi) __W,
|
||||
(__mmask32) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
|
||||
(__v32qi) __Y,
|
||||
(__v32qi)
|
||||
_mm256_setzero_si256 (),
|
||||
(__mmask32) __M);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
|
||||
(__v32qi) __Y,
|
||||
(__v32qi)
|
||||
_mm256_undefined_si256 (),
|
||||
(__mmask32) -1);
|
||||
}
|
||||
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,263 @@
|
|||
/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVX512VLCDINTRIN_H
|
||||
#define __AVX512VLCDINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd")))
|
||||
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_broadcastmb_epi64 (__mmask8 __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_broadcastmb128 (__A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_broadcastmb_epi64 (__mmask8 __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_broadcastmb256 (__A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_broadcastmw_epi32 (__mmask16 __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_broadcastmw128 (__A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_broadcastmw_epi32 (__mmask16 __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_broadcastmw256 (__A);
|
||||
}
|
||||
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_conflict_epi64 (__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
|
||||
(__v2di) _mm_undefined_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
|
||||
(__v2di) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_di (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_conflict_epi64 (__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
|
||||
(__v4di) _mm256_undefined_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
|
||||
(__v4di) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
|
||||
(__v4di) _mm256_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_conflict_epi32 (__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
|
||||
(__v4si) _mm_undefined_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
|
||||
(__v4si) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
|
||||
(__v4si) _mm_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_conflict_epi32 (__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
|
||||
(__v8si) _mm256_undefined_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
|
||||
(__v8si) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
|
||||
(__v8si)
|
||||
_mm256_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_lzcnt_epi32 (__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
|
||||
(__v4si)
|
||||
_mm_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
|
||||
(__v4si) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
|
||||
(__v4si)
|
||||
_mm_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_lzcnt_epi32 (__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
|
||||
(__v8si)
|
||||
_mm256_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
|
||||
(__v8si) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
|
||||
(__v8si)
|
||||
_mm256_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_lzcnt_epi64 (__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_di (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
|
||||
(__v2di) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_di (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_lzcnt_epi64 (__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
|
||||
(__v4di) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __AVX512VLCDINTRIN_H */
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -25,15 +25,11 @@
|
|||
#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __BMI2__
|
||||
# error "BMI2 instruction set not enabled"
|
||||
#endif /* __BMI2__ */
|
||||
|
||||
#ifndef __BMI2INTRIN_H
|
||||
#define __BMI2INTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
_bzhi_u32(unsigned int __X, unsigned int __Y)
|
||||
|
|
|
@ -25,30 +25,149 @@
|
|||
#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __BMI__
|
||||
# error "BMI instruction set not enabled"
|
||||
#endif /* __BMI__ */
|
||||
|
||||
#ifndef __BMIINTRIN_H
|
||||
#define __BMIINTRIN_H
|
||||
|
||||
/// \brief Counts the number of trailing zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned short _tzcnt_u16(unsigned short a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
|
||||
/// \returns An unsigned 16-bit integer containing the number of trailing zero
|
||||
/// bits in the operand.
|
||||
#define _tzcnt_u16(a) (__tzcnt_u16((a)))
|
||||
|
||||
/// \brief Performs a bitwise AND of the second operand with the one's
|
||||
/// complement of the first operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned int _andn_u32(unsigned int a, unsigned int b);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned integer containing one of the operands.
|
||||
/// \param b
|
||||
/// An unsigned integer containing one of the operands.
|
||||
/// \returns An unsigned integer containing the bitwise AND of the second
|
||||
/// operand with the one's complement of the first operand.
|
||||
#define _andn_u32(a, b) (__andn_u32((a), (b)))
|
||||
|
||||
/* _bextr_u32 != __bextr_u32 */
|
||||
/// \brief Clears all bits in the source except for the least significant bit
|
||||
/// containing a value of 1 and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned int _blsi_u32(unsigned int a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned integer whose bits are to be cleared.
|
||||
/// \returns An unsigned integer containing the result of clearing the bits from
|
||||
/// the source operand.
|
||||
#define _blsi_u32(a) (__blsi_u32((a)))
|
||||
|
||||
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
|
||||
/// including the least siginificant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned int _blsmsk_u32(unsigned int a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned integer used to create the mask.
|
||||
/// \returns An unsigned integer containing the newly created mask.
|
||||
#define _blsmsk_u32(a) (__blsmsk_u32((a)))
|
||||
|
||||
/// \brief Clears the least siginificant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned int _blsr_u32(unsigned int a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned integer containing the operand to be cleared.
|
||||
/// \returns An unsigned integer containing the result of clearing the source
|
||||
/// operand.
|
||||
#define _blsr_u32(a) (__blsr_u32((a)))
|
||||
|
||||
/// \brief Counts the number of trailing zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned int _tzcnt_u32(unsigned int a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
|
||||
/// \returns An unsigned 32-bit integer containing the number of trailing zero
|
||||
/// bits in the operand.
|
||||
#define _tzcnt_u32(a) (__tzcnt_u32((a)))
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
|
||||
|
||||
static __inline__ unsigned short __DEFAULT_FN_ATTRS
|
||||
/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
|
||||
instruction behaves as BSF on non-BMI targets, there is code that expects
|
||||
to use it as a potentially faster version of BSF. */
|
||||
#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
|
||||
/// \brief Counts the number of trailing zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
|
||||
/// \returns An unsigned 16-bit integer containing the number of trailing zero
|
||||
/// bits in the operand.
|
||||
static __inline__ unsigned short __RELAXED_FN_ATTRS
|
||||
__tzcnt_u16(unsigned short __X)
|
||||
{
|
||||
return __X ? __builtin_ctzs(__X) : 16;
|
||||
}
|
||||
|
||||
/// \brief Performs a bitwise AND of the second operand with the one's
|
||||
/// complement of the first operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned integer containing one of the operands.
|
||||
/// \param __Y
|
||||
/// An unsigned integer containing one of the operands.
|
||||
/// \returns An unsigned integer containing the bitwise AND of the second
|
||||
/// operand with the one's complement of the first operand.
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__andn_u32(unsigned int __X, unsigned int __Y)
|
||||
{
|
||||
|
@ -56,6 +175,21 @@ __andn_u32(unsigned int __X, unsigned int __Y)
|
|||
}
|
||||
|
||||
/* AMD-specified, double-leading-underscore version of BEXTR */
|
||||
/// \brief Extracts the specified bits from the first operand and returns them
|
||||
/// in the least significant bits of the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned integer whose bits are to be extracted.
|
||||
/// \param __Y
|
||||
/// An unsigned integer used to specify which bits are extracted. Bits [7:0]
|
||||
/// specify the index of the least significant bit. Bits [15:8] specify the
|
||||
/// number of bits to be extracted.
|
||||
/// \returns An unsigned integer whose least significant bits contain the
|
||||
/// extracted bits.
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__bextr_u32(unsigned int __X, unsigned int __Y)
|
||||
{
|
||||
|
@ -63,45 +197,214 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
|
|||
}
|
||||
|
||||
/* Intel-specified, single-leading-underscore version of BEXTR */
|
||||
/// \brief Extracts the specified bits from the first operand and returns them
|
||||
/// in the least significant bits of the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned integer whose bits are to be extracted.
|
||||
/// \param __Y
|
||||
/// An unsigned integer used to specify the index of the least significant
|
||||
/// bit for the bits to be extracted. Bits [7:0] specify the index.
|
||||
/// \param __Z
|
||||
/// An unsigned integer used to specify the number of bits to be extracted.
|
||||
/// Bits [7:0] specify the number of bits.
|
||||
/// \returns An unsigned integer whose least significant bits contain the
|
||||
/// extracted bits.
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
|
||||
{
|
||||
return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
|
||||
}
|
||||
|
||||
/// \brief Clears all bits in the source except for the least significant bit
|
||||
/// containing a value of 1 and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned integer whose bits are to be cleared.
|
||||
/// \returns An unsigned integer containing the result of clearing the bits from
|
||||
/// the source operand.
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__blsi_u32(unsigned int __X)
|
||||
{
|
||||
return __X & -__X;
|
||||
}
|
||||
|
||||
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
|
||||
/// including the least siginificant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned integer used to create the mask.
|
||||
/// \returns An unsigned integer containing the newly created mask.
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__blsmsk_u32(unsigned int __X)
|
||||
{
|
||||
return __X ^ (__X - 1);
|
||||
}
|
||||
|
||||
/// \brief Clears the least siginificant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned integer containing the operand to be cleared.
|
||||
/// \returns An unsigned integer containing the result of clearing the source
|
||||
/// operand.
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__blsr_u32(unsigned int __X)
|
||||
{
|
||||
return __X & (__X - 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
/// \brief Counts the number of trailing zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
|
||||
/// \returns An unsigned 32-bit integer containing the number of trailing zero
|
||||
/// bits in the operand.
|
||||
static __inline__ unsigned int __RELAXED_FN_ATTRS
|
||||
__tzcnt_u32(unsigned int __X)
|
||||
{
|
||||
return __X ? __builtin_ctz(__X) : 32;
|
||||
}
|
||||
|
||||
/// \brief Counts the number of trailing zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
|
||||
/// \returns An 32-bit integer containing the number of trailing zero bits in
|
||||
/// the operand.
|
||||
static __inline__ int __RELAXED_FN_ATTRS
|
||||
_mm_tzcnt_32(unsigned int __X)
|
||||
{
|
||||
return __X ? __builtin_ctz(__X) : 32;
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
|
||||
/// \brief Performs a bitwise AND of the second operand with the one's
|
||||
/// complement of the first operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 64-bit integer containing one of the operands.
|
||||
/// \param b
|
||||
/// An unsigned 64-bit integer containing one of the operands.
|
||||
/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
|
||||
/// operand with the one's complement of the first operand.
|
||||
#define _andn_u64(a, b) (__andn_u64((a), (b)))
|
||||
|
||||
/* _bextr_u64 != __bextr_u64 */
|
||||
/// \brief Clears all bits in the source except for the least significant bit
|
||||
/// containing a value of 1 and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _blsi_u64(unsigned long long a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 64-bit integer whose bits are to be cleared.
|
||||
/// \returns An unsigned 64-bit integer containing the result of clearing the
|
||||
/// bits from the source operand.
|
||||
#define _blsi_u64(a) (__blsi_u64((a)))
|
||||
|
||||
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
|
||||
/// including the least siginificant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _blsmsk_u64(unsigned long long a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 64-bit integer used to create the mask.
|
||||
/// \returns A unsigned 64-bit integer containing the newly created mask.
|
||||
#define _blsmsk_u64(a) (__blsmsk_u64((a)))
|
||||
|
||||
/// \brief Clears the least siginificant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _blsr_u64(unsigned long long a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 64-bit integer containing the operand to be cleared.
|
||||
/// \returns An unsigned 64-bit integer containing the result of clearing the
|
||||
/// source operand.
|
||||
#define _blsr_u64(a) (__blsr_u64((a)))
|
||||
|
||||
/// \brief Counts the number of trailing zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _tzcnt_u64(unsigned long long a);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
|
||||
/// \returns An unsigned 64-bit integer containing the number of trailing zero
|
||||
/// bits in the operand.
|
||||
#define _tzcnt_u64(a) (__tzcnt_u64((a)))
|
||||
|
||||
/// \brief Performs a bitwise AND of the second operand with the one's
|
||||
/// complement of the first operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer containing one of the operands.
|
||||
/// \param __Y
|
||||
/// An unsigned 64-bit integer containing one of the operands.
|
||||
/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
|
||||
/// operand with the one's complement of the first operand.
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__andn_u64 (unsigned long long __X, unsigned long long __Y)
|
||||
{
|
||||
|
@ -109,6 +412,21 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y)
|
|||
}
|
||||
|
||||
/* AMD-specified, double-leading-underscore version of BEXTR */
|
||||
/// \brief Extracts the specified bits from the first operand and returns them
|
||||
/// in the least significant bits of the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer whose bits are to be extracted.
|
||||
/// \param __Y
|
||||
/// An unsigned 64-bit integer used to specify which bits are extracted. Bits
|
||||
/// [7:0] specify the index of the least significant bit. Bits [15:8] specify
|
||||
/// the number of bits to be extracted.
|
||||
/// \returns An unsigned 64-bit integer whose least significant bits contain the
|
||||
/// extracted bits.
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__bextr_u64(unsigned long long __X, unsigned long long __Y)
|
||||
{
|
||||
|
@ -116,38 +434,115 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
|
|||
}
|
||||
|
||||
/* Intel-specified, single-leading-underscore version of BEXTR */
|
||||
/// \brief Extracts the specified bits from the first operand and returns them
|
||||
/// in the least significant bits of the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer whose bits are to be extracted.
|
||||
/// \param __Y
|
||||
/// An unsigned integer used to specify the index of the least significant
|
||||
/// bit for the bits to be extracted. Bits [7:0] specify the index.
|
||||
/// \param __Z
|
||||
/// An unsigned integer used to specify the number of bits to be extracted.
|
||||
/// Bits [7:0] specify the number of bits.
|
||||
/// \returns An unsigned 64-bit integer whose least significant bits contain the
|
||||
/// extracted bits.
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
|
||||
{
|
||||
return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
|
||||
}
|
||||
|
||||
/// \brief Clears all bits in the source except for the least significant bit
|
||||
/// containing a value of 1 and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer whose bits are to be cleared.
|
||||
/// \returns An unsigned 64-bit integer containing the result of clearing the
|
||||
/// bits from the source operand.
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blsi_u64(unsigned long long __X)
|
||||
{
|
||||
return __X & -__X;
|
||||
}
|
||||
|
||||
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
|
||||
/// including the least siginificant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer used to create the mask.
|
||||
/// \returns A unsigned 64-bit integer containing the newly created mask.
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blsmsk_u64(unsigned long long __X)
|
||||
{
|
||||
return __X ^ (__X - 1);
|
||||
}
|
||||
|
||||
/// \brief Clears the least siginificant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer containing the operand to be cleared.
|
||||
/// \returns An unsigned 64-bit integer containing the result of clearing the
|
||||
/// source operand.
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blsr_u64(unsigned long long __X)
|
||||
{
|
||||
return __X & (__X - 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
/// \brief Counts the number of trailing zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
|
||||
/// \returns An unsigned 64-bit integer containing the number of trailing zero
|
||||
/// bits in the operand.
|
||||
static __inline__ unsigned long long __RELAXED_FN_ATTRS
|
||||
__tzcnt_u64(unsigned long long __X)
|
||||
{
|
||||
return __X ? __builtin_ctzll(__X) : 64;
|
||||
}
|
||||
|
||||
/// \brief Counts the number of trailing zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
|
||||
/// \returns An 64-bit integer containing the number of trailing zero bits in
|
||||
/// the operand.
|
||||
static __inline__ long long __RELAXED_FN_ATTRS
|
||||
_mm_tzcnt_64(unsigned long long __X)
|
||||
{
|
||||
return __X ? __builtin_ctzll(__X) : 64;
|
||||
}
|
||||
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
#undef __RELAXED_FN_ATTRS
|
||||
|
||||
#endif /* __BMIINTRIN_H */
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __CLFLUSHOPTINTRIN_H
|
||||
#define __CLFLUSHOPTINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt")))
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_clflushopt(char * __m) {
|
||||
__builtin_ia32_clflushopt(__m);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -82,6 +82,7 @@
|
|||
/* Features in %ecx for level 1 */
|
||||
#define bit_SSE3 0x00000001
|
||||
#define bit_PCLMULQDQ 0x00000002
|
||||
#define bit_PCLMUL bit_PCLMULQDQ /* for gcc compat */
|
||||
#define bit_DTES64 0x00000004
|
||||
#define bit_MONITOR 0x00000008
|
||||
#define bit_DSCPL 0x00000010
|
||||
|
@ -98,15 +99,19 @@
|
|||
#define bit_PCID 0x00020000
|
||||
#define bit_DCA 0x00040000
|
||||
#define bit_SSE41 0x00080000
|
||||
#define bit_SSE4_1 bit_SSE41 /* for gcc compat */
|
||||
#define bit_SSE42 0x00100000
|
||||
#define bit_SSE4_2 bit_SSE42 /* for gcc compat */
|
||||
#define bit_x2APIC 0x00200000
|
||||
#define bit_MOVBE 0x00400000
|
||||
#define bit_POPCNT 0x00800000
|
||||
#define bit_TSCDeadline 0x01000000
|
||||
#define bit_AESNI 0x02000000
|
||||
#define bit_AES bit_AESNI /* for gcc compat */
|
||||
#define bit_XSAVE 0x04000000
|
||||
#define bit_OSXSAVE 0x08000000
|
||||
#define bit_AVX 0x10000000
|
||||
#define bit_F16C 0x20000000
|
||||
#define bit_RDRND 0x40000000
|
||||
|
||||
/* Features in %edx for level 1 */
|
||||
|
@ -119,6 +124,7 @@
|
|||
#define bit_PAE 0x00000040
|
||||
#define bit_MCE 0x00000080
|
||||
#define bit_CX8 0x00000100
|
||||
#define bit_CMPXCHG8B bit_CX8 /* for gcc compat */
|
||||
#define bit_APIC 0x00000200
|
||||
#define bit_SEP 0x00000800
|
||||
#define bit_MTRR 0x00001000
|
||||
|
@ -133,7 +139,7 @@
|
|||
#define bit_ACPI 0x00400000
|
||||
#define bit_MMX 0x00800000
|
||||
#define bit_FXSR 0x01000000
|
||||
#define bit_FXSAVE bit_FXSR /* for gcc compat */
|
||||
#define bit_FXSAVE bit_FXSR /* for gcc compat */
|
||||
#define bit_SSE 0x02000000
|
||||
#define bit_SSE2 0x04000000
|
||||
#define bit_SS 0x08000000
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
/*===---- complex - CUDA wrapper for <algorithm> ----------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
|
||||
#define __CLANG_CUDA_WRAPPERS_ALGORITHM
|
||||
|
||||
// This header defines __device__ overloads of std::min/max, but only if we're
|
||||
// <= C++11. In C++14, these functions are constexpr, and so are implicitly
|
||||
// __host__ __device__.
|
||||
//
|
||||
// We don't support the initializer_list overloads because
|
||||
// initializer_list::begin() and end() are not __host__ __device__ functions.
|
||||
//
|
||||
// When compiling in C++14 mode, we could force std::min/max to have different
|
||||
// implementations for host and device, by declaring the device overloads
|
||||
// before the constexpr overloads appear. We choose not to do this because
|
||||
|
||||
// a) why write our own implementation when we can use one from the standard
|
||||
// library? and
|
||||
// b) libstdc++ is evil and declares min/max inside a header that is included
|
||||
// *before* we include <algorithm>. So we'd have to unconditionally
|
||||
// declare our __device__ overloads of min/max, but that would pollute
|
||||
// things for people who choose not to include <algorithm>.
|
||||
|
||||
#include_next <algorithm>
|
||||
|
||||
#if __cplusplus <= 201103L
|
||||
|
||||
// We need to define these overloads in exactly the namespace our standard
|
||||
// library uses (including the right inline namespace), otherwise they won't be
|
||||
// picked up by other functions in the standard library (e.g. functions in
|
||||
// <complex>). Thus the ugliness below.
|
||||
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
|
||||
_LIBCPP_BEGIN_NAMESPACE_STD
|
||||
#else
|
||||
namespace std {
|
||||
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
#endif
|
||||
#endif
|
||||
|
||||
template <class __T, class __Cmp>
|
||||
inline __device__ const __T &
|
||||
max(const __T &__a, const __T &__b, __Cmp __cmp) {
|
||||
return __cmp(__a, __b) ? __b : __a;
|
||||
}
|
||||
|
||||
template <class __T>
|
||||
inline __device__ const __T &
|
||||
max(const __T &__a, const __T &__b) {
|
||||
return __a < __b ? __b : __a;
|
||||
}
|
||||
|
||||
template <class __T, class __Cmp>
|
||||
inline __device__ const __T &
|
||||
min(const __T &__a, const __T &__b, __Cmp __cmp) {
|
||||
return __cmp(__b, __a) ? __b : __a;
|
||||
}
|
||||
|
||||
template <class __T>
|
||||
inline __device__ const __T &
|
||||
min(const __T &__a, const __T &__b) {
|
||||
return __a < __b ? __b : __a;
|
||||
}
|
||||
|
||||
#ifdef _LIBCPP_END_NAMESPACE_STD
|
||||
_LIBCPP_END_NAMESPACE_STD
|
||||
#else
|
||||
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_GLIBCXX_END_NAMESPACE_VERSION
|
||||
#endif
|
||||
} // namespace std
|
||||
#endif
|
||||
|
||||
#endif // __cplusplus <= 201103L
|
||||
#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM
|
|
@ -0,0 +1,82 @@
|
|||
/*===---- complex - CUDA wrapper for <complex> ------------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
|
||||
#define __CLANG_CUDA_WRAPPERS_COMPLEX
|
||||
|
||||
// Wrapper around <complex> that forces its functions to be __host__
|
||||
// __device__.
|
||||
|
||||
// First, include host-only headers we think are likely to be included by
|
||||
// <complex>, so that the pragma below only applies to <complex> itself.
|
||||
#if __cplusplus >= 201103L
|
||||
#include <type_traits>
|
||||
#endif
|
||||
#include <stdexcept>
|
||||
#include <cmath>
|
||||
#include <sstream>
|
||||
|
||||
// Next, include our <algorithm> wrapper, to ensure that device overloads of
|
||||
// std::min/max are available.
|
||||
#include <algorithm>
|
||||
|
||||
#pragma clang force_cuda_host_device begin
|
||||
|
||||
// When compiling for device, ask libstdc++ to use its own implements of
|
||||
// complex functions, rather than calling builtins (which resolve to library
|
||||
// functions that don't exist when compiling CUDA device code).
|
||||
//
|
||||
// This is a little dicey, because it causes libstdc++ to define a different
|
||||
// set of overloads on host and device.
|
||||
//
|
||||
// // Present only when compiling for host.
|
||||
// __host__ __device__ void complex<float> sin(const complex<float>& x) {
|
||||
// return __builtin_csinf(x);
|
||||
// }
|
||||
//
|
||||
// // Present when compiling for host and for device.
|
||||
// template <typename T>
|
||||
// void __host__ __device__ complex<T> sin(const complex<T>& x) {
|
||||
// return complex<T>(sin(x.real()) * cosh(x.imag()),
|
||||
// cos(x.real()), sinh(x.imag()));
|
||||
// }
|
||||
//
|
||||
// This is safe because when compiling for device, all function calls in
|
||||
// __host__ code to sin() will still resolve to *something*, even if they don't
|
||||
// resolve to the same function as they resolve to when compiling for host. We
|
||||
// don't care that they don't resolve to the right function because we won't
|
||||
// codegen this host code when compiling for device.
|
||||
|
||||
#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
|
||||
#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
|
||||
#define _GLIBCXX_USE_C99_COMPLEX 0
|
||||
#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
|
||||
|
||||
#include_next <complex>
|
||||
|
||||
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
|
||||
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
|
||||
|
||||
#pragma clang force_cuda_host_device end
|
||||
|
||||
#endif // include guard
|
|
@ -0,0 +1,47 @@
|
|||
/*===---- complex - CUDA wrapper for <new> ------------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_CUDA_WRAPPERS_NEW
|
||||
#define __CLANG_CUDA_WRAPPERS_NEW
|
||||
|
||||
#include_next <new>
|
||||
|
||||
// Device overrides for placement new and delete.
|
||||
#pragma push_macro("CUDA_NOEXCEPT")
|
||||
#if __cplusplus >= 201103L
|
||||
#define CUDA_NOEXCEPT noexcept
|
||||
#else
|
||||
#define CUDA_NOEXCEPT
|
||||
#endif
|
||||
|
||||
__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
|
||||
return __ptr;
|
||||
}
|
||||
__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
|
||||
return __ptr;
|
||||
}
|
||||
__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
|
||||
__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
|
||||
#pragma pop_macro("CUDA_NOEXCEPT")
|
||||
|
||||
#endif // include guard
|
File diff suppressed because it is too large
Load Diff
|
@ -21,43 +21,104 @@
|
|||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
|
||||
#error "Never use <f16cintrin.h> directly; include <x86intrin.h> instead."
|
||||
#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
|
||||
#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __F16C__
|
||||
# error "F16C instruction is not enabled"
|
||||
#endif /* __F16C__ */
|
||||
|
||||
#ifndef __F16CINTRIN_H
|
||||
#define __F16CINTRIN_H
|
||||
|
||||
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
|
||||
typedef float __m256 __attribute__ ((__vector_size__ (32)));
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("f16c")))
|
||||
|
||||
#define _mm_cvtps_ph(a, imm) __extension__ ({ \
|
||||
__m128 __a = (a); \
|
||||
(__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__a, (imm)); })
|
||||
/// \brief Converts a 16-bit half-precision float value into a 32-bit float
|
||||
/// value.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 16-bit half-precision float value.
|
||||
/// \returns The converted 32-bit float value.
|
||||
static __inline float __DEFAULT_FN_ATTRS
|
||||
_cvtsh_ss(unsigned short __a)
|
||||
{
|
||||
__v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
|
||||
__v4sf r = __builtin_ia32_vcvtph2ps(v);
|
||||
return r[0];
|
||||
}
|
||||
|
||||
#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
|
||||
__m256 __a = (a); \
|
||||
(__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__a, (imm)); })
|
||||
/// \brief Converts a 32-bit single-precision float value to a 16-bit
|
||||
/// half-precision float value.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned short _cvtss_sh(float a, const int imm);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// A 32-bit single-precision float value to be converted to a 16-bit
|
||||
/// half-precision float value.
|
||||
/// \param imm
|
||||
/// An immediate value controlling rounding using bits [2:0]: \n
|
||||
/// 000: Nearest \n
|
||||
/// 001: Down \n
|
||||
/// 010: Up \n
|
||||
/// 011: Truncate \n
|
||||
/// 1XX: Use MXCSR.RC for rounding
|
||||
/// \returns The converted 16-bit half-precision float value.
|
||||
#define _cvtss_sh(a, imm) \
|
||||
((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
|
||||
(imm)))[0]))
|
||||
|
||||
/// \brief Converts a 128-bit vector containing 32-bit float values into a
|
||||
/// 128-bit vector containing 16-bit half-precision float values.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// A 128-bit vector containing 32-bit float values.
|
||||
/// \param imm
|
||||
/// An immediate value controlling rounding using bits [2:0]: \n
|
||||
/// 000: Nearest \n
|
||||
/// 001: Down \n
|
||||
/// 010: Up \n
|
||||
/// 011: Truncate \n
|
||||
/// 1XX: Use MXCSR.RC for rounding
|
||||
/// \returns A 128-bit vector containing converted 16-bit half-precision float
|
||||
/// values. The lower 64 bits are used to store the converted 16-bit
|
||||
/// half-precision floating-point values.
|
||||
#define _mm_cvtps_ph(a, imm) \
|
||||
((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
|
||||
|
||||
/// \brief Converts a 128-bit vector containing 16-bit half-precision float
|
||||
/// values into a 128-bit vector containing 32-bit float values.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector containing 16-bit half-precision float values. The lower
|
||||
/// 64 bits are used in the conversion.
|
||||
/// \returns A 128-bit vector of [4 x float] containing converted float values.
|
||||
static __inline __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_cvtph_ps(__m128i __a)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
|
||||
}
|
||||
|
||||
static __inline __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_cvtph_ps(__m128i __a)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __F16CINTRIN_H */
|
||||
|
|
|
@ -27,9 +27,12 @@
|
|||
/* If we're on MinGW, fall back to the system's float.h, which might have
|
||||
* additional definitions provided for Windows.
|
||||
* For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
|
||||
*
|
||||
* Also fall back on Darwin to allow additional definitions and
|
||||
* implementation-defined values.
|
||||
*/
|
||||
#if (defined(__MINGW32__) || defined(_MSC_VER)) && __STDC_HOSTED__ && \
|
||||
__has_include_next(<float.h>)
|
||||
#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
|
||||
__STDC_HOSTED__ && __has_include_next(<float.h>)
|
||||
# include_next <float.h>
|
||||
|
||||
/* Undefine anything that we'll be redefining below. */
|
||||
|
@ -39,7 +42,9 @@
|
|||
# undef FLT_MANT_DIG
|
||||
# undef DBL_MANT_DIG
|
||||
# undef LDBL_MANT_DIG
|
||||
# undef DECIMAL_DIG
|
||||
# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
|
||||
# undef DECIMAL_DIG
|
||||
# endif
|
||||
# undef FLT_DIG
|
||||
# undef DBL_DIG
|
||||
# undef LDBL_DIG
|
||||
|
@ -68,6 +73,9 @@
|
|||
# undef FLT_TRUE_MIN
|
||||
# undef DBL_TRUE_MIN
|
||||
# undef LDBL_TRUE_MIN
|
||||
# undef FLT_DECIMAL_DIG
|
||||
# undef DBL_DECIMAL_DIG
|
||||
# undef LDBL_DECIMAL_DIG
|
||||
# endif
|
||||
#endif
|
||||
|
||||
|
@ -81,7 +89,9 @@
|
|||
#define DBL_MANT_DIG __DBL_MANT_DIG__
|
||||
#define LDBL_MANT_DIG __LDBL_MANT_DIG__
|
||||
|
||||
#define DECIMAL_DIG __DECIMAL_DIG__
|
||||
#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
|
||||
# define DECIMAL_DIG __DECIMAL_DIG__
|
||||
#endif
|
||||
|
||||
#define FLT_DIG __FLT_DIG__
|
||||
#define DBL_DIG __DBL_DIG__
|
||||
|
@ -119,6 +129,9 @@
|
|||
# define FLT_TRUE_MIN __FLT_DENORM_MIN__
|
||||
# define DBL_TRUE_MIN __DBL_DENORM_MIN__
|
||||
# define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
|
||||
# define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
|
||||
# define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__
|
||||
# define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
|
||||
#endif
|
||||
|
||||
#endif /* __FLOAT_H */
|
||||
|
|
|
@ -28,209 +28,203 @@
|
|||
#ifndef __FMA4INTRIN_H
|
||||
#define __FMA4INTRIN_H
|
||||
|
||||
#ifndef __FMA4__
|
||||
# error "FMA4 instruction set is not enabled"
|
||||
#else
|
||||
|
||||
#include <pmmintrin.h>
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __FMA4__ */
|
||||
|
||||
#endif /* __FMA4INTRIN_H */
|
||||
|
|
|
@ -28,207 +28,201 @@
|
|||
#ifndef __FMAINTRIN_H
|
||||
#define __FMAINTRIN_H
|
||||
|
||||
#ifndef __FMA__
|
||||
# error "FMA instruction set is not enabled"
|
||||
#else
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddss(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddsd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubss(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubsd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmaddps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmaddpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmaddss(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmaddsd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmsubps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmsubpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfnmsubss(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfnmsubsd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddsubps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddsubpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmsubaddps(__A, __B, __C);
|
||||
return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmsubaddpd(__A, __B, __C);
|
||||
return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmaddps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmaddpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmsubps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmsubpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfnmaddps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfnmaddpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfnmsubps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfnmsubpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmaddsubps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmaddsubpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256 __DEFAULT_FN_ATTRS
|
||||
_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vfmsubaddps256(__A, __B, __C);
|
||||
return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256d __DEFAULT_FN_ATTRS
|
||||
_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
|
||||
{
|
||||
return (__m256d)__builtin_ia32_vfmsubaddpd256(__A, __B, __C);
|
||||
return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __FMA__ */
|
||||
|
||||
#endif /* __FMAINTRIN_H */
|
||||
|
|
|
@ -28,27 +28,77 @@
|
|||
#ifndef __FXSRINTRIN_H
|
||||
#define __FXSRINTRIN_H
|
||||
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr")))
|
||||
|
||||
/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
|
||||
/// memory region pointed to by the input parameter \a __p.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
|
||||
///
|
||||
/// \param __p
|
||||
/// A pointer to a 512-byte memory region. The beginning of this memory
|
||||
/// region should be aligned on a 16-byte boundary.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_fxsave(void *__p) {
|
||||
_fxsave(void *__p)
|
||||
{
|
||||
return __builtin_ia32_fxsave(__p);
|
||||
}
|
||||
|
||||
/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
|
||||
/// memory region pointed to by the input parameter \a __p. The contents of
|
||||
/// this memory region should have been written to by a previous \c _fxsave
|
||||
/// or \c _fxsave64 intrinsic.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
|
||||
///
|
||||
/// \param __p
|
||||
/// A pointer to a 512-byte memory region. The beginning of this memory
|
||||
/// region should be aligned on a 16-byte boundary.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_fxsave64(void *__p) {
|
||||
return __builtin_ia32_fxsave64(__p);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_fxrstor(void *__p) {
|
||||
_fxrstor(void *__p)
|
||||
{
|
||||
return __builtin_ia32_fxrstor(__p);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
|
||||
/// memory region pointed to by the input parameter \a __p.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
|
||||
///
|
||||
/// \param __p
|
||||
/// A pointer to a 512-byte memory region. The beginning of this memory
|
||||
/// region should be aligned on a 16-byte boundary.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_fxrstor64(void *__p) {
|
||||
_fxsave64(void *__p)
|
||||
{
|
||||
return __builtin_ia32_fxsave64(__p);
|
||||
}
|
||||
|
||||
/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
|
||||
/// memory region pointed to by the input parameter \a __p. The contents of
|
||||
/// this memory region should have been written to by a previous \c _fxsave
|
||||
/// or \c _fxsave64 intrinsic.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
|
||||
///
|
||||
/// \param __p
|
||||
/// A pointer to a 512-byte memory region. The beginning of this memory
|
||||
/// region should be aligned on a 16-byte boundary.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_fxrstor64(void *__p)
|
||||
{
|
||||
return __builtin_ia32_fxrstor64(__p);
|
||||
}
|
||||
#endif
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
|
|
|
@ -164,24 +164,24 @@ struct __htm_tdb {
|
|||
/* Helper intrinsics to retry tbegin in case of transient failure. */
|
||||
|
||||
static __inline int __attribute__((__always_inline__, __nodebug__))
|
||||
__builtin_tbegin_retry_null (int retry)
|
||||
__builtin_tbegin_retry_null (int __retry)
|
||||
{
|
||||
int cc, i = 0;
|
||||
|
||||
while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT
|
||||
&& i++ < retry)
|
||||
&& i++ < __retry)
|
||||
__builtin_tx_assist(i);
|
||||
|
||||
return cc;
|
||||
}
|
||||
|
||||
static __inline int __attribute__((__always_inline__, __nodebug__))
|
||||
__builtin_tbegin_retry_tdb (void *tdb, int retry)
|
||||
__builtin_tbegin_retry_tdb (void *__tdb, int __retry)
|
||||
{
|
||||
int cc, i = 0;
|
||||
|
||||
while ((cc = __builtin_tbegin(tdb)) == _HTM_TBEGIN_TRANSIENT
|
||||
&& i++ < retry)
|
||||
while ((cc = __builtin_tbegin(__tdb)) == _HTM_TBEGIN_TRANSIENT
|
||||
&& i++ < __retry)
|
||||
__builtin_tx_assist(i);
|
||||
|
||||
return cc;
|
||||
|
@ -193,24 +193,24 @@ __builtin_tbegin_retry_tdb (void *tdb, int retry)
|
|||
__builtin_tbegin_retry_tdb(tdb, retry))
|
||||
|
||||
static __inline int __attribute__((__always_inline__, __nodebug__))
|
||||
__builtin_tbegin_retry_nofloat_null (int retry)
|
||||
__builtin_tbegin_retry_nofloat_null (int __retry)
|
||||
{
|
||||
int cc, i = 0;
|
||||
|
||||
while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT
|
||||
&& i++ < retry)
|
||||
&& i++ < __retry)
|
||||
__builtin_tx_assist(i);
|
||||
|
||||
return cc;
|
||||
}
|
||||
|
||||
static __inline int __attribute__((__always_inline__, __nodebug__))
|
||||
__builtin_tbegin_retry_nofloat_tdb (void *tdb, int retry)
|
||||
__builtin_tbegin_retry_nofloat_tdb (void *__tdb, int __retry)
|
||||
{
|
||||
int cc, i = 0;
|
||||
|
||||
while ((cc = __builtin_tbegin_nofloat(tdb)) == _HTM_TBEGIN_TRANSIENT
|
||||
&& i++ < retry)
|
||||
while ((cc = __builtin_tbegin_nofloat(__tdb)) == _HTM_TBEGIN_TRANSIENT
|
||||
&& i++ < __retry)
|
||||
__builtin_tx_assist(i);
|
||||
|
||||
return cc;
|
||||
|
|
|
@ -46,7 +46,7 @@ extern "C" {
|
|||
|
||||
typedef char TM_buff_type[16];
|
||||
|
||||
/* This macro can be used to determine whether a transaction was successfully
|
||||
/* This macro can be used to determine whether a transaction was successfully
|
||||
started from the __TM_begin() and __TM_simple_begin() intrinsic functions
|
||||
below. */
|
||||
#define _HTM_TBEGIN_STARTED 1
|
||||
|
@ -62,18 +62,18 @@ __TM_simple_begin (void)
|
|||
|
||||
extern __inline long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_begin (void* const TM_buff)
|
||||
__TM_begin (void* const __TM_buff)
|
||||
{
|
||||
*_TEXASRL_PTR (TM_buff) = 0;
|
||||
*_TEXASRL_PTR (__TM_buff) = 0;
|
||||
if (__builtin_expect (__builtin_tbegin (0), 1))
|
||||
return _HTM_TBEGIN_STARTED;
|
||||
#ifdef __powerpc64__
|
||||
*_TEXASR_PTR (TM_buff) = __builtin_get_texasr ();
|
||||
*_TEXASR_PTR (__TM_buff) = __builtin_get_texasr ();
|
||||
#else
|
||||
*_TEXASRU_PTR (TM_buff) = __builtin_get_texasru ();
|
||||
*_TEXASRL_PTR (TM_buff) = __builtin_get_texasr ();
|
||||
*_TEXASRU_PTR (__TM_buff) = __builtin_get_texasru ();
|
||||
*_TEXASRL_PTR (__TM_buff) = __builtin_get_texasr ();
|
||||
#endif
|
||||
*_TFIAR_PTR (TM_buff) = __builtin_get_tfiar ();
|
||||
*_TFIAR_PTR (__TM_buff) = __builtin_get_tfiar ();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -95,9 +95,9 @@ __TM_abort (void)
|
|||
|
||||
extern __inline void
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_named_abort (unsigned char const code)
|
||||
__TM_named_abort (unsigned char const __code)
|
||||
{
|
||||
__builtin_tabort (code);
|
||||
__builtin_tabort (__code);
|
||||
}
|
||||
|
||||
extern __inline void
|
||||
|
@ -116,47 +116,47 @@ __TM_suspend (void)
|
|||
|
||||
extern __inline long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_is_user_abort (void* const TM_buff)
|
||||
__TM_is_user_abort (void* const __TM_buff)
|
||||
{
|
||||
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
|
||||
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
|
||||
return _TEXASRU_ABORT (texasru);
|
||||
}
|
||||
|
||||
extern __inline long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_is_named_user_abort (void* const TM_buff, unsigned char *code)
|
||||
__TM_is_named_user_abort (void* const __TM_buff, unsigned char *__code)
|
||||
{
|
||||
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
|
||||
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
|
||||
|
||||
*code = _TEXASRU_FAILURE_CODE (texasru);
|
||||
*__code = _TEXASRU_FAILURE_CODE (texasru);
|
||||
return _TEXASRU_ABORT (texasru);
|
||||
}
|
||||
|
||||
extern __inline long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_is_illegal (void* const TM_buff)
|
||||
__TM_is_illegal (void* const __TM_buff)
|
||||
{
|
||||
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
|
||||
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
|
||||
return _TEXASRU_DISALLOWED (texasru);
|
||||
}
|
||||
|
||||
extern __inline long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_is_footprint_exceeded (void* const TM_buff)
|
||||
__TM_is_footprint_exceeded (void* const __TM_buff)
|
||||
{
|
||||
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
|
||||
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
|
||||
return _TEXASRU_FOOTPRINT_OVERFLOW (texasru);
|
||||
}
|
||||
|
||||
extern __inline long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_nesting_depth (void* const TM_buff)
|
||||
__TM_nesting_depth (void* const __TM_buff)
|
||||
{
|
||||
texasrl_t texasrl;
|
||||
|
||||
if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL)
|
||||
{
|
||||
texasrl = *_TEXASRL_PTR (TM_buff);
|
||||
texasrl = *_TEXASRL_PTR (__TM_buff);
|
||||
if (!_TEXASR_FAILURE_SUMMARY (texasrl))
|
||||
texasrl = 0;
|
||||
}
|
||||
|
@ -168,15 +168,15 @@ __TM_nesting_depth (void* const TM_buff)
|
|||
|
||||
extern __inline long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_is_nested_too_deep(void* const TM_buff)
|
||||
__TM_is_nested_too_deep(void* const __TM_buff)
|
||||
{
|
||||
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
|
||||
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
|
||||
return _TEXASRU_NESTING_OVERFLOW (texasru);
|
||||
}
|
||||
|
||||
extern __inline long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_is_conflict(void* const TM_buff)
|
||||
__TM_is_conflict(void* const __TM_buff)
|
||||
{
|
||||
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
|
||||
/* Return TEXASR bits 11 (Self-Induced Conflict) through
|
||||
|
@ -186,24 +186,24 @@ __TM_is_conflict(void* const TM_buff)
|
|||
|
||||
extern __inline long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_is_failure_persistent(void* const TM_buff)
|
||||
__TM_is_failure_persistent(void* const __TM_buff)
|
||||
{
|
||||
texasru_t texasru = *_TEXASRU_PTR (TM_buff);
|
||||
texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
|
||||
return _TEXASRU_FAILURE_PERSISTENT (texasru);
|
||||
}
|
||||
|
||||
extern __inline long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_failure_address(void* const TM_buff)
|
||||
__TM_failure_address(void* const __TM_buff)
|
||||
{
|
||||
return *_TFIAR_PTR (TM_buff);
|
||||
return *_TFIAR_PTR (__TM_buff);
|
||||
}
|
||||
|
||||
extern __inline long long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
__TM_failure_code(void* const TM_buff)
|
||||
__TM_failure_code(void* const __TM_buff)
|
||||
{
|
||||
return *_TEXASR_PTR (TM_buff);
|
||||
return *_TEXASR_PTR (__TM_buff);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -227,9 +227,9 @@ __TM_simple_begin ()
|
|||
}
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_begin (void* const tdb)
|
||||
__TM_begin (void* const __tdb)
|
||||
{
|
||||
return __builtin_tbegin_nofloat (tdb);
|
||||
return __builtin_tbegin_nofloat (__tdb);
|
||||
}
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
|
@ -245,22 +245,22 @@ __TM_abort ()
|
|||
}
|
||||
|
||||
static __inline void __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_named_abort (unsigned char const code)
|
||||
__TM_named_abort (unsigned char const __code)
|
||||
{
|
||||
return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + code);
|
||||
return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + __code);
|
||||
}
|
||||
|
||||
static __inline void __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_non_transactional_store (void* const addr, long long const value)
|
||||
__TM_non_transactional_store (void* const __addr, long long const __value)
|
||||
{
|
||||
__builtin_non_tx_store ((uint64_t*)addr, (uint64_t)value);
|
||||
__builtin_non_tx_store ((uint64_t*)__addr, (uint64_t)__value);
|
||||
}
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_nesting_depth (void* const tdb_ptr)
|
||||
__TM_nesting_depth (void* const __tdb_ptr)
|
||||
{
|
||||
int depth = __builtin_tx_nesting_depth ();
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
|
||||
|
||||
if (depth != 0)
|
||||
return depth;
|
||||
|
@ -273,9 +273,9 @@ __TM_nesting_depth (void* const tdb_ptr)
|
|||
/* Transaction failure diagnostics */
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_is_user_abort (void* const tdb_ptr)
|
||||
__TM_is_user_abort (void* const __tdb_ptr)
|
||||
{
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
|
||||
|
||||
if (tdb->format != 1)
|
||||
return 0;
|
||||
|
@ -284,25 +284,25 @@ __TM_is_user_abort (void* const tdb_ptr)
|
|||
}
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_is_named_user_abort (void* const tdb_ptr, unsigned char* code)
|
||||
__TM_is_named_user_abort (void* const __tdb_ptr, unsigned char* __code)
|
||||
{
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
|
||||
|
||||
if (tdb->format != 1)
|
||||
return 0;
|
||||
|
||||
if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE)
|
||||
{
|
||||
*code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
|
||||
*__code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_is_illegal (void* const tdb_ptr)
|
||||
__TM_is_illegal (void* const __tdb_ptr)
|
||||
{
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
|
||||
|
||||
return (tdb->format == 1
|
||||
&& (tdb->abort_code == 4 /* unfiltered program interruption */
|
||||
|
@ -310,9 +310,9 @@ __TM_is_illegal (void* const tdb_ptr)
|
|||
}
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_is_footprint_exceeded (void* const tdb_ptr)
|
||||
__TM_is_footprint_exceeded (void* const __tdb_ptr)
|
||||
{
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
|
||||
|
||||
return (tdb->format == 1
|
||||
&& (tdb->abort_code == 7 /* fetch overflow */
|
||||
|
@ -320,17 +320,17 @@ __TM_is_footprint_exceeded (void* const tdb_ptr)
|
|||
}
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_is_nested_too_deep (void* const tdb_ptr)
|
||||
__TM_is_nested_too_deep (void* const __tdb_ptr)
|
||||
{
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
|
||||
|
||||
return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */
|
||||
}
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_is_conflict (void* const tdb_ptr)
|
||||
__TM_is_conflict (void* const __tdb_ptr)
|
||||
{
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
|
||||
|
||||
return (tdb->format == 1
|
||||
&& (tdb->abort_code == 9 /* fetch conflict */
|
||||
|
@ -338,22 +338,22 @@ __TM_is_conflict (void* const tdb_ptr)
|
|||
}
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_is_failure_persistent (long const result)
|
||||
__TM_is_failure_persistent (long const __result)
|
||||
{
|
||||
return result == _HTM_TBEGIN_PERSISTENT;
|
||||
return __result == _HTM_TBEGIN_PERSISTENT;
|
||||
}
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_failure_address (void* const tdb_ptr)
|
||||
__TM_failure_address (void* const __tdb_ptr)
|
||||
{
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
|
||||
return tdb->atia;
|
||||
}
|
||||
|
||||
static __inline long __attribute__((__always_inline__, __nodebug__))
|
||||
__TM_failure_code (void* const tdb_ptr)
|
||||
__TM_failure_code (void* const __tdb_ptr)
|
||||
{
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)tdb_ptr;
|
||||
struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
|
||||
|
||||
return tdb->abort_code;
|
||||
}
|
||||
|
|
|
@ -32,50 +32,26 @@
|
|||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
|
||||
__readeflags(void)
|
||||
{
|
||||
unsigned long long __res = 0;
|
||||
__asm__ __volatile__ ("pushf\n\t"
|
||||
"popq %0\n"
|
||||
:"=r"(__res)
|
||||
:
|
||||
:
|
||||
);
|
||||
return __res;
|
||||
return __builtin_ia32_readeflags_u64();
|
||||
}
|
||||
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
__writeeflags(unsigned long long __f)
|
||||
{
|
||||
__asm__ __volatile__ ("pushq %0\n\t"
|
||||
"popf\n"
|
||||
:
|
||||
:"r"(__f)
|
||||
:"flags"
|
||||
);
|
||||
__builtin_ia32_writeeflags_u64(__f);
|
||||
}
|
||||
|
||||
#else /* !__x86_64__ */
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
|
||||
__readeflags(void)
|
||||
{
|
||||
unsigned int __res = 0;
|
||||
__asm__ __volatile__ ("pushf\n\t"
|
||||
"popl %0\n"
|
||||
:"=r"(__res)
|
||||
:
|
||||
:
|
||||
);
|
||||
return __res;
|
||||
return __builtin_ia32_readeflags_u32();
|
||||
}
|
||||
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
__writeeflags(unsigned int __f)
|
||||
{
|
||||
__asm__ __volatile__ ("pushl %0\n\t"
|
||||
"popf\n"
|
||||
:
|
||||
:"r"(__f)
|
||||
:"flags"
|
||||
);
|
||||
__builtin_ia32_writeeflags_u32(__f);
|
||||
}
|
||||
#endif /* !__x86_64__ */
|
||||
|
||||
|
@ -84,12 +60,6 @@ __rdpmc(int __A) {
|
|||
return __builtin_ia32_rdpmc(__A);
|
||||
}
|
||||
|
||||
/* __rdtsc */
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
|
||||
__rdtsc(void) {
|
||||
return __builtin_ia32_rdtsc();
|
||||
}
|
||||
|
||||
/* __rdtscp */
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
|
||||
__rdtscp(unsigned int *__A) {
|
||||
|
@ -98,4 +68,6 @@ __rdtscp(unsigned int *__A) {
|
|||
|
||||
#define _rdtsc() __rdtsc()
|
||||
|
||||
#define _rdpmc(A) __rdpmc(A)
|
||||
|
||||
#endif /* __IA32INTRIN_H */
|
||||
|
|
|
@ -24,105 +24,204 @@
|
|||
#ifndef __IMMINTRIN_H
|
||||
#define __IMMINTRIN_H
|
||||
|
||||
#ifdef __MMX__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MMX__)
|
||||
#include <mmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __SSE__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE__)
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __SSE2__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE2__)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __SSE3__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE3__)
|
||||
#include <pmmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __SSSE3__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSSE3__)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined (__SSE4_2__) || defined (__SSE4_1__)
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__SSE4_2__) || defined(__SSE4_1__))
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined (__AES__) || defined (__PCLMUL__)
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AES__) || defined(__PCLMUL__))
|
||||
#include <wmmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __AVX__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLFLUSHOPT__)
|
||||
#include <clflushoptintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__)
|
||||
#include <avxintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __AVX2__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
|
||||
#include <avx2intrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __BMI__
|
||||
/* The 256-bit versions of functions in f16cintrin.h.
|
||||
Intel documents these as being in immintrin.h, and
|
||||
they depend on typedefs from avxintrin.h. */
|
||||
|
||||
/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector
|
||||
/// containing 16-bit half-precision float values.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// A 256-bit vector containing 32-bit single-precision float values to be
|
||||
/// converted to 16-bit half-precision float values.
|
||||
/// \param imm
|
||||
/// An immediate value controlling rounding using bits [2:0]: \n
|
||||
/// 000: Nearest \n
|
||||
/// 001: Down \n
|
||||
/// 010: Up \n
|
||||
/// 011: Truncate \n
|
||||
/// 1XX: Use MXCSR.RC for rounding
|
||||
/// \returns A 128-bit vector containing the converted 16-bit half-precision
|
||||
/// float values.
|
||||
#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
|
||||
(__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
|
||||
|
||||
/// \brief Converts a 128-bit vector containing 16-bit half-precision float
|
||||
/// values into a 256-bit vector of [8 x float].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector containing 16-bit half-precision float values to be
|
||||
/// converted to 32-bit single-precision float values.
|
||||
/// \returns A vector of [8 x float] containing the converted 32-bit
|
||||
/// single-precision float values.
|
||||
static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
|
||||
_mm256_cvtph_ps(__m128i __a)
|
||||
{
|
||||
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
|
||||
}
|
||||
#endif /* __AVX2__ */
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
|
||||
#include <bmiintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __BMI2__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
|
||||
#include <bmi2intrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __LZCNT__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
|
||||
#include <lzcntintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __FMA__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
|
||||
#include <fmaintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512F__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512F__)
|
||||
#include <avx512fintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512VL__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VL__)
|
||||
#include <avx512vlintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512BW__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BW__)
|
||||
#include <avx512bwintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
|
||||
#include <avx512cdintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512DQ__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
|
||||
#include <avx512dqintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined (__AVX512VL__) && defined (__AVX512BW__)
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512BW__))
|
||||
#include <avx512vlbwintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined (__AVX512VL__) && defined (__AVX512DQ__)
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512CD__))
|
||||
#include <avx512vlcdintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512DQ__))
|
||||
#include <avx512vldqintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512ER__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512ER__)
|
||||
#include <avx512erintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __RDRND__
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512IFMA__)
|
||||
#include <avx512ifmaintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AVX512IFMA__) && defined(__AVX512VL__))
|
||||
#include <avx512ifmavlintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI__)
|
||||
#include <avx512vbmiintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AVX512VBMI__) && defined(__AVX512VL__))
|
||||
#include <avx512vbmivlintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
|
||||
#include <avx512pfintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__)
|
||||
#include <pkuintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
|
||||
_rdrand16_step(unsigned short *__p)
|
||||
{
|
||||
return __builtin_ia32_rdrand16_step(__p);
|
||||
}
|
||||
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
|
||||
_rdrand32_step(unsigned int *__p)
|
||||
{
|
||||
return __builtin_ia32_rdrand32_step(__p);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
/* __bit_scan_forward */
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
_bit_scan_forward(int __A) {
|
||||
return __builtin_ctz(__A);
|
||||
}
|
||||
|
||||
/* __bit_scan_reverse */
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
_bit_scan_reverse(int __A) {
|
||||
return 31 - __builtin_clz(__A);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
|
||||
_rdrand64_step(unsigned long long *__p)
|
||||
{
|
||||
return __builtin_ia32_rdrand64_step(__p);
|
||||
|
@ -130,71 +229,87 @@ _rdrand64_step(unsigned long long *__p)
|
|||
#endif
|
||||
#endif /* __RDRND__ */
|
||||
|
||||
#ifdef __FSGSBASE__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
|
||||
#ifdef __x86_64__
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
|
||||
_readfsbase_u32(void)
|
||||
{
|
||||
return __builtin_ia32_rdfsbase32();
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
|
||||
_readfsbase_u64(void)
|
||||
{
|
||||
return __builtin_ia32_rdfsbase64();
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
|
||||
_readgsbase_u32(void)
|
||||
{
|
||||
return __builtin_ia32_rdgsbase32();
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
|
||||
_readgsbase_u64(void)
|
||||
{
|
||||
return __builtin_ia32_rdgsbase64();
|
||||
}
|
||||
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
|
||||
_writefsbase_u32(unsigned int __V)
|
||||
{
|
||||
return __builtin_ia32_wrfsbase32(__V);
|
||||
}
|
||||
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
|
||||
_writefsbase_u64(unsigned long long __V)
|
||||
{
|
||||
return __builtin_ia32_wrfsbase64(__V);
|
||||
}
|
||||
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
|
||||
_writegsbase_u32(unsigned int __V)
|
||||
{
|
||||
return __builtin_ia32_wrgsbase32(__V);
|
||||
}
|
||||
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
|
||||
_writegsbase_u64(unsigned long long __V)
|
||||
{
|
||||
return __builtin_ia32_wrgsbase64(__V);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif /* __FSGSBASE__ */
|
||||
|
||||
#ifdef __RTM__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RTM__)
|
||||
#include <rtmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __RTM__
|
||||
#include <xtestintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __SHA__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHA__)
|
||||
#include <shaintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FXSR__)
|
||||
#include <fxsrintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVE__)
|
||||
#include <xsaveintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__)
|
||||
#include <xsaveoptintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEC__)
|
||||
#include <xsavecintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVES__)
|
||||
#include <xsavesintrin.h>
|
||||
#endif
|
||||
|
||||
/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
|
||||
* whereas others are also available at all times. */
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -23,6 +23,10 @@
|
|||
#ifndef __CLANG_INTTYPES_H
|
||||
#define __CLANG_INTTYPES_H
|
||||
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1800
|
||||
#error MSVC does not have inttypes.h prior to Visual Studio 2013
|
||||
#endif
|
||||
|
||||
#include_next <inttypes.h>
|
||||
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1900
|
||||
|
|
|
@ -25,28 +25,54 @@
|
|||
#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __LZCNT__
|
||||
# error "LZCNT instruction is not enabled"
|
||||
#endif /* __LZCNT__ */
|
||||
|
||||
#ifndef __LZCNTINTRIN_H
|
||||
#define __LZCNTINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
|
||||
|
||||
/// \brief Counts the number of leading zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c LZCNT instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 16-bit integer whose leading zeros are to be counted.
|
||||
/// \returns An unsigned 16-bit integer containing the number of leading zero
|
||||
/// bits in the operand.
|
||||
static __inline__ unsigned short __DEFAULT_FN_ATTRS
|
||||
__lzcnt16(unsigned short __X)
|
||||
{
|
||||
return __X ? __builtin_clzs(__X) : 16;
|
||||
}
|
||||
|
||||
/// \brief Counts the number of leading zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c LZCNT instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 32-bit integer whose leading zeros are to be counted.
|
||||
/// \returns An unsigned 32-bit integer containing the number of leading zero
|
||||
/// bits in the operand.
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__lzcnt32(unsigned int __X)
|
||||
{
|
||||
return __X ? __builtin_clz(__X) : 32;
|
||||
}
|
||||
|
||||
/// \brief Counts the number of leading zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c LZCNT instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 32-bit integer whose leading zeros are to be counted.
|
||||
/// \returns An unsigned 32-bit integer containing the number of leading zero
|
||||
/// bits in the operand.
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
_lzcnt_u32(unsigned int __X)
|
||||
{
|
||||
|
@ -54,12 +80,32 @@ _lzcnt_u32(unsigned int __X)
|
|||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
/// \brief Counts the number of leading zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c LZCNT instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer whose leading zeros are to be counted.
|
||||
/// \returns An unsigned 64-bit integer containing the number of leading zero
|
||||
/// bits in the operand.
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__lzcnt64(unsigned long long __X)
|
||||
{
|
||||
return __X ? __builtin_clzll(__X) : 64;
|
||||
}
|
||||
|
||||
/// \brief Counts the number of leading zero bits in the operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c LZCNT instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer whose leading zeros are to be counted.
|
||||
/// \returns An unsigned 64-bit integer containing the number of leading zero
|
||||
/// bits in the operand.
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
_lzcnt_u64(unsigned long long __X)
|
||||
{
|
||||
|
|
|
@ -30,10 +30,10 @@
|
|||
typedef float __v2sf __attribute__((__vector_size__(8)));
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_m_femms() {
|
||||
_m_femms(void) {
|
||||
__builtin_ia32_femms();
|
||||
}
|
||||
|
||||
|
@ -132,6 +132,10 @@ _m_pmulhrw(__m64 __m1, __m64 __m2) {
|
|||
return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
|
||||
}
|
||||
|
||||
/* Handle the 3dnowa instructions here. */
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
|
||||
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_m_pf2iw(__m64 __m) {
|
||||
return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
|
||||
|
|
1104
c_headers/mmintrin.h
1104
c_headers/mmintrin.h
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,166 @@
|
|||
/*===---- module.modulemap - intrinsics module map -------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
module _Builtin_intrinsics [system] [extern_c] {
|
||||
explicit module altivec {
|
||||
requires altivec
|
||||
header "altivec.h"
|
||||
}
|
||||
|
||||
explicit module arm {
|
||||
requires arm
|
||||
|
||||
explicit module acle {
|
||||
header "arm_acle.h"
|
||||
export *
|
||||
}
|
||||
|
||||
explicit module neon {
|
||||
requires neon
|
||||
header "arm_neon.h"
|
||||
export *
|
||||
}
|
||||
}
|
||||
|
||||
explicit module intel {
|
||||
requires x86
|
||||
export *
|
||||
|
||||
header "immintrin.h"
|
||||
textual header "f16cintrin.h"
|
||||
textual header "avxintrin.h"
|
||||
textual header "avx2intrin.h"
|
||||
textual header "avx512fintrin.h"
|
||||
textual header "avx512erintrin.h"
|
||||
textual header "fmaintrin.h"
|
||||
|
||||
header "x86intrin.h"
|
||||
textual header "bmiintrin.h"
|
||||
textual header "bmi2intrin.h"
|
||||
textual header "lzcntintrin.h"
|
||||
textual header "xopintrin.h"
|
||||
textual header "fma4intrin.h"
|
||||
textual header "mwaitxintrin.h"
|
||||
|
||||
explicit module mm_malloc {
|
||||
requires !freestanding
|
||||
header "mm_malloc.h"
|
||||
export * // note: for <stdlib.h> dependency
|
||||
}
|
||||
|
||||
explicit module cpuid {
|
||||
requires gnuinlineasm
|
||||
header "cpuid.h"
|
||||
}
|
||||
|
||||
explicit module mmx {
|
||||
header "mmintrin.h"
|
||||
}
|
||||
|
||||
explicit module sse {
|
||||
export mm_malloc
|
||||
export mmx
|
||||
export sse2 // note: for hackish <emmintrin.h> dependency
|
||||
header "xmmintrin.h"
|
||||
}
|
||||
|
||||
explicit module sse2 {
|
||||
export sse
|
||||
header "emmintrin.h"
|
||||
}
|
||||
|
||||
explicit module sse3 {
|
||||
export sse2
|
||||
header "pmmintrin.h"
|
||||
}
|
||||
|
||||
explicit module ssse3 {
|
||||
export sse3
|
||||
header "tmmintrin.h"
|
||||
}
|
||||
|
||||
explicit module sse4_1 {
|
||||
export ssse3
|
||||
header "smmintrin.h"
|
||||
}
|
||||
|
||||
explicit module sse4_2 {
|
||||
export sse4_1
|
||||
header "nmmintrin.h"
|
||||
}
|
||||
|
||||
explicit module sse4a {
|
||||
export sse3
|
||||
header "ammintrin.h"
|
||||
}
|
||||
|
||||
explicit module popcnt {
|
||||
header "popcntintrin.h"
|
||||
}
|
||||
|
||||
explicit module mm3dnow {
|
||||
header "mm3dnow.h"
|
||||
}
|
||||
|
||||
explicit module aes_pclmul {
|
||||
header "wmmintrin.h"
|
||||
export aes
|
||||
export pclmul
|
||||
}
|
||||
|
||||
explicit module aes {
|
||||
header "__wmmintrin_aes.h"
|
||||
}
|
||||
|
||||
explicit module pclmul {
|
||||
header "__wmmintrin_pclmul.h"
|
||||
}
|
||||
}
|
||||
|
||||
explicit module systemz {
|
||||
requires systemz
|
||||
export *
|
||||
|
||||
header "s390intrin.h"
|
||||
|
||||
explicit module htm {
|
||||
requires htm
|
||||
header "htmintrin.h"
|
||||
header "htmxlintrin.h"
|
||||
}
|
||||
|
||||
explicit module zvector {
|
||||
requires zvector, vx
|
||||
header "vecintrin.h"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
module _Builtin_stddef_max_align_t [system] [extern_c] {
|
||||
header "__stddef_max_align_t.h"
|
||||
}
|
||||
|
||||
module opencl_c {
|
||||
requires opencl
|
||||
header "opencl-c.h"
|
||||
}
|
|
@ -0,0 +1,583 @@
|
|||
/*===---- msa.h - MIPS MSA intrinsics --------------------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef _MSA_H
|
||||
#define _MSA_H 1
|
||||
|
||||
#if defined(__mips_msa)
|
||||
typedef signed char v16i8 __attribute__((vector_size(16), aligned(16)));
|
||||
typedef signed char v16i8_b __attribute__((vector_size(16), aligned(1)));
|
||||
typedef unsigned char v16u8 __attribute__((vector_size(16), aligned(16)));
|
||||
typedef unsigned char v16u8_b __attribute__((vector_size(16), aligned(1)));
|
||||
typedef short v8i16 __attribute__((vector_size(16), aligned(16)));
|
||||
typedef short v8i16_h __attribute__((vector_size(16), aligned(2)));
|
||||
typedef unsigned short v8u16 __attribute__((vector_size(16), aligned(16)));
|
||||
typedef unsigned short v8u16_h __attribute__((vector_size(16), aligned(2)));
|
||||
typedef int v4i32 __attribute__((vector_size(16), aligned(16)));
|
||||
typedef int v4i32_w __attribute__((vector_size(16), aligned(4)));
|
||||
typedef unsigned int v4u32 __attribute__((vector_size(16), aligned(16)));
|
||||
typedef unsigned int v4u32_w __attribute__((vector_size(16), aligned(4)));
|
||||
typedef long long v2i64 __attribute__((vector_size(16), aligned(16)));
|
||||
typedef long long v2i64_d __attribute__((vector_size(16), aligned(8)));
|
||||
typedef unsigned long long v2u64 __attribute__((vector_size(16), aligned(16)));
|
||||
typedef unsigned long long v2u64_d __attribute__((vector_size(16), aligned(8)));
|
||||
typedef float v4f32 __attribute__((vector_size(16), aligned(16)));
|
||||
typedef float v4f32_w __attribute__((vector_size(16), aligned(4)));
|
||||
typedef double v2f64 __attribute__ ((vector_size(16), aligned(16)));
|
||||
typedef double v2f64_d __attribute__ ((vector_size(16), aligned(8)));
|
||||
|
||||
#define __msa_sll_b __builtin_msa_sll_b
|
||||
#define __msa_sll_h __builtin_msa_sll_h
|
||||
#define __msa_sll_w __builtin_msa_sll_w
|
||||
#define __msa_sll_d __builtin_msa_sll_d
|
||||
#define __msa_slli_b __builtin_msa_slli_b
|
||||
#define __msa_slli_h __builtin_msa_slli_h
|
||||
#define __msa_slli_w __builtin_msa_slli_w
|
||||
#define __msa_slli_d __builtin_msa_slli_d
|
||||
#define __msa_sra_b __builtin_msa_sra_b
|
||||
#define __msa_sra_h __builtin_msa_sra_h
|
||||
#define __msa_sra_w __builtin_msa_sra_w
|
||||
#define __msa_sra_d __builtin_msa_sra_d
|
||||
#define __msa_srai_b __builtin_msa_srai_b
|
||||
#define __msa_srai_h __builtin_msa_srai_h
|
||||
#define __msa_srai_w __builtin_msa_srai_w
|
||||
#define __msa_srai_d __builtin_msa_srai_d
|
||||
#define __msa_srar_b __builtin_msa_srar_b
|
||||
#define __msa_srar_h __builtin_msa_srar_h
|
||||
#define __msa_srar_w __builtin_msa_srar_w
|
||||
#define __msa_srar_d __builtin_msa_srar_d
|
||||
#define __msa_srari_b __builtin_msa_srari_b
|
||||
#define __msa_srari_h __builtin_msa_srari_h
|
||||
#define __msa_srari_w __builtin_msa_srari_w
|
||||
#define __msa_srari_d __builtin_msa_srari_d
|
||||
#define __msa_srl_b __builtin_msa_srl_b
|
||||
#define __msa_srl_h __builtin_msa_srl_h
|
||||
#define __msa_srl_w __builtin_msa_srl_w
|
||||
#define __msa_srl_d __builtin_msa_srl_d
|
||||
#define __msa_srli_b __builtin_msa_srli_b
|
||||
#define __msa_srli_h __builtin_msa_srli_h
|
||||
#define __msa_srli_w __builtin_msa_srli_w
|
||||
#define __msa_srli_d __builtin_msa_srli_d
|
||||
#define __msa_srlr_b __builtin_msa_srlr_b
|
||||
#define __msa_srlr_h __builtin_msa_srlr_h
|
||||
#define __msa_srlr_w __builtin_msa_srlr_w
|
||||
#define __msa_srlr_d __builtin_msa_srlr_d
|
||||
#define __msa_srlri_b __builtin_msa_srlri_b
|
||||
#define __msa_srlri_h __builtin_msa_srlri_h
|
||||
#define __msa_srlri_w __builtin_msa_srlri_w
|
||||
#define __msa_srlri_d __builtin_msa_srlri_d
|
||||
#define __msa_bclr_b __builtin_msa_bclr_b
|
||||
#define __msa_bclr_h __builtin_msa_bclr_h
|
||||
#define __msa_bclr_w __builtin_msa_bclr_w
|
||||
#define __msa_bclr_d __builtin_msa_bclr_d
|
||||
#define __msa_bclri_b __builtin_msa_bclri_b
|
||||
#define __msa_bclri_h __builtin_msa_bclri_h
|
||||
#define __msa_bclri_w __builtin_msa_bclri_w
|
||||
#define __msa_bclri_d __builtin_msa_bclri_d
|
||||
#define __msa_bset_b __builtin_msa_bset_b
|
||||
#define __msa_bset_h __builtin_msa_bset_h
|
||||
#define __msa_bset_w __builtin_msa_bset_w
|
||||
#define __msa_bset_d __builtin_msa_bset_d
|
||||
#define __msa_bseti_b __builtin_msa_bseti_b
|
||||
#define __msa_bseti_h __builtin_msa_bseti_h
|
||||
#define __msa_bseti_w __builtin_msa_bseti_w
|
||||
#define __msa_bseti_d __builtin_msa_bseti_d
|
||||
#define __msa_bneg_b __builtin_msa_bneg_b
|
||||
#define __msa_bneg_h __builtin_msa_bneg_h
|
||||
#define __msa_bneg_w __builtin_msa_bneg_w
|
||||
#define __msa_bneg_d __builtin_msa_bneg_d
|
||||
#define __msa_bnegi_b __builtin_msa_bnegi_b
|
||||
#define __msa_bnegi_h __builtin_msa_bnegi_h
|
||||
#define __msa_bnegi_w __builtin_msa_bnegi_w
|
||||
#define __msa_bnegi_d __builtin_msa_bnegi_d
|
||||
#define __msa_binsl_b __builtin_msa_binsl_b
|
||||
#define __msa_binsl_h __builtin_msa_binsl_h
|
||||
#define __msa_binsl_w __builtin_msa_binsl_w
|
||||
#define __msa_binsl_d __builtin_msa_binsl_d
|
||||
#define __msa_binsli_b __builtin_msa_binsli_b
|
||||
#define __msa_binsli_h __builtin_msa_binsli_h
|
||||
#define __msa_binsli_w __builtin_msa_binsli_w
|
||||
#define __msa_binsli_d __builtin_msa_binsli_d
|
||||
#define __msa_binsr_b __builtin_msa_binsr_b
|
||||
#define __msa_binsr_h __builtin_msa_binsr_h
|
||||
#define __msa_binsr_w __builtin_msa_binsr_w
|
||||
#define __msa_binsr_d __builtin_msa_binsr_d
|
||||
#define __msa_binsri_b __builtin_msa_binsri_b
|
||||
#define __msa_binsri_h __builtin_msa_binsri_h
|
||||
#define __msa_binsri_w __builtin_msa_binsri_w
|
||||
#define __msa_binsri_d __builtin_msa_binsri_d
|
||||
#define __msa_addv_b __builtin_msa_addv_b
|
||||
#define __msa_addv_h __builtin_msa_addv_h
|
||||
#define __msa_addv_w __builtin_msa_addv_w
|
||||
#define __msa_addv_d __builtin_msa_addv_d
|
||||
#define __msa_addvi_b __builtin_msa_addvi_b
|
||||
#define __msa_addvi_h __builtin_msa_addvi_h
|
||||
#define __msa_addvi_w __builtin_msa_addvi_w
|
||||
#define __msa_addvi_d __builtin_msa_addvi_d
|
||||
#define __msa_subv_b __builtin_msa_subv_b
|
||||
#define __msa_subv_h __builtin_msa_subv_h
|
||||
#define __msa_subv_w __builtin_msa_subv_w
|
||||
#define __msa_subv_d __builtin_msa_subv_d
|
||||
#define __msa_subvi_b __builtin_msa_subvi_b
|
||||
#define __msa_subvi_h __builtin_msa_subvi_h
|
||||
#define __msa_subvi_w __builtin_msa_subvi_w
|
||||
#define __msa_subvi_d __builtin_msa_subvi_d
|
||||
#define __msa_max_s_b __builtin_msa_max_s_b
|
||||
#define __msa_max_s_h __builtin_msa_max_s_h
|
||||
#define __msa_max_s_w __builtin_msa_max_s_w
|
||||
#define __msa_max_s_d __builtin_msa_max_s_d
|
||||
#define __msa_maxi_s_b __builtin_msa_maxi_s_b
|
||||
#define __msa_maxi_s_h __builtin_msa_maxi_s_h
|
||||
#define __msa_maxi_s_w __builtin_msa_maxi_s_w
|
||||
#define __msa_maxi_s_d __builtin_msa_maxi_s_d
|
||||
#define __msa_max_u_b __builtin_msa_max_u_b
|
||||
#define __msa_max_u_h __builtin_msa_max_u_h
|
||||
#define __msa_max_u_w __builtin_msa_max_u_w
|
||||
#define __msa_max_u_d __builtin_msa_max_u_d
|
||||
#define __msa_maxi_u_b __builtin_msa_maxi_u_b
|
||||
#define __msa_maxi_u_h __builtin_msa_maxi_u_h
|
||||
#define __msa_maxi_u_w __builtin_msa_maxi_u_w
|
||||
#define __msa_maxi_u_d __builtin_msa_maxi_u_d
|
||||
#define __msa_min_s_b __builtin_msa_min_s_b
|
||||
#define __msa_min_s_h __builtin_msa_min_s_h
|
||||
#define __msa_min_s_w __builtin_msa_min_s_w
|
||||
#define __msa_min_s_d __builtin_msa_min_s_d
|
||||
#define __msa_mini_s_b __builtin_msa_mini_s_b
|
||||
#define __msa_mini_s_h __builtin_msa_mini_s_h
|
||||
#define __msa_mini_s_w __builtin_msa_mini_s_w
|
||||
#define __msa_mini_s_d __builtin_msa_mini_s_d
|
||||
#define __msa_min_u_b __builtin_msa_min_u_b
|
||||
#define __msa_min_u_h __builtin_msa_min_u_h
|
||||
#define __msa_min_u_w __builtin_msa_min_u_w
|
||||
#define __msa_min_u_d __builtin_msa_min_u_d
|
||||
#define __msa_mini_u_b __builtin_msa_mini_u_b
|
||||
#define __msa_mini_u_h __builtin_msa_mini_u_h
|
||||
#define __msa_mini_u_w __builtin_msa_mini_u_w
|
||||
#define __msa_mini_u_d __builtin_msa_mini_u_d
|
||||
#define __msa_max_a_b __builtin_msa_max_a_b
|
||||
#define __msa_max_a_h __builtin_msa_max_a_h
|
||||
#define __msa_max_a_w __builtin_msa_max_a_w
|
||||
#define __msa_max_a_d __builtin_msa_max_a_d
|
||||
#define __msa_min_a_b __builtin_msa_min_a_b
|
||||
#define __msa_min_a_h __builtin_msa_min_a_h
|
||||
#define __msa_min_a_w __builtin_msa_min_a_w
|
||||
#define __msa_min_a_d __builtin_msa_min_a_d
|
||||
#define __msa_ceq_b __builtin_msa_ceq_b
|
||||
#define __msa_ceq_h __builtin_msa_ceq_h
|
||||
#define __msa_ceq_w __builtin_msa_ceq_w
|
||||
#define __msa_ceq_d __builtin_msa_ceq_d
|
||||
#define __msa_ceqi_b __builtin_msa_ceqi_b
|
||||
#define __msa_ceqi_h __builtin_msa_ceqi_h
|
||||
#define __msa_ceqi_w __builtin_msa_ceqi_w
|
||||
#define __msa_ceqi_d __builtin_msa_ceqi_d
|
||||
#define __msa_clt_s_b __builtin_msa_clt_s_b
|
||||
#define __msa_clt_s_h __builtin_msa_clt_s_h
|
||||
#define __msa_clt_s_w __builtin_msa_clt_s_w
|
||||
#define __msa_clt_s_d __builtin_msa_clt_s_d
|
||||
#define __msa_clti_s_b __builtin_msa_clti_s_b
|
||||
#define __msa_clti_s_h __builtin_msa_clti_s_h
|
||||
#define __msa_clti_s_w __builtin_msa_clti_s_w
|
||||
#define __msa_clti_s_d __builtin_msa_clti_s_d
|
||||
#define __msa_clt_u_b __builtin_msa_clt_u_b
|
||||
#define __msa_clt_u_h __builtin_msa_clt_u_h
|
||||
#define __msa_clt_u_w __builtin_msa_clt_u_w
|
||||
#define __msa_clt_u_d __builtin_msa_clt_u_d
|
||||
#define __msa_clti_u_b __builtin_msa_clti_u_b
|
||||
#define __msa_clti_u_h __builtin_msa_clti_u_h
|
||||
#define __msa_clti_u_w __builtin_msa_clti_u_w
|
||||
#define __msa_clti_u_d __builtin_msa_clti_u_d
|
||||
#define __msa_cle_s_b __builtin_msa_cle_s_b
|
||||
#define __msa_cle_s_h __builtin_msa_cle_s_h
|
||||
#define __msa_cle_s_w __builtin_msa_cle_s_w
|
||||
#define __msa_cle_s_d __builtin_msa_cle_s_d
|
||||
#define __msa_clei_s_b __builtin_msa_clei_s_b
|
||||
#define __msa_clei_s_h __builtin_msa_clei_s_h
|
||||
#define __msa_clei_s_w __builtin_msa_clei_s_w
|
||||
#define __msa_clei_s_d __builtin_msa_clei_s_d
|
||||
#define __msa_cle_u_b __builtin_msa_cle_u_b
|
||||
#define __msa_cle_u_h __builtin_msa_cle_u_h
|
||||
#define __msa_cle_u_w __builtin_msa_cle_u_w
|
||||
#define __msa_cle_u_d __builtin_msa_cle_u_d
|
||||
#define __msa_clei_u_b __builtin_msa_clei_u_b
|
||||
#define __msa_clei_u_h __builtin_msa_clei_u_h
|
||||
#define __msa_clei_u_w __builtin_msa_clei_u_w
|
||||
#define __msa_clei_u_d __builtin_msa_clei_u_d
|
||||
#define __msa_ld_b __builtin_msa_ld_b
|
||||
#define __msa_ld_h __builtin_msa_ld_h
|
||||
#define __msa_ld_w __builtin_msa_ld_w
|
||||
#define __msa_ld_d __builtin_msa_ld_d
|
||||
#define __msa_st_b __builtin_msa_st_b
|
||||
#define __msa_st_h __builtin_msa_st_h
|
||||
#define __msa_st_w __builtin_msa_st_w
|
||||
#define __msa_st_d __builtin_msa_st_d
|
||||
#define __msa_sat_s_b __builtin_msa_sat_s_b
|
||||
#define __msa_sat_s_h __builtin_msa_sat_s_h
|
||||
#define __msa_sat_s_w __builtin_msa_sat_s_w
|
||||
#define __msa_sat_s_d __builtin_msa_sat_s_d
|
||||
#define __msa_sat_u_b __builtin_msa_sat_u_b
|
||||
#define __msa_sat_u_h __builtin_msa_sat_u_h
|
||||
#define __msa_sat_u_w __builtin_msa_sat_u_w
|
||||
#define __msa_sat_u_d __builtin_msa_sat_u_d
|
||||
#define __msa_add_a_b __builtin_msa_add_a_b
|
||||
#define __msa_add_a_h __builtin_msa_add_a_h
|
||||
#define __msa_add_a_w __builtin_msa_add_a_w
|
||||
#define __msa_add_a_d __builtin_msa_add_a_d
|
||||
#define __msa_adds_a_b __builtin_msa_adds_a_b
|
||||
#define __msa_adds_a_h __builtin_msa_adds_a_h
|
||||
#define __msa_adds_a_w __builtin_msa_adds_a_w
|
||||
#define __msa_adds_a_d __builtin_msa_adds_a_d
|
||||
#define __msa_adds_s_b __builtin_msa_adds_s_b
|
||||
#define __msa_adds_s_h __builtin_msa_adds_s_h
|
||||
#define __msa_adds_s_w __builtin_msa_adds_s_w
|
||||
#define __msa_adds_s_d __builtin_msa_adds_s_d
|
||||
#define __msa_adds_u_b __builtin_msa_adds_u_b
|
||||
#define __msa_adds_u_h __builtin_msa_adds_u_h
|
||||
#define __msa_adds_u_w __builtin_msa_adds_u_w
|
||||
#define __msa_adds_u_d __builtin_msa_adds_u_d
|
||||
#define __msa_ave_s_b __builtin_msa_ave_s_b
|
||||
#define __msa_ave_s_h __builtin_msa_ave_s_h
|
||||
#define __msa_ave_s_w __builtin_msa_ave_s_w
|
||||
#define __msa_ave_s_d __builtin_msa_ave_s_d
|
||||
#define __msa_ave_u_b __builtin_msa_ave_u_b
|
||||
#define __msa_ave_u_h __builtin_msa_ave_u_h
|
||||
#define __msa_ave_u_w __builtin_msa_ave_u_w
|
||||
#define __msa_ave_u_d __builtin_msa_ave_u_d
|
||||
#define __msa_aver_s_b __builtin_msa_aver_s_b
|
||||
#define __msa_aver_s_h __builtin_msa_aver_s_h
|
||||
#define __msa_aver_s_w __builtin_msa_aver_s_w
|
||||
#define __msa_aver_s_d __builtin_msa_aver_s_d
|
||||
#define __msa_aver_u_b __builtin_msa_aver_u_b
|
||||
#define __msa_aver_u_h __builtin_msa_aver_u_h
|
||||
#define __msa_aver_u_w __builtin_msa_aver_u_w
|
||||
#define __msa_aver_u_d __builtin_msa_aver_u_d
|
||||
#define __msa_subs_s_b __builtin_msa_subs_s_b
|
||||
#define __msa_subs_s_h __builtin_msa_subs_s_h
|
||||
#define __msa_subs_s_w __builtin_msa_subs_s_w
|
||||
#define __msa_subs_s_d __builtin_msa_subs_s_d
|
||||
#define __msa_subs_u_b __builtin_msa_subs_u_b
|
||||
#define __msa_subs_u_h __builtin_msa_subs_u_h
|
||||
#define __msa_subs_u_w __builtin_msa_subs_u_w
|
||||
#define __msa_subs_u_d __builtin_msa_subs_u_d
|
||||
#define __msa_subsuu_s_b __builtin_msa_subsuu_s_b
|
||||
#define __msa_subsuu_s_h __builtin_msa_subsuu_s_h
|
||||
#define __msa_subsuu_s_w __builtin_msa_subsuu_s_w
|
||||
#define __msa_subsuu_s_d __builtin_msa_subsuu_s_d
|
||||
#define __msa_subsus_u_b __builtin_msa_subsus_u_b
|
||||
#define __msa_subsus_u_h __builtin_msa_subsus_u_h
|
||||
#define __msa_subsus_u_w __builtin_msa_subsus_u_w
|
||||
#define __msa_subsus_u_d __builtin_msa_subsus_u_d
|
||||
#define __msa_asub_s_b __builtin_msa_asub_s_b
|
||||
#define __msa_asub_s_h __builtin_msa_asub_s_h
|
||||
#define __msa_asub_s_w __builtin_msa_asub_s_w
|
||||
#define __msa_asub_s_d __builtin_msa_asub_s_d
|
||||
#define __msa_asub_u_b __builtin_msa_asub_u_b
|
||||
#define __msa_asub_u_h __builtin_msa_asub_u_h
|
||||
#define __msa_asub_u_w __builtin_msa_asub_u_w
|
||||
#define __msa_asub_u_d __builtin_msa_asub_u_d
|
||||
#define __msa_mulv_b __builtin_msa_mulv_b
|
||||
#define __msa_mulv_h __builtin_msa_mulv_h
|
||||
#define __msa_mulv_w __builtin_msa_mulv_w
|
||||
#define __msa_mulv_d __builtin_msa_mulv_d
|
||||
#define __msa_maddv_b __builtin_msa_maddv_b
|
||||
#define __msa_maddv_h __builtin_msa_maddv_h
|
||||
#define __msa_maddv_w __builtin_msa_maddv_w
|
||||
#define __msa_maddv_d __builtin_msa_maddv_d
|
||||
#define __msa_msubv_b __builtin_msa_msubv_b
|
||||
#define __msa_msubv_h __builtin_msa_msubv_h
|
||||
#define __msa_msubv_w __builtin_msa_msubv_w
|
||||
#define __msa_msubv_d __builtin_msa_msubv_d
|
||||
#define __msa_div_s_b __builtin_msa_div_s_b
|
||||
#define __msa_div_s_h __builtin_msa_div_s_h
|
||||
#define __msa_div_s_w __builtin_msa_div_s_w
|
||||
#define __msa_div_s_d __builtin_msa_div_s_d
|
||||
#define __msa_div_u_b __builtin_msa_div_u_b
|
||||
#define __msa_div_u_h __builtin_msa_div_u_h
|
||||
#define __msa_div_u_w __builtin_msa_div_u_w
|
||||
#define __msa_div_u_d __builtin_msa_div_u_d
|
||||
#define __msa_hadd_s_h __builtin_msa_hadd_s_h
|
||||
#define __msa_hadd_s_w __builtin_msa_hadd_s_w
|
||||
#define __msa_hadd_s_d __builtin_msa_hadd_s_d
|
||||
#define __msa_hadd_u_h __builtin_msa_hadd_u_h
|
||||
#define __msa_hadd_u_w __builtin_msa_hadd_u_w
|
||||
#define __msa_hadd_u_d __builtin_msa_hadd_u_d
|
||||
#define __msa_hsub_s_h __builtin_msa_hsub_s_h
|
||||
#define __msa_hsub_s_w __builtin_msa_hsub_s_w
|
||||
#define __msa_hsub_s_d __builtin_msa_hsub_s_d
|
||||
#define __msa_hsub_u_h __builtin_msa_hsub_u_h
|
||||
#define __msa_hsub_u_w __builtin_msa_hsub_u_w
|
||||
#define __msa_hsub_u_d __builtin_msa_hsub_u_d
|
||||
#define __msa_mod_s_b __builtin_msa_mod_s_b
|
||||
#define __msa_mod_s_h __builtin_msa_mod_s_h
|
||||
#define __msa_mod_s_w __builtin_msa_mod_s_w
|
||||
#define __msa_mod_s_d __builtin_msa_mod_s_d
|
||||
#define __msa_mod_u_b __builtin_msa_mod_u_b
|
||||
#define __msa_mod_u_h __builtin_msa_mod_u_h
|
||||
#define __msa_mod_u_w __builtin_msa_mod_u_w
|
||||
#define __msa_mod_u_d __builtin_msa_mod_u_d
|
||||
#define __msa_dotp_s_h __builtin_msa_dotp_s_h
|
||||
#define __msa_dotp_s_w __builtin_msa_dotp_s_w
|
||||
#define __msa_dotp_s_d __builtin_msa_dotp_s_d
|
||||
#define __msa_dotp_u_h __builtin_msa_dotp_u_h
|
||||
#define __msa_dotp_u_w __builtin_msa_dotp_u_w
|
||||
#define __msa_dotp_u_d __builtin_msa_dotp_u_d
|
||||
#define __msa_dpadd_s_h __builtin_msa_dpadd_s_h
|
||||
#define __msa_dpadd_s_w __builtin_msa_dpadd_s_w
|
||||
#define __msa_dpadd_s_d __builtin_msa_dpadd_s_d
|
||||
#define __msa_dpadd_u_h __builtin_msa_dpadd_u_h
|
||||
#define __msa_dpadd_u_w __builtin_msa_dpadd_u_w
|
||||
#define __msa_dpadd_u_d __builtin_msa_dpadd_u_d
|
||||
#define __msa_dpsub_s_h __builtin_msa_dpsub_s_h
|
||||
#define __msa_dpsub_s_w __builtin_msa_dpsub_s_w
|
||||
#define __msa_dpsub_s_d __builtin_msa_dpsub_s_d
|
||||
#define __msa_dpsub_u_h __builtin_msa_dpsub_u_h
|
||||
#define __msa_dpsub_u_w __builtin_msa_dpsub_u_w
|
||||
#define __msa_dpsub_u_d __builtin_msa_dpsub_u_d
|
||||
#define __msa_sld_b __builtin_msa_sld_b
|
||||
#define __msa_sld_h __builtin_msa_sld_h
|
||||
#define __msa_sld_w __builtin_msa_sld_w
|
||||
#define __msa_sld_d __builtin_msa_sld_d
|
||||
#define __msa_sldi_b __builtin_msa_sldi_b
|
||||
#define __msa_sldi_h __builtin_msa_sldi_h
|
||||
#define __msa_sldi_w __builtin_msa_sldi_w
|
||||
#define __msa_sldi_d __builtin_msa_sldi_d
|
||||
#define __msa_splat_b __builtin_msa_splat_b
|
||||
#define __msa_splat_h __builtin_msa_splat_h
|
||||
#define __msa_splat_w __builtin_msa_splat_w
|
||||
#define __msa_splat_d __builtin_msa_splat_d
|
||||
#define __msa_splati_b __builtin_msa_splati_b
|
||||
#define __msa_splati_h __builtin_msa_splati_h
|
||||
#define __msa_splati_w __builtin_msa_splati_w
|
||||
#define __msa_splati_d __builtin_msa_splati_d
|
||||
#define __msa_pckev_b __builtin_msa_pckev_b
|
||||
#define __msa_pckev_h __builtin_msa_pckev_h
|
||||
#define __msa_pckev_w __builtin_msa_pckev_w
|
||||
#define __msa_pckev_d __builtin_msa_pckev_d
|
||||
#define __msa_pckod_b __builtin_msa_pckod_b
|
||||
#define __msa_pckod_h __builtin_msa_pckod_h
|
||||
#define __msa_pckod_w __builtin_msa_pckod_w
|
||||
#define __msa_pckod_d __builtin_msa_pckod_d
|
||||
#define __msa_ilvl_b __builtin_msa_ilvl_b
|
||||
#define __msa_ilvl_h __builtin_msa_ilvl_h
|
||||
#define __msa_ilvl_w __builtin_msa_ilvl_w
|
||||
#define __msa_ilvl_d __builtin_msa_ilvl_d
|
||||
#define __msa_ilvr_b __builtin_msa_ilvr_b
|
||||
#define __msa_ilvr_h __builtin_msa_ilvr_h
|
||||
#define __msa_ilvr_w __builtin_msa_ilvr_w
|
||||
#define __msa_ilvr_d __builtin_msa_ilvr_d
|
||||
#define __msa_ilvev_b __builtin_msa_ilvev_b
|
||||
#define __msa_ilvev_h __builtin_msa_ilvev_h
|
||||
#define __msa_ilvev_w __builtin_msa_ilvev_w
|
||||
#define __msa_ilvev_d __builtin_msa_ilvev_d
|
||||
#define __msa_ilvod_b __builtin_msa_ilvod_b
|
||||
#define __msa_ilvod_h __builtin_msa_ilvod_h
|
||||
#define __msa_ilvod_w __builtin_msa_ilvod_w
|
||||
#define __msa_ilvod_d __builtin_msa_ilvod_d
|
||||
#define __msa_vshf_b __builtin_msa_vshf_b
|
||||
#define __msa_vshf_h __builtin_msa_vshf_h
|
||||
#define __msa_vshf_w __builtin_msa_vshf_w
|
||||
#define __msa_vshf_d __builtin_msa_vshf_d
|
||||
#define __msa_and_v __builtin_msa_and_v
|
||||
#define __msa_andi_b __builtin_msa_andi_b
|
||||
#define __msa_or_v __builtin_msa_or_v
|
||||
#define __msa_ori_b __builtin_msa_ori_b
|
||||
#define __msa_nor_v __builtin_msa_nor_v
|
||||
#define __msa_nori_b __builtin_msa_nori_b
|
||||
#define __msa_xor_v __builtin_msa_xor_v
|
||||
#define __msa_xori_b __builtin_msa_xori_b
|
||||
#define __msa_bmnz_v __builtin_msa_bmnz_v
|
||||
#define __msa_bmnzi_b __builtin_msa_bmnzi_b
|
||||
#define __msa_bmz_v __builtin_msa_bmz_v
|
||||
#define __msa_bmzi_b __builtin_msa_bmzi_b
|
||||
#define __msa_bsel_v __builtin_msa_bsel_v
|
||||
#define __msa_bseli_b __builtin_msa_bseli_b
|
||||
#define __msa_shf_b __builtin_msa_shf_b
|
||||
#define __msa_shf_h __builtin_msa_shf_h
|
||||
#define __msa_shf_w __builtin_msa_shf_w
|
||||
#define __msa_test_bnz_v __builtin_msa_bnz_v
|
||||
#define __msa_test_bz_v __builtin_msa_bz_v
|
||||
#define __msa_fill_b __builtin_msa_fill_b
|
||||
#define __msa_fill_h __builtin_msa_fill_h
|
||||
#define __msa_fill_w __builtin_msa_fill_w
|
||||
#define __msa_fill_d __builtin_msa_fill_d
|
||||
#define __msa_pcnt_b __builtin_msa_pcnt_b
|
||||
#define __msa_pcnt_h __builtin_msa_pcnt_h
|
||||
#define __msa_pcnt_w __builtin_msa_pcnt_w
|
||||
#define __msa_pcnt_d __builtin_msa_pcnt_d
|
||||
#define __msa_nloc_b __builtin_msa_nloc_b
|
||||
#define __msa_nloc_h __builtin_msa_nloc_h
|
||||
#define __msa_nloc_w __builtin_msa_nloc_w
|
||||
#define __msa_nloc_d __builtin_msa_nloc_d
|
||||
#define __msa_nlzc_b __builtin_msa_nlzc_b
|
||||
#define __msa_nlzc_h __builtin_msa_nlzc_h
|
||||
#define __msa_nlzc_w __builtin_msa_nlzc_w
|
||||
#define __msa_nlzc_d __builtin_msa_nlzc_d
|
||||
#define __msa_copy_s_b __builtin_msa_copy_s_b
|
||||
#define __msa_copy_s_h __builtin_msa_copy_s_h
|
||||
#define __msa_copy_s_w __builtin_msa_copy_s_w
|
||||
#define __msa_copy_s_d __builtin_msa_copy_s_d
|
||||
#define __msa_copy_u_b __builtin_msa_copy_u_b
|
||||
#define __msa_copy_u_h __builtin_msa_copy_u_h
|
||||
#define __msa_copy_u_w __builtin_msa_copy_u_w
|
||||
#define __msa_copy_u_d __builtin_msa_copy_u_d
|
||||
#define __msa_insert_b __builtin_msa_insert_b
|
||||
#define __msa_insert_h __builtin_msa_insert_h
|
||||
#define __msa_insert_w __builtin_msa_insert_w
|
||||
#define __msa_insert_d __builtin_msa_insert_d
|
||||
#define __msa_insve_b __builtin_msa_insve_b
|
||||
#define __msa_insve_h __builtin_msa_insve_h
|
||||
#define __msa_insve_w __builtin_msa_insve_w
|
||||
#define __msa_insve_d __builtin_msa_insve_d
|
||||
#define __msa_test_bnz_b __builtin_msa_bnz_b
|
||||
#define __msa_test_bnz_h __builtin_msa_bnz_h
|
||||
#define __msa_test_bnz_w __builtin_msa_bnz_w
|
||||
#define __msa_test_bnz_d __builtin_msa_bnz_d
|
||||
#define __msa_test_bz_b __builtin_msa_bz_b
|
||||
#define __msa_test_bz_h __builtin_msa_bz_h
|
||||
#define __msa_test_bz_w __builtin_msa_bz_w
|
||||
#define __msa_test_bz_d __builtin_msa_bz_d
|
||||
#define __msa_ldi_b __builtin_msa_ldi_b
|
||||
#define __msa_ldi_h __builtin_msa_ldi_h
|
||||
#define __msa_ldi_w __builtin_msa_ldi_w
|
||||
#define __msa_ldi_d __builtin_msa_ldi_d
|
||||
#define __msa_fcaf_w __builtin_msa_fcaf_w
|
||||
#define __msa_fcaf_d __builtin_msa_fcaf_d
|
||||
#define __msa_fcor_w __builtin_msa_fcor_w
|
||||
#define __msa_fcor_d __builtin_msa_fcor_d
|
||||
#define __msa_fcun_w __builtin_msa_fcun_w
|
||||
#define __msa_fcun_d __builtin_msa_fcun_d
|
||||
#define __msa_fcune_w __builtin_msa_fcune_w
|
||||
#define __msa_fcune_d __builtin_msa_fcune_d
|
||||
#define __msa_fcueq_w __builtin_msa_fcueq_w
|
||||
#define __msa_fcueq_d __builtin_msa_fcueq_d
|
||||
#define __msa_fceq_w __builtin_msa_fceq_w
|
||||
#define __msa_fceq_d __builtin_msa_fceq_d
|
||||
#define __msa_fcne_w __builtin_msa_fcne_w
|
||||
#define __msa_fcne_d __builtin_msa_fcne_d
|
||||
#define __msa_fclt_w __builtin_msa_fclt_w
|
||||
#define __msa_fclt_d __builtin_msa_fclt_d
|
||||
#define __msa_fcult_w __builtin_msa_fcult_w
|
||||
#define __msa_fcult_d __builtin_msa_fcult_d
|
||||
#define __msa_fcle_w __builtin_msa_fcle_w
|
||||
#define __msa_fcle_d __builtin_msa_fcle_d
|
||||
#define __msa_fcule_w __builtin_msa_fcule_w
|
||||
#define __msa_fcule_d __builtin_msa_fcule_d
|
||||
#define __msa_fsaf_w __builtin_msa_fsaf_w
|
||||
#define __msa_fsaf_d __builtin_msa_fsaf_d
|
||||
#define __msa_fsor_w __builtin_msa_fsor_w
|
||||
#define __msa_fsor_d __builtin_msa_fsor_d
|
||||
#define __msa_fsun_w __builtin_msa_fsun_w
|
||||
#define __msa_fsun_d __builtin_msa_fsun_d
|
||||
#define __msa_fsune_w __builtin_msa_fsune_w
|
||||
#define __msa_fsune_d __builtin_msa_fsune_d
|
||||
#define __msa_fsueq_w __builtin_msa_fsueq_w
|
||||
#define __msa_fsueq_d __builtin_msa_fsueq_d
|
||||
#define __msa_fseq_w __builtin_msa_fseq_w
|
||||
#define __msa_fseq_d __builtin_msa_fseq_d
|
||||
#define __msa_fsne_w __builtin_msa_fsne_w
|
||||
#define __msa_fsne_d __builtin_msa_fsne_d
|
||||
#define __msa_fslt_w __builtin_msa_fslt_w
|
||||
#define __msa_fslt_d __builtin_msa_fslt_d
|
||||
#define __msa_fsult_w __builtin_msa_fsult_w
|
||||
#define __msa_fsult_d __builtin_msa_fsult_d
|
||||
#define __msa_fsle_w __builtin_msa_fsle_w
|
||||
#define __msa_fsle_d __builtin_msa_fsle_d
|
||||
#define __msa_fsule_w __builtin_msa_fsule_w
|
||||
#define __msa_fsule_d __builtin_msa_fsule_d
|
||||
#define __msa_fadd_w __builtin_msa_fadd_w
|
||||
#define __msa_fadd_d __builtin_msa_fadd_d
|
||||
#define __msa_fsub_w __builtin_msa_fsub_w
|
||||
#define __msa_fsub_d __builtin_msa_fsub_d
|
||||
#define __msa_fmul_w __builtin_msa_fmul_w
|
||||
#define __msa_fmul_d __builtin_msa_fmul_d
|
||||
#define __msa_fdiv_w __builtin_msa_fdiv_w
|
||||
#define __msa_fdiv_d __builtin_msa_fdiv_d
|
||||
#define __msa_fmadd_w __builtin_msa_fmadd_w
|
||||
#define __msa_fmadd_d __builtin_msa_fmadd_d
|
||||
#define __msa_fmsub_w __builtin_msa_fmsub_w
|
||||
#define __msa_fmsub_d __builtin_msa_fmsub_d
|
||||
#define __msa_fexp2_w __builtin_msa_fexp2_w
|
||||
#define __msa_fexp2_d __builtin_msa_fexp2_d
|
||||
#define __msa_fexdo_h __builtin_msa_fexdo_h
|
||||
#define __msa_fexdo_w __builtin_msa_fexdo_w
|
||||
#define __msa_ftq_h __builtin_msa_ftq_h
|
||||
#define __msa_ftq_w __builtin_msa_ftq_w
|
||||
#define __msa_fmin_w __builtin_msa_fmin_w
|
||||
#define __msa_fmin_d __builtin_msa_fmin_d
|
||||
#define __msa_fmin_a_w __builtin_msa_fmin_a_w
|
||||
#define __msa_fmin_a_d __builtin_msa_fmin_a_d
|
||||
#define __msa_fmax_w __builtin_msa_fmax_w
|
||||
#define __msa_fmax_d __builtin_msa_fmax_d
|
||||
#define __msa_fmax_a_w __builtin_msa_fmax_a_w
|
||||
#define __msa_fmax_a_d __builtin_msa_fmax_a_d
|
||||
#define __msa_mul_q_h __builtin_msa_mul_q_h
|
||||
#define __msa_mul_q_w __builtin_msa_mul_q_w
|
||||
#define __msa_mulr_q_h __builtin_msa_mulr_q_h
|
||||
#define __msa_mulr_q_w __builtin_msa_mulr_q_w
|
||||
#define __msa_madd_q_h __builtin_msa_madd_q_h
|
||||
#define __msa_madd_q_w __builtin_msa_madd_q_w
|
||||
#define __msa_maddr_q_h __builtin_msa_maddr_q_h
|
||||
#define __msa_maddr_q_w __builtin_msa_maddr_q_w
|
||||
#define __msa_msub_q_h __builtin_msa_msub_q_h
|
||||
#define __msa_msub_q_w __builtin_msa_msub_q_w
|
||||
#define __msa_msubr_q_h __builtin_msa_msubr_q_h
|
||||
#define __msa_msubr_q_w __builtin_msa_msubr_q_w
|
||||
#define __msa_fclass_w __builtin_msa_fclass_w
|
||||
#define __msa_fclass_d __builtin_msa_fclass_d
|
||||
#define __msa_fsqrt_w __builtin_msa_fsqrt_w
|
||||
#define __msa_fsqrt_d __builtin_msa_fsqrt_d
|
||||
#define __msa_frcp_w __builtin_msa_frcp_w
|
||||
#define __msa_frcp_d __builtin_msa_frcp_d
|
||||
#define __msa_frint_w __builtin_msa_frint_w
|
||||
#define __msa_frint_d __builtin_msa_frint_d
|
||||
#define __msa_frsqrt_w __builtin_msa_frsqrt_w
|
||||
#define __msa_frsqrt_d __builtin_msa_frsqrt_d
|
||||
#define __msa_flog2_w __builtin_msa_flog2_w
|
||||
#define __msa_flog2_d __builtin_msa_flog2_d
|
||||
#define __msa_fexupl_w __builtin_msa_fexupl_w
|
||||
#define __msa_fexupl_d __builtin_msa_fexupl_d
|
||||
#define __msa_fexupr_w __builtin_msa_fexupr_w
|
||||
#define __msa_fexupr_d __builtin_msa_fexupr_d
|
||||
#define __msa_ffql_w __builtin_msa_ffql_w
|
||||
#define __msa_ffql_d __builtin_msa_ffql_d
|
||||
#define __msa_ffqr_w __builtin_msa_ffqr_w
|
||||
#define __msa_ffqr_d __builtin_msa_ffqr_d
|
||||
#define __msa_ftint_s_w __builtin_msa_ftint_s_w
|
||||
#define __msa_ftint_s_d __builtin_msa_ftint_s_d
|
||||
#define __msa_ftint_u_w __builtin_msa_ftint_u_w
|
||||
#define __msa_ftint_u_d __builtin_msa_ftint_u_d
|
||||
#define __msa_ftrunc_s_w __builtin_msa_ftrunc_s_w
|
||||
#define __msa_ftrunc_s_d __builtin_msa_ftrunc_s_d
|
||||
#define __msa_ftrunc_u_w __builtin_msa_ftrunc_u_w
|
||||
#define __msa_ftrunc_u_d __builtin_msa_ftrunc_u_d
|
||||
#define __msa_ffint_s_w __builtin_msa_ffint_s_w
|
||||
#define __msa_ffint_s_d __builtin_msa_ffint_s_d
|
||||
#define __msa_ffint_u_w __builtin_msa_ffint_u_w
|
||||
#define __msa_ffint_u_d __builtin_msa_ffint_u_d
|
||||
#define __msa_cfcmsa __builtin_msa_cfcmsa
|
||||
#define __msa_move_v __builtin_msa_move_v
|
||||
#define __msa_cast_to_vector_float __builtin_msa_cast_to_vector_float
|
||||
#define __msa_cast_to_vector_double __builtin_msa_cast_to_vector_double
|
||||
#define __msa_cast_to_scalar_float __builtin_msa_cast_to_scalar_float
|
||||
#define __msa_cast_to_scalar_double __builtin_msa_cast_to_scalar_double
|
||||
#endif /* defined(__mips_msa) */
|
||||
#endif /* _MSA_H */
|
|
@ -0,0 +1,47 @@
|
|||
/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __X86INTRIN_H
|
||||
#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef _MWAITXINTRIN_H
|
||||
#define _MWAITXINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mwaitx")))
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_monitorx(void const * __p, unsigned __extensions, unsigned __hints)
|
||||
{
|
||||
__builtin_ia32_monitorx((void *)__p, __extensions, __hints);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
|
||||
{
|
||||
__builtin_ia32_mwaitx(__extensions, __hints, __clock);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* _MWAITXINTRIN_H */
|
|
@ -24,12 +24,7 @@
|
|||
#ifndef _NMMINTRIN_H
|
||||
#define _NMMINTRIN_H
|
||||
|
||||
#ifndef __SSE4_2__
|
||||
#error "SSE4.2 instruction set not enabled"
|
||||
#else
|
||||
|
||||
/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
|
||||
just include it now then. */
|
||||
#include <smmintrin.h>
|
||||
#endif /* __SSE4_2__ */
|
||||
#endif /* _NMMINTRIN_H */
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,48 @@
|
|||
/*===------------- pkuintrin.h - PKU intrinsics ------------------===
|
||||
*
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __PKUINTRIN_H
|
||||
#define __PKUINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku")))
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
_rdpkru_u32(void)
|
||||
{
|
||||
return __builtin_ia32_rdpkru();
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_wrpkru(unsigned int __val)
|
||||
{
|
||||
return __builtin_ia32_wrpkru(__val);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -20,79 +20,241 @@
|
|||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __PMMINTRIN_H
|
||||
#define __PMMINTRIN_H
|
||||
|
||||
#ifndef __SSE3__
|
||||
#error "SSE3 instruction set not enabled"
|
||||
#else
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("sse3")))
|
||||
|
||||
/// \brief Loads data from an unaligned memory location to elements in a 128-bit
|
||||
/// vector. If the address of the data is not 16-byte aligned, the
|
||||
/// instruction may read two adjacent aligned blocks of memory to retrieve
|
||||
/// the requested data.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
|
||||
///
|
||||
/// \param __p
|
||||
/// A pointer to a 128-bit integer vector containing integer values.
|
||||
/// \returns A 128-bit vector containing the moved values.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_lddqu_si128(__m128i const *__p)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_lddqu((char const *)__p);
|
||||
}
|
||||
|
||||
/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
|
||||
/// two 128-bit vectors of [4 x float].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [4 x float] containing the left source operand.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [4 x float] containing the right source operand.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
|
||||
/// differences of both operands.
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_addsub_ps(__m128 __a, __m128 __b)
|
||||
{
|
||||
return __builtin_ia32_addsubps(__a, __b);
|
||||
return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally adds the adjacent pairs of values contained in two
|
||||
/// 128-bit vectors of [4 x float].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [4 x float] containing one of the source operands.
|
||||
/// The horizontal sums of the values are stored in the lower bits of the
|
||||
/// destination.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [4 x float] containing one of the source operands.
|
||||
/// The horizontal sums of the values are stored in the upper bits of the
|
||||
/// destination.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
|
||||
/// both operands.
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_hadd_ps(__m128 __a, __m128 __b)
|
||||
{
|
||||
return __builtin_ia32_haddps(__a, __b);
|
||||
return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally subtracts the adjacent pairs of values contained in two
|
||||
/// 128-bit vectors of [4 x float].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [4 x float] containing one of the source operands.
|
||||
/// The horizontal differences between the values are stored in the lower
|
||||
/// bits of the destination.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [4 x float] containing one of the source operands.
|
||||
/// The horizontal differences between the values are stored in the upper
|
||||
/// bits of the destination.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the horizontal
|
||||
/// differences of both operands.
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_hsub_ps(__m128 __a, __m128 __b)
|
||||
{
|
||||
return __builtin_ia32_hsubps(__a, __b);
|
||||
return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
|
||||
}
|
||||
|
||||
/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
|
||||
/// vector of [4 x float] to float values stored in a 128-bit vector of
|
||||
/// [4 x float].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [4 x float]. \n
|
||||
/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
|
||||
/// the destination. \n
|
||||
/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
|
||||
/// destination.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
|
||||
/// values.
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_movehdup_ps(__m128 __a)
|
||||
{
|
||||
return __builtin_shufflevector(__a, __a, 1, 1, 3, 3);
|
||||
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
|
||||
}
|
||||
|
||||
/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
|
||||
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [4 x float] \n
|
||||
/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
|
||||
/// the destination. \n
|
||||
/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
|
||||
/// destination.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
|
||||
/// values.
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_moveldup_ps(__m128 __a)
|
||||
{
|
||||
return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
|
||||
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
|
||||
}
|
||||
|
||||
/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
|
||||
/// two 128-bit vectors of [2 x double].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [2 x double] containing the left source operand.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [2 x double] containing the right source operand.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the alternating sums
|
||||
/// and differences of both operands.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_addsub_pd(__m128d __a, __m128d __b)
|
||||
{
|
||||
return __builtin_ia32_addsubpd(__a, __b);
|
||||
return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally adds the pairs of values contained in two 128-bit
|
||||
/// vectors of [2 x double].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [2 x double] containing one of the source operands.
|
||||
/// The horizontal sum of the values is stored in the lower bits of the
|
||||
/// destination.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [2 x double] containing one of the source operands.
|
||||
/// The horizontal sum of the values is stored in the upper bits of the
|
||||
/// destination.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
|
||||
/// both operands.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_hadd_pd(__m128d __a, __m128d __b)
|
||||
{
|
||||
return __builtin_ia32_haddpd(__a, __b);
|
||||
return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally subtracts the pairs of values contained in two 128-bit
|
||||
/// vectors of [2 x double].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [2 x double] containing one of the source operands.
|
||||
/// The horizontal difference of the values is stored in the lower bits of
|
||||
/// the destination.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [2 x double] containing one of the source operands.
|
||||
/// The horizontal difference of the values is stored in the upper bits of
|
||||
/// the destination.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the horizontal
|
||||
/// differences of both operands.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_hsub_pd(__m128d __a, __m128d __b)
|
||||
{
|
||||
return __builtin_ia32_hsubpd(__a, __b);
|
||||
return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
|
||||
}
|
||||
|
||||
/// \brief Moves and duplicates one double-precision value to double-precision
|
||||
/// values stored in a 128-bit vector of [2 x double].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// __m128d _mm_loaddup_pd(double const * dp);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
|
||||
///
|
||||
/// \param dp
|
||||
/// A pointer to a double-precision value to be moved and duplicated.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the moved and
|
||||
/// duplicated values.
|
||||
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
|
||||
|
||||
/// \brief Moves and duplicates the double-precision value in the lower bits of
|
||||
/// a 128-bit vector of [2 x double] to double-precision values stored in a
|
||||
/// 128-bit vector of [2 x double].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
|
||||
/// [127:64] and [63:0] of the destination.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the moved and
|
||||
/// duplicated values.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
||||
_mm_movedup_pd(__m128d __a)
|
||||
{
|
||||
return __builtin_shufflevector(__a, __a, 0, 0);
|
||||
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
|
||||
}
|
||||
|
||||
#define _MM_DENORMALS_ZERO_ON (0x0040)
|
||||
|
@ -103,12 +265,40 @@ _mm_movedup_pd(__m128d __a)
|
|||
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
|
||||
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
|
||||
|
||||
/// \brief Establishes a linear address memory range to be monitored and puts
|
||||
/// the processor in the monitor event pending state. Data stored in the
|
||||
/// monitored address range causes the processor to exit the pending state.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
|
||||
///
|
||||
/// \param __p
|
||||
/// The memory range to be monitored. The size of the range is determined by
|
||||
/// CPUID function 0000_0005h.
|
||||
/// \param __extensions
|
||||
/// Optional extensions for the monitoring state.
|
||||
/// \param __hints
|
||||
/// Optional hints for the monitoring state.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
|
||||
{
|
||||
__builtin_ia32_monitor((void *)__p, __extensions, __hints);
|
||||
}
|
||||
|
||||
/// \brief Used with the MONITOR instruction to wait while the processor is in
|
||||
/// the monitor event pending state. Data stored in the monitored address
|
||||
/// range causes the processor to exit the pending state.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
|
||||
///
|
||||
/// \param __extensions
|
||||
/// Optional extensions for the monitoring state, which may vary by
|
||||
/// processor.
|
||||
/// \param __hints
|
||||
/// Optional hints for the monitoring state, which may vary by processor.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_mm_mwait(unsigned __extensions, unsigned __hints)
|
||||
{
|
||||
|
@ -117,6 +307,4 @@ _mm_mwait(unsigned __extensions, unsigned __hints)
|
|||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __SSE3__ */
|
||||
|
||||
#endif /* __PMMINTRIN_H */
|
||||
|
|
|
@ -21,28 +21,76 @@
|
|||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __POPCNT__
|
||||
#error "POPCNT instruction set not enabled"
|
||||
#endif
|
||||
|
||||
#ifndef _POPCNTINTRIN_H
|
||||
#define _POPCNTINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
|
||||
|
||||
/// \brief Counts the number of bits in the source operand having a value of 1.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// An unsigned 32-bit integer operand.
|
||||
/// \returns A 32-bit integer containing the number of bits with value 1 in the
|
||||
/// source operand.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_mm_popcnt_u32(unsigned int __A)
|
||||
{
|
||||
return __builtin_popcount(__A);
|
||||
}
|
||||
|
||||
/// \brief Counts the number of bits in the source operand having a value of 1.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A signed 32-bit integer operand.
|
||||
/// \returns A 32-bit integer containing the number of bits with value 1 in the
|
||||
/// source operand.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_popcnt32(int __A)
|
||||
{
|
||||
return __builtin_popcount(__A);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
/// \brief Counts the number of bits in the source operand having a value of 1.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// An unsigned 64-bit integer operand.
|
||||
/// \returns A 64-bit integer containing the number of bits with value 1 in the
|
||||
/// source operand.
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS
|
||||
_mm_popcnt_u64(unsigned long long __A)
|
||||
{
|
||||
return __builtin_popcountll(__A);
|
||||
}
|
||||
|
||||
/// \brief Counts the number of bits in the source operand having a value of 1.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A signed 64-bit integer operand.
|
||||
/// \returns A 64-bit integer containing the number of bits with value 1 in the
|
||||
/// source operand.
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS
|
||||
_popcnt64(long long __A)
|
||||
{
|
||||
return __builtin_popcountll(__A);
|
||||
}
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
|
|
@ -29,6 +29,12 @@
|
|||
#define __PRFCHWINTRIN_H
|
||||
|
||||
#if defined(__PRFCHW__) || defined(__3dNOW__)
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
_m_prefetch(void *__P)
|
||||
{
|
||||
__builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
|
||||
}
|
||||
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__))
|
||||
_m_prefetchw(void *__P)
|
||||
{
|
||||
|
|
|
@ -28,10 +28,8 @@
|
|||
#ifndef __RDSEEDINTRIN_H
|
||||
#define __RDSEEDINTRIN_H
|
||||
|
||||
#ifdef __RDSEED__
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
|
||||
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
_rdseed16_step(unsigned short *__p)
|
||||
|
@ -55,5 +53,4 @@ _rdseed64_step(unsigned long long *__p)
|
|||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __RDSEED__ */
|
||||
#endif /* __RDSEEDINTRIN_H */
|
||||
|
|
|
@ -38,7 +38,7 @@
|
|||
#define _XABORT_CODE(x) (((x) >> 24) & 0xFF)
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
_xbegin(void)
|
||||
|
|
|
@ -28,15 +28,11 @@
|
|||
#ifndef __SHAINTRIN_H
|
||||
#define __SHAINTRIN_H
|
||||
|
||||
#if !defined (__SHA__)
|
||||
# error "SHA instructions not enabled"
|
||||
#endif
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
|
||||
|
||||
#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
|
||||
__builtin_ia32_sha1rnds4((V1), (V2), (M)); })
|
||||
__builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
|
||||
|
|
|
@ -24,14 +24,10 @@
|
|||
#ifndef _SMMINTRIN_H
|
||||
#define _SMMINTRIN_H
|
||||
|
||||
#ifndef __SSE4_1__
|
||||
#error "SSE4.1 instruction set not enabled"
|
||||
#else
|
||||
|
||||
#include <tmmintrin.h>
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
|
||||
|
||||
/* SSE4 Rounding macros. */
|
||||
#define _MM_FROUND_TO_NEAREST_INT 0x00
|
||||
|
@ -61,35 +57,28 @@
|
|||
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
|
||||
|
||||
#define _mm_round_ps(X, M) __extension__ ({ \
|
||||
__m128 __X = (X); \
|
||||
(__m128) __builtin_ia32_roundps((__v4sf)__X, (M)); })
|
||||
(__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
|
||||
|
||||
#define _mm_round_ss(X, Y, M) __extension__ ({ \
|
||||
__m128 __X = (X); \
|
||||
__m128 __Y = (Y); \
|
||||
(__m128) __builtin_ia32_roundss((__v4sf)__X, (__v4sf)__Y, (M)); })
|
||||
(__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
|
||||
(__v4sf)(__m128)(Y), (M)); })
|
||||
|
||||
#define _mm_round_pd(X, M) __extension__ ({ \
|
||||
__m128d __X = (X); \
|
||||
(__m128d) __builtin_ia32_roundpd((__v2df)__X, (M)); })
|
||||
(__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
|
||||
|
||||
#define _mm_round_sd(X, Y, M) __extension__ ({ \
|
||||
__m128d __X = (X); \
|
||||
__m128d __Y = (Y); \
|
||||
(__m128d) __builtin_ia32_roundsd((__v2df)__X, (__v2df)__Y, (M)); })
|
||||
(__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
|
||||
(__v2df)(__m128d)(Y), (M)); })
|
||||
|
||||
/* SSE4 Packed Blending Intrinsics. */
|
||||
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
|
||||
__m128d __V1 = (V1); \
|
||||
__m128d __V2 = (V2); \
|
||||
(__m128d)__builtin_shufflevector((__v2df)__V1, (__v2df)__V2, \
|
||||
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
|
||||
(__v2df)(__m128d)(V2), \
|
||||
(((M) & 0x01) ? 2 : 0), \
|
||||
(((M) & 0x02) ? 3 : 1)); })
|
||||
|
||||
#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
|
||||
__m128 __V1 = (V1); \
|
||||
__m128 __V2 = (V2); \
|
||||
(__m128)__builtin_shufflevector((__v4sf)__V1, (__v4sf)__V2, \
|
||||
(__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
|
||||
(((M) & 0x01) ? 4 : 0), \
|
||||
(((M) & 0x02) ? 5 : 1), \
|
||||
(((M) & 0x04) ? 6 : 2), \
|
||||
|
@ -117,9 +106,8 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
|
|||
}
|
||||
|
||||
#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
|
||||
__m128i __V1 = (V1); \
|
||||
__m128i __V2 = (V2); \
|
||||
(__m128i)__builtin_shufflevector((__v8hi)__V1, (__v8hi)__V2, \
|
||||
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
|
||||
(__v8hi)(__m128i)(V2), \
|
||||
(((M) & 0x01) ? 8 : 0), \
|
||||
(((M) & 0x02) ? 9 : 1), \
|
||||
(((M) & 0x04) ? 10 : 2), \
|
||||
|
@ -133,7 +121,7 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
|
|||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
|
||||
{
|
||||
return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
|
||||
return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
|
@ -144,20 +132,18 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
|
|||
|
||||
/* SSE4 Floating Point Dot Product Instructions. */
|
||||
#define _mm_dp_ps(X, Y, M) __extension__ ({ \
|
||||
__m128 __X = (X); \
|
||||
__m128 __Y = (Y); \
|
||||
(__m128) __builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, (M)); })
|
||||
(__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
|
||||
(__v4sf)(__m128)(Y), (M)); })
|
||||
|
||||
#define _mm_dp_pd(X, Y, M) __extension__ ({\
|
||||
__m128d __X = (X); \
|
||||
__m128d __Y = (Y); \
|
||||
(__m128d) __builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, (M)); })
|
||||
(__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
|
||||
(__v2df)(__m128d)(Y), (M)); })
|
||||
|
||||
/* SSE4 Streaming Load Hint Instruction. */
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_stream_load_si128 (__m128i *__V)
|
||||
_mm_stream_load_si128 (__m128i const *__V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
|
||||
return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
|
||||
}
|
||||
|
||||
/* SSE4 Packed Integer Min/Max Instructions. */
|
||||
|
@ -213,7 +199,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
|
||||
#define _mm_extract_ps(X, N) (__extension__ \
|
||||
({ union { int __i; float __f; } __t; \
|
||||
__v4sf __a = (__v4sf)(X); \
|
||||
__v4sf __a = (__v4sf)(__m128)(X); \
|
||||
__t.__f = __a[(N) & 3]; \
|
||||
__t.__i;}))
|
||||
|
||||
|
@ -221,39 +207,44 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||
/* Extract a single-precision float from X at index N into D. */
|
||||
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
|
||||
(D) = __a[N]; }))
|
||||
|
||||
|
||||
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
|
||||
an index suitable for _mm_insert_ps. */
|
||||
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
|
||||
|
||||
|
||||
/* Extract a float from X at index N into the first index of the return. */
|
||||
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
|
||||
_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
|
||||
|
||||
|
||||
/* Insert int into packed integer array at index. */
|
||||
#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
|
||||
__a[(N) & 15] = (I); \
|
||||
__a;}))
|
||||
#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
|
||||
__a[(N) & 3] = (I); \
|
||||
__a;}))
|
||||
#define _mm_insert_epi8(X, I, N) (__extension__ \
|
||||
({ __v16qi __a = (__v16qi)(__m128i)(X); \
|
||||
__a[(N) & 15] = (I); \
|
||||
(__m128i)__a;}))
|
||||
#define _mm_insert_epi32(X, I, N) (__extension__ \
|
||||
({ __v4si __a = (__v4si)(__m128i)(X); \
|
||||
__a[(N) & 3] = (I); \
|
||||
(__m128i)__a;}))
|
||||
#ifdef __x86_64__
|
||||
#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
|
||||
__a[(N) & 1] = (I); \
|
||||
__a;}))
|
||||
#define _mm_insert_epi64(X, I, N) (__extension__ \
|
||||
({ __v2di __a = (__v2di)(__m128i)(X); \
|
||||
__a[(N) & 1] = (I); \
|
||||
(__m128i)__a;}))
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
/* Extract int from packed integer array at index. This returns the element
|
||||
* as a zero extended value, so it is unsigned.
|
||||
*/
|
||||
#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
|
||||
(int)(unsigned char) \
|
||||
__a[(N) & 15];}))
|
||||
#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
|
||||
__a[(N) & 3];}))
|
||||
#define _mm_extract_epi8(X, N) (__extension__ \
|
||||
({ __v16qi __a = (__v16qi)(__m128i)(X); \
|
||||
(int)(unsigned char) __a[(N) & 15];}))
|
||||
#define _mm_extract_epi32(X, N) (__extension__ \
|
||||
({ __v4si __a = (__v4si)(__m128i)(X); \
|
||||
(int)__a[(N) & 3];}))
|
||||
#ifdef __x86_64__
|
||||
#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
|
||||
__a[(N) & 1];}))
|
||||
#define _mm_extract_epi64(X, N) (__extension__ \
|
||||
({ __v2di __a = (__v2di)(__m128i)(X); \
|
||||
(long long)__a[(N) & 1];}))
|
||||
#endif /* __x86_64 */
|
||||
|
||||
/* SSE4 128-bit Packed Integer Comparisons. */
|
||||
|
@ -290,74 +281,80 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
|
|||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepi8_epi16(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovsxbw128((__v16qi) __V);
|
||||
/* This function always performs a signed extension, but __v16qi is a char
|
||||
which may be signed or unsigned, so use __v16qs. */
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepi8_epi32(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovsxbd128((__v16qi) __V);
|
||||
/* This function always performs a signed extension, but __v16qi is a char
|
||||
which may be signed or unsigned, so use __v16qs. */
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepi8_epi64(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovsxbq128((__v16qi) __V);
|
||||
/* This function always performs a signed extension, but __v16qi is a char
|
||||
which may be signed or unsigned, so use __v16qs. */
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepi16_epi32(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovsxwd128((__v8hi) __V);
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepi16_epi64(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovsxwq128((__v8hi)__V);
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepi32_epi64(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovsxdq128((__v4si)__V);
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
|
||||
}
|
||||
|
||||
/* SSE4 Packed Integer Zero-Extension. */
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepu8_epi16(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepu8_epi32(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepu8_epi64(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepu16_epi32(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepu16_epi64(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cvtepu32_epi64(__m128i __V)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovzxdq128((__v4si)__V);
|
||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
|
||||
}
|
||||
|
||||
/* SSE4 Pack with Unsigned Saturation. */
|
||||
|
@ -369,9 +366,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
|
|||
|
||||
/* SSE4 Multiple Packed Sums of Absolute Difference. */
|
||||
#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
|
||||
__m128i __X = (X); \
|
||||
__m128i __Y = (Y); \
|
||||
(__m128i) __builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, (M)); })
|
||||
(__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
|
||||
(__v16qi)(__m128i)(Y), (M)); })
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_minpos_epu16(__m128i __V)
|
||||
|
@ -379,9 +375,13 @@ _mm_minpos_epu16(__m128i __V)
|
|||
return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
|
||||
}
|
||||
|
||||
/* Handle the sse4.2 definitions here. */
|
||||
|
||||
/* These definitions are normally in nmmintrin.h, but gcc puts them in here
|
||||
so we'll do the same. */
|
||||
#ifdef __SSE4_2__
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
|
||||
|
||||
/* These specify the type of data that we're comparing. */
|
||||
#define _SIDD_UBYTE_OPS 0x00
|
||||
|
@ -410,36 +410,59 @@ _mm_minpos_epu16(__m128i __V)
|
|||
#define _SIDD_UNIT_MASK 0x40
|
||||
|
||||
/* SSE4.2 Packed Comparison Intrinsics. */
|
||||
#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
|
||||
#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))
|
||||
#define _mm_cmpistrm(A, B, M) \
|
||||
(__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), (int)(M))
|
||||
#define _mm_cmpistri(A, B, M) \
|
||||
(int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), (int)(M))
|
||||
|
||||
#define _mm_cmpestrm(A, LA, B, LB, M) \
|
||||
__builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
|
||||
(__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
|
||||
(__v16qi)(__m128i)(B), (int)(LB), \
|
||||
(int)(M))
|
||||
#define _mm_cmpestri(A, LA, B, LB, M) \
|
||||
__builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))
|
||||
|
||||
(int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
|
||||
(__v16qi)(__m128i)(B), (int)(LB), \
|
||||
(int)(M))
|
||||
|
||||
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
|
||||
#define _mm_cmpistra(A, B, M) \
|
||||
__builtin_ia32_pcmpistria128((A), (B), (M))
|
||||
(int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), (int)(M))
|
||||
#define _mm_cmpistrc(A, B, M) \
|
||||
__builtin_ia32_pcmpistric128((A), (B), (M))
|
||||
(int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), (int)(M))
|
||||
#define _mm_cmpistro(A, B, M) \
|
||||
__builtin_ia32_pcmpistrio128((A), (B), (M))
|
||||
(int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), (int)(M))
|
||||
#define _mm_cmpistrs(A, B, M) \
|
||||
__builtin_ia32_pcmpistris128((A), (B), (M))
|
||||
(int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), (int)(M))
|
||||
#define _mm_cmpistrz(A, B, M) \
|
||||
__builtin_ia32_pcmpistriz128((A), (B), (M))
|
||||
(int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), (int)(M))
|
||||
|
||||
#define _mm_cmpestra(A, LA, B, LB, M) \
|
||||
__builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
|
||||
(int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
|
||||
(__v16qi)(__m128i)(B), (int)(LB), \
|
||||
(int)(M))
|
||||
#define _mm_cmpestrc(A, LA, B, LB, M) \
|
||||
__builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
|
||||
(int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
|
||||
(__v16qi)(__m128i)(B), (int)(LB), \
|
||||
(int)(M))
|
||||
#define _mm_cmpestro(A, LA, B, LB, M) \
|
||||
__builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
|
||||
(int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
|
||||
(__v16qi)(__m128i)(B), (int)(LB), \
|
||||
(int)(M))
|
||||
#define _mm_cmpestrs(A, LA, B, LB, M) \
|
||||
__builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
|
||||
(int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
|
||||
(__v16qi)(__m128i)(B), (int)(LB), \
|
||||
(int)(M))
|
||||
#define _mm_cmpestrz(A, LA, B, LB, M) \
|
||||
__builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))
|
||||
(int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
|
||||
(__v16qi)(__m128i)(B), (int)(LB), \
|
||||
(int)(M))
|
||||
|
||||
/* SSE4.2 Compare Packed Data -- Greater Than. */
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
|
@ -481,7 +504,4 @@ _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
|
|||
#include <popcntintrin.h>
|
||||
#endif
|
||||
|
||||
#endif /* __SSE4_2__ */
|
||||
#endif /* __SSE4_1__ */
|
||||
|
||||
#endif /* _SMMINTRIN_H */
|
||||
|
|
|
@ -45,11 +45,11 @@ extern "C" {
|
|||
#define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE
|
||||
#define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE
|
||||
#define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE
|
||||
#define ATOMIC_SHORT_T_LOCK_FREE __GCC_ATOMIC_SHORT_T_LOCK_FREE
|
||||
#define ATOMIC_INT_T_LOCK_FREE __GCC_ATOMIC_INT_T_LOCK_FREE
|
||||
#define ATOMIC_LONG_T_LOCK_FREE __GCC_ATOMIC_LONG_T_LOCK_FREE
|
||||
#define ATOMIC_LLONG_T_LOCK_FREE __GCC_ATOMIC_LLONG_T_LOCK_FREE
|
||||
#define ATOMIC_POINTER_T_LOCK_FREE __GCC_ATOMIC_POINTER_T_LOCK_FREE
|
||||
#define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE
|
||||
#define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE
|
||||
#define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE
|
||||
#define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE
|
||||
#define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE
|
||||
|
||||
/* 7.17.2 Initialization */
|
||||
|
||||
|
|
|
@ -77,14 +77,14 @@
|
|||
* C99 7.18.1.2 Minimum-width integer types.
|
||||
* C99 7.18.1.3 Fastest minimum-width integer types.
|
||||
*
|
||||
* The standard requires that exact-width type be defined for 8-, 16-, 32-, and
|
||||
* The standard requires that exact-width type be defined for 8-, 16-, 32-, and
|
||||
* 64-bit types if they are implemented. Other exact width types are optional.
|
||||
* This implementation defines an exact-width types for every integer width
|
||||
* that is represented in the standard integer types.
|
||||
*
|
||||
* The standard also requires minimum-width types be defined for 8-, 16-, 32-,
|
||||
* and 64-bit widths regardless of whether there are corresponding exact-width
|
||||
* types.
|
||||
* types.
|
||||
*
|
||||
* To accommodate targets that are missing types that are exactly 8, 16, 32, or
|
||||
* 64 bits wide, this implementation takes an approach of cascading
|
||||
|
@ -97,7 +97,7 @@
|
|||
* suboptimal.
|
||||
*
|
||||
* In violation of the standard, some targets do not implement a type that is
|
||||
* wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).
|
||||
* wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).
|
||||
* To accommodate these targets, a required minimum-width type is only
|
||||
* defined if there exists an exact-width type of equal or greater width.
|
||||
*/
|
||||
|
@ -247,7 +247,7 @@ typedef __uint_least8_t uint_fast8_t;
|
|||
#endif /* __int_least8_t */
|
||||
|
||||
/* prevent glibc sys/types.h from defining conflicting types */
|
||||
#ifndef __int8_t_defined
|
||||
#ifndef __int8_t_defined
|
||||
# define __int8_t_defined
|
||||
#endif /* __int8_t_defined */
|
||||
|
||||
|
@ -280,9 +280,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||
*
|
||||
* The standard requires that integer constant macros be defined for all the
|
||||
* minimum-width types defined above. As 8-, 16-, 32-, and 64-bit minimum-width
|
||||
* types are required, the corresponding integer constant macros are defined
|
||||
* types are required, the corresponding integer constant macros are defined
|
||||
* here. This implementation also defines minimum-width types for every other
|
||||
* integer width that the target implements, so corresponding macros are
|
||||
* integer width that the target implements, so corresponding macros are
|
||||
* defined below, too.
|
||||
*
|
||||
* These macros are defined using the same successive-shrinking approach as
|
||||
|
@ -452,7 +452,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||
#endif /* __int_least8_t */
|
||||
|
||||
|
||||
/* C99 7.18.2.1 Limits of exact-width integer types.
|
||||
/* C99 7.18.2.1 Limits of exact-width integer types.
|
||||
* C99 7.18.2.2 Limits of minimum-width integer types.
|
||||
* C99 7.18.2.3 Limits of fastest minimum-width integer types.
|
||||
*
|
||||
|
|
|
@ -21,10 +21,6 @@
|
|||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __TBM__
|
||||
#error "TBM instruction set is not enabled"
|
||||
#endif
|
||||
|
||||
#ifndef __X86INTRIN_H
|
||||
#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
|
||||
#endif
|
||||
|
@ -33,119 +29,123 @@
|
|||
#define __TBMINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
|
||||
|
||||
#define __bextri_u32(a, b) (__builtin_ia32_bextri_u32((a), (b)))
|
||||
#define __bextri_u32(a, b) \
|
||||
((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
|
||||
(unsigned int)(b)))
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__blcfill_u32(unsigned int a)
|
||||
__blcfill_u32(unsigned int __a)
|
||||
{
|
||||
return a & (a + 1);
|
||||
return __a & (__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__blci_u32(unsigned int a)
|
||||
__blci_u32(unsigned int __a)
|
||||
{
|
||||
return a | ~(a + 1);
|
||||
return __a | ~(__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__blcic_u32(unsigned int a)
|
||||
__blcic_u32(unsigned int __a)
|
||||
{
|
||||
return ~a & (a + 1);
|
||||
return ~__a & (__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__blcmsk_u32(unsigned int a)
|
||||
__blcmsk_u32(unsigned int __a)
|
||||
{
|
||||
return a ^ (a + 1);
|
||||
return __a ^ (__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__blcs_u32(unsigned int a)
|
||||
__blcs_u32(unsigned int __a)
|
||||
{
|
||||
return a | (a + 1);
|
||||
return __a | (__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__blsfill_u32(unsigned int a)
|
||||
__blsfill_u32(unsigned int __a)
|
||||
{
|
||||
return a | (a - 1);
|
||||
return __a | (__a - 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__blsic_u32(unsigned int a)
|
||||
__blsic_u32(unsigned int __a)
|
||||
{
|
||||
return ~a | (a - 1);
|
||||
return ~__a | (__a - 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__t1mskc_u32(unsigned int a)
|
||||
__t1mskc_u32(unsigned int __a)
|
||||
{
|
||||
return ~a | (a + 1);
|
||||
return ~__a | (__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
__tzmsk_u32(unsigned int a)
|
||||
__tzmsk_u32(unsigned int __a)
|
||||
{
|
||||
return ~a & (a - 1);
|
||||
return ~__a & (__a - 1);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
#define __bextri_u64(a, b) (__builtin_ia32_bextri_u64((a), (int)(b)))
|
||||
#define __bextri_u64(a, b) \
|
||||
((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \
|
||||
(unsigned long long)(b)))
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blcfill_u64(unsigned long long a)
|
||||
__blcfill_u64(unsigned long long __a)
|
||||
{
|
||||
return a & (a + 1);
|
||||
return __a & (__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blci_u64(unsigned long long a)
|
||||
__blci_u64(unsigned long long __a)
|
||||
{
|
||||
return a | ~(a + 1);
|
||||
return __a | ~(__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blcic_u64(unsigned long long a)
|
||||
__blcic_u64(unsigned long long __a)
|
||||
{
|
||||
return ~a & (a + 1);
|
||||
return ~__a & (__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blcmsk_u64(unsigned long long a)
|
||||
__blcmsk_u64(unsigned long long __a)
|
||||
{
|
||||
return a ^ (a + 1);
|
||||
return __a ^ (__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blcs_u64(unsigned long long a)
|
||||
__blcs_u64(unsigned long long __a)
|
||||
{
|
||||
return a | (a + 1);
|
||||
return __a | (__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blsfill_u64(unsigned long long a)
|
||||
__blsfill_u64(unsigned long long __a)
|
||||
{
|
||||
return a | (a - 1);
|
||||
return __a | (__a - 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blsic_u64(unsigned long long a)
|
||||
__blsic_u64(unsigned long long __a)
|
||||
{
|
||||
return ~a | (a - 1);
|
||||
return ~__a | (__a - 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__t1mskc_u64(unsigned long long a)
|
||||
__t1mskc_u64(unsigned long long __a)
|
||||
{
|
||||
return ~a | (a + 1);
|
||||
return ~__a | (__a + 1);
|
||||
}
|
||||
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__tzmsk_u64(unsigned long long a)
|
||||
__tzmsk_u64(unsigned long long __a)
|
||||
{
|
||||
return ~a & (a - 1);
|
||||
return ~__a & (__a - 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -490,7 +490,7 @@ static double _Complex
|
|||
|
||||
static long double _Complex
|
||||
_TG_ATTRS
|
||||
__tg_pow(long double _Complex __x, long double _Complex __y)
|
||||
__tg_pow(long double _Complex __x, long double _Complex __y)
|
||||
{return cpowl(__x, __y);}
|
||||
|
||||
#undef pow
|
||||
|
|
|
@ -20,203 +20,748 @@
|
|||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __TMMINTRIN_H
|
||||
#define __TMMINTRIN_H
|
||||
|
||||
#ifndef __SSSE3__
|
||||
#error "SSSE3 instruction set not enabled"
|
||||
#else
|
||||
|
||||
#include <pmmintrin.h>
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
|
||||
|
||||
/// \brief Computes the absolute value of each of the packed 8-bit signed
|
||||
/// integers in the source operand and stores the 8-bit unsigned integer
|
||||
/// results in the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PABSB instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [8 x i8].
|
||||
/// \returns A 64-bit integer vector containing the absolute values of the
|
||||
/// elements in the operand.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_abs_pi8(__m64 __a)
|
||||
{
|
||||
return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
|
||||
}
|
||||
|
||||
/// \brief Computes the absolute value of each of the packed 8-bit signed
|
||||
/// integers in the source operand and stores the 8-bit unsigned integer
|
||||
/// results in the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPABSB instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [16 x i8].
|
||||
/// \returns A 128-bit integer vector containing the absolute values of the
|
||||
/// elements in the operand.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_abs_epi8(__m128i __a)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
|
||||
}
|
||||
|
||||
/// \brief Computes the absolute value of each of the packed 16-bit signed
|
||||
/// integers in the source operand and stores the 16-bit unsigned integer
|
||||
/// results in the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PABSW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [4 x i16].
|
||||
/// \returns A 64-bit integer vector containing the absolute values of the
|
||||
/// elements in the operand.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_abs_pi16(__m64 __a)
|
||||
{
|
||||
return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
|
||||
}
|
||||
|
||||
/// \brief Computes the absolute value of each of the packed 16-bit signed
|
||||
/// integers in the source operand and stores the 16-bit unsigned integer
|
||||
/// results in the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPABSW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [8 x i16].
|
||||
/// \returns A 128-bit integer vector containing the absolute values of the
|
||||
/// elements in the operand.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_abs_epi16(__m128i __a)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
|
||||
}
|
||||
|
||||
/// \brief Computes the absolute value of each of the packed 32-bit signed
|
||||
/// integers in the source operand and stores the 32-bit unsigned integer
|
||||
/// results in the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PABSD instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [2 x i32].
|
||||
/// \returns A 64-bit integer vector containing the absolute values of the
|
||||
/// elements in the operand.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_abs_pi32(__m64 __a)
|
||||
{
|
||||
return (__m64)__builtin_ia32_pabsd((__v2si)__a);
|
||||
}
|
||||
|
||||
/// \brief Computes the absolute value of each of the packed 32-bit signed
|
||||
/// integers in the source operand and stores the 32-bit unsigned integer
|
||||
/// results in the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPABSD instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [4 x i32].
|
||||
/// \returns A 128-bit integer vector containing the absolute values of the
|
||||
/// elements in the operand.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_abs_epi32(__m128i __a)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
|
||||
}
|
||||
|
||||
/// \brief Concatenates the two 128-bit integer vector operands, and
|
||||
/// right-shifts the result by the number of bytes specified in the immediate
|
||||
/// operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PALIGNR instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// A 128-bit vector of [16 x i8] containing one of the source operands.
|
||||
/// \param b
|
||||
/// A 128-bit vector of [16 x i8] containing one of the source operands.
|
||||
/// \param n
|
||||
/// An immediate operand specifying how many bytes to right-shift the result.
|
||||
/// \returns A 128-bit integer vector containing the concatenated right-shifted
|
||||
/// value.
|
||||
#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
|
||||
__m128i __a = (a); \
|
||||
__m128i __b = (b); \
|
||||
(__m128i)__builtin_ia32_palignr128((__v16qi)__a, (__v16qi)__b, (n)); })
|
||||
(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
|
||||
(__v16qi)(__m128i)(b), (n)); })
|
||||
|
||||
/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
|
||||
/// the result by the number of bytes specified in the immediate operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PALIGNR instruction.
|
||||
///
|
||||
/// \param a
|
||||
/// A 64-bit vector of [8 x i8] containing one of the source operands.
|
||||
/// \param b
|
||||
/// A 64-bit vector of [8 x i8] containing one of the source operands.
|
||||
/// \param n
|
||||
/// An immediate operand specifying how many bytes to right-shift the result.
|
||||
/// \returns A 64-bit integer vector containing the concatenated right-shifted
|
||||
/// value.
|
||||
#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
|
||||
__m64 __a = (a); \
|
||||
__m64 __b = (b); \
|
||||
(__m64)__builtin_ia32_palignr((__v8qi)__a, (__v8qi)__b, (n)); })
|
||||
(__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
|
||||
|
||||
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
|
||||
/// 128-bit vectors of [8 x i16].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPHADDW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the lower bits of the
|
||||
/// destination.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the upper bits of the
|
||||
/// destination.
|
||||
/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
|
||||
/// both operands.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_hadd_epi16(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
|
||||
/// 128-bit vectors of [4 x i32].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPHADDD instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the lower bits of the
|
||||
/// destination.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the upper bits of the
|
||||
/// destination.
|
||||
/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
|
||||
/// both operands.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_hadd_epi32(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
|
||||
/// 64-bit vectors of [4 x i16].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PHADDW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the lower bits of the
|
||||
/// destination.
|
||||
/// \param __b
|
||||
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the upper bits of the
|
||||
/// destination.
|
||||
/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
|
||||
/// operands.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_hadd_pi16(__m64 __a, __m64 __b)
|
||||
{
|
||||
return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
|
||||
/// 64-bit vectors of [2 x i32].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PHADDD instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the lower bits of the
|
||||
/// destination.
|
||||
/// \param __b
|
||||
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the upper bits of the
|
||||
/// destination.
|
||||
/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
|
||||
/// operands.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_hadd_pi32(__m64 __a, __m64 __b)
|
||||
{
|
||||
return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
|
||||
/// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
|
||||
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPHADDSW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the lower bits of the
|
||||
/// destination.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the upper bits of the
|
||||
/// destination.
|
||||
/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
|
||||
/// sums of both operands.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_hadds_epi16(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
|
||||
/// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
|
||||
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PHADDSW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the lower bits of the
|
||||
/// destination.
|
||||
/// \param __b
|
||||
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
|
||||
/// horizontal sums of the values are stored in the upper bits of the
|
||||
/// destination.
|
||||
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
|
||||
/// sums of both operands.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_hadds_pi16(__m64 __a, __m64 __b)
|
||||
{
|
||||
return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
|
||||
/// packed 128-bit vectors of [8 x i16].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPHSUBW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the lower bits of
|
||||
/// the destination.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the upper bits of
|
||||
/// the destination.
|
||||
/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
|
||||
/// of both operands.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_hsub_epi16(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
|
||||
/// packed 128-bit vectors of [4 x i32].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPHSUBD instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the lower bits of
|
||||
/// the destination.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the upper bits of
|
||||
/// the destination.
|
||||
/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
|
||||
/// of both operands.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_hsub_epi32(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
|
||||
/// packed 64-bit vectors of [4 x i16].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PHSUBW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the lower bits of
|
||||
/// the destination.
|
||||
/// \param __b
|
||||
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the upper bits of
|
||||
/// the destination.
|
||||
/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
|
||||
/// of both operands.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_hsub_pi16(__m64 __a, __m64 __b)
|
||||
{
|
||||
return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
|
||||
/// packed 64-bit vectors of [2 x i32].
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PHSUBD instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the lower bits of
|
||||
/// the destination.
|
||||
/// \param __b
|
||||
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the upper bits of
|
||||
/// the destination.
|
||||
/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
|
||||
/// of both operands.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_hsub_pi32(__m64 __a, __m64 __b)
|
||||
{
|
||||
return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
|
||||
/// packed 128-bit vectors of [8 x i16]. Positive differences greater than
|
||||
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
|
||||
/// saturated to 8000h.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPHSUBSW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the lower bits of
|
||||
/// the destination.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the upper bits of
|
||||
/// the destination.
|
||||
/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
|
||||
/// differences of both operands.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_hsubs_epi16(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
|
||||
}
|
||||
|
||||
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
|
||||
/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
|
||||
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
|
||||
/// saturated to 8000h.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PHSUBSW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the lower bits of
|
||||
/// the destination.
|
||||
/// \param __b
|
||||
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
|
||||
/// horizontal differences between the values are stored in the upper bits of
|
||||
/// the destination.
|
||||
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
|
||||
/// differences of both operands.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_hsubs_pi16(__m64 __a, __m64 __b)
|
||||
{
|
||||
return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
|
||||
}
|
||||
|
||||
/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
|
||||
/// values contained in the first source operand and packed 8-bit signed
|
||||
/// integer values contained in the second source operand, adds pairs of
|
||||
/// contiguous products with signed saturation, and writes the 16-bit sums to
|
||||
/// the corresponding bits in the destination. For example, bits [7:0] of
|
||||
/// both operands are multiplied, bits [15:8] of both operands are
|
||||
/// multiplied, and the sum of both results is written to bits [15:0] of the
|
||||
/// destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit integer vector containing the first source operand.
|
||||
/// \param __b
|
||||
/// A 128-bit integer vector containing the second source operand.
|
||||
/// \returns A 128-bit integer vector containing the sums of products of both
|
||||
/// operands: \n
|
||||
/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
|
||||
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
|
||||
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
|
||||
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
|
||||
/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
|
||||
/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
|
||||
/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
|
||||
/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maddubs_epi16(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
|
||||
}
|
||||
|
||||
/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
|
||||
/// values contained in the first source operand and packed 8-bit signed
|
||||
/// integer values contained in the second source operand, adds pairs of
|
||||
/// contiguous products with signed saturation, and writes the 16-bit sums to
|
||||
/// the corresponding bits in the destination. For example, bits [7:0] of
|
||||
/// both operands are multiplied, bits [15:8] of both operands are
|
||||
/// multiplied, and the sum of both results is written to bits [15:0] of the
|
||||
/// destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PMADDUBSW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit integer vector containing the first source operand.
|
||||
/// \param __b
|
||||
/// A 64-bit integer vector containing the second source operand.
|
||||
/// \returns A 64-bit integer vector containing the sums of products of both
|
||||
/// operands: \n
|
||||
/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
|
||||
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
|
||||
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
|
||||
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_maddubs_pi16(__m64 __a, __m64 __b)
|
||||
{
|
||||
return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
|
||||
}
|
||||
|
||||
/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
|
||||
/// products to the 18 most significant bits by right-shifting, rounds the
|
||||
/// truncated value by adding 1, and writes bits [16:1] to the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPMULHRSW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit vector of [8 x i16] containing one of the source operands.
|
||||
/// \param __b
|
||||
/// A 128-bit vector of [8 x i16] containing one of the source operands.
|
||||
/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
|
||||
/// products of both operands.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_mulhrs_epi16(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
|
||||
}
|
||||
|
||||
/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
|
||||
/// products to the 18 most significant bits by right-shifting, rounds the
|
||||
/// truncated value by adding 1, and writes bits [16:1] to the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PMULHRSW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit vector of [4 x i16] containing one of the source operands.
|
||||
/// \param __b
|
||||
/// A 64-bit vector of [4 x i16] containing one of the source operands.
|
||||
/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
|
||||
/// products of both operands.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_mulhrs_pi16(__m64 __a, __m64 __b)
|
||||
{
|
||||
return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
|
||||
}
|
||||
|
||||
/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
|
||||
/// destination or clears 8-bit values in the destination, as specified by
|
||||
/// the second source operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPSHUFB instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit integer vector containing the values to be copied.
|
||||
/// \param __b
|
||||
/// A 128-bit integer vector containing control bytes corresponding to
|
||||
/// positions in the destination:
|
||||
/// Bit 7: \n
|
||||
/// 1: Clear the corresponding byte in the destination. \n
|
||||
/// 0: Copy the selected source byte to the corresponding byte in the
|
||||
/// destination. \n
|
||||
/// Bits [6:4] Reserved. \n
|
||||
/// Bits [3:0] select the source byte to be copied.
|
||||
/// \returns A 128-bit integer vector containing the copied or cleared values.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_shuffle_epi8(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
|
||||
}
|
||||
|
||||
/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
|
||||
/// destination or clears 8-bit values in the destination, as specified by
|
||||
/// the second source operand.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PSHUFB instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit integer vector containing the values to be copied.
|
||||
/// \param __b
|
||||
/// A 64-bit integer vector containing control bytes corresponding to
|
||||
/// positions in the destination:
|
||||
/// Bit 7: \n
|
||||
/// 1: Clear the corresponding byte in the destination. \n
|
||||
/// 0: Copy the selected source byte to the corresponding byte in the
|
||||
/// destination. \n
|
||||
/// Bits [3:0] select the source byte to be copied.
|
||||
/// \returns A 64-bit integer vector containing the copied or cleared values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_shuffle_pi8(__m64 __a, __m64 __b)
|
||||
{
|
||||
return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
|
||||
}
|
||||
|
||||
/// \brief For each 8-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// byte in the second source is negative, calculate the two's complement of
|
||||
/// the corresponding byte in the first source, and write that value to the
|
||||
/// destination. If the byte in the second source is positive, copy the
|
||||
/// corresponding byte from the first source to the destination. If the byte
|
||||
/// in the second source is zero, clear the corresponding byte in the
|
||||
/// destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPSIGNB instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit integer vector containing the values to be copied.
|
||||
/// \param __b
|
||||
/// A 128-bit integer vector containing control bytes corresponding to
|
||||
/// positions in the destination.
|
||||
/// \returns A 128-bit integer vector containing the resultant values.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_sign_epi8(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
|
||||
}
|
||||
|
||||
/// \brief For each 16-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// word in the second source is negative, calculate the two's complement of
|
||||
/// the corresponding word in the first source, and write that value to the
|
||||
/// destination. If the word in the second source is positive, copy the
|
||||
/// corresponding word from the first source to the destination. If the word
|
||||
/// in the second source is zero, clear the corresponding word in the
|
||||
/// destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPSIGNW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit integer vector containing the values to be copied.
|
||||
/// \param __b
|
||||
/// A 128-bit integer vector containing control words corresponding to
|
||||
/// positions in the destination.
|
||||
/// \returns A 128-bit integer vector containing the resultant values.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_sign_epi16(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
|
||||
}
|
||||
|
||||
/// \brief For each 32-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// doubleword in the second source is negative, calculate the two's
|
||||
/// complement of the corresponding word in the first source, and write that
|
||||
/// value to the destination. If the doubleword in the second source is
|
||||
/// positive, copy the corresponding word from the first source to the
|
||||
/// destination. If the doubleword in the second source is zero, clear the
|
||||
/// corresponding word in the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPSIGND instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 128-bit integer vector containing the values to be copied.
|
||||
/// \param __b
|
||||
/// A 128-bit integer vector containing control doublewords corresponding to
|
||||
/// positions in the destination.
|
||||
/// \returns A 128-bit integer vector containing the resultant values.
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_sign_epi32(__m128i __a, __m128i __b)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
|
||||
}
|
||||
|
||||
/// \brief For each 8-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// byte in the second source is negative, calculate the two's complement of
|
||||
/// the corresponding byte in the first source, and write that value to the
|
||||
/// destination. If the byte in the second source is positive, copy the
|
||||
/// corresponding byte from the first source to the destination. If the byte
|
||||
/// in the second source is zero, clear the corresponding byte in the
|
||||
/// destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PSIGNB instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit integer vector containing the values to be copied.
|
||||
/// \param __b
|
||||
/// A 64-bit integer vector containing control bytes corresponding to
|
||||
/// positions in the destination.
|
||||
/// \returns A 64-bit integer vector containing the resultant values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_sign_pi8(__m64 __a, __m64 __b)
|
||||
{
|
||||
return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
|
||||
}
|
||||
|
||||
/// \brief For each 16-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// word in the second source is negative, calculate the two's complement of
|
||||
/// the corresponding word in the first source, and write that value to the
|
||||
/// destination. If the word in the second source is positive, copy the
|
||||
/// corresponding word from the first source to the destination. If the word
|
||||
/// in the second source is zero, clear the corresponding word in the
|
||||
/// destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PSIGNW instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit integer vector containing the values to be copied.
|
||||
/// \param __b
|
||||
/// A 64-bit integer vector containing control words corresponding to
|
||||
/// positions in the destination.
|
||||
/// \returns A 64-bit integer vector containing the resultant values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_sign_pi16(__m64 __a, __m64 __b)
|
||||
{
|
||||
return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
|
||||
}
|
||||
|
||||
/// \brief For each 32-bit integer in the first source operand, perform one of
|
||||
/// the following actions as specified by the second source operand: If the
|
||||
/// doubleword in the second source is negative, calculate the two's
|
||||
/// complement of the corresponding doubleword in the first source, and
|
||||
/// write that value to the destination. If the doubleword in the second
|
||||
/// source is positive, copy the corresponding doubleword from the first
|
||||
/// source to the destination. If the doubleword in the second source is
|
||||
/// zero, clear the corresponding doubleword in the destination.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c PSIGND instruction.
|
||||
///
|
||||
/// \param __a
|
||||
/// A 64-bit integer vector containing the values to be copied.
|
||||
/// \param __b
|
||||
/// A 64-bit integer vector containing two control doublewords corresponding
|
||||
/// to positions in the destination.
|
||||
/// \returns A 64-bit integer vector containing the resultant values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
_mm_sign_pi32(__m64 __a, __m64 __b)
|
||||
{
|
||||
|
@ -225,6 +770,4 @@ _mm_sign_pi32(__m64 __a, __m64 __b)
|
|||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __SSSE3__ */
|
||||
|
||||
#endif /* __TMMINTRIN_H */
|
||||
|
|
|
@ -79,6 +79,10 @@ struct _Unwind_Context;
|
|||
struct _Unwind_Exception;
|
||||
typedef enum {
|
||||
_URC_NO_REASON = 0,
|
||||
#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
|
||||
!defined(__ARM_DWARF_EH__)
|
||||
_URC_OK = 0, /* used by ARM EHABI */
|
||||
#endif
|
||||
_URC_FOREIGN_EXCEPTION_CAUGHT = 1,
|
||||
|
||||
_URC_FATAL_PHASE2_ERROR = 2,
|
||||
|
@ -88,7 +92,11 @@ typedef enum {
|
|||
_URC_END_OF_STACK = 5,
|
||||
_URC_HANDLER_FOUND = 6,
|
||||
_URC_INSTALL_CONTEXT = 7,
|
||||
_URC_CONTINUE_UNWIND = 8
|
||||
_URC_CONTINUE_UNWIND = 8,
|
||||
#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
|
||||
!defined(__ARM_DWARF_EH__)
|
||||
_URC_FAILURE = 9 /* used by ARM EHABI */
|
||||
#endif
|
||||
} _Unwind_Reason_Code;
|
||||
|
||||
typedef enum {
|
||||
|
@ -150,6 +158,15 @@ typedef enum {
|
|||
_UVRSR_FAILED = 2
|
||||
} _Unwind_VRS_Result;
|
||||
|
||||
#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__ARM_DWARF_EH__)
|
||||
typedef uint32_t _Unwind_State;
|
||||
#define _US_VIRTUAL_UNWIND_FRAME ((_Unwind_State)0)
|
||||
#define _US_UNWIND_FRAME_STARTING ((_Unwind_State)1)
|
||||
#define _US_UNWIND_FRAME_RESUME ((_Unwind_State)2)
|
||||
#define _US_ACTION_MASK ((_Unwind_State)3)
|
||||
#define _US_FORCE_UNWIND ((_Unwind_State)8)
|
||||
#endif
|
||||
|
||||
_Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
|
||||
_Unwind_VRS_RegClass __regclass,
|
||||
uint32_t __regno,
|
||||
|
|
|
@ -26,17 +26,8 @@
|
|||
|
||||
#include <emmintrin.h>
|
||||
|
||||
#if !defined (__AES__) && !defined (__PCLMUL__)
|
||||
# error "AES/PCLMUL instructions not enabled"
|
||||
#else
|
||||
|
||||
#ifdef __AES__
|
||||
#include <__wmmintrin_aes.h>
|
||||
#endif /* __AES__ */
|
||||
|
||||
#ifdef __PCLMUL__
|
||||
#include <__wmmintrin_pclmul.h>
|
||||
#endif /* __PCLMUL__ */
|
||||
|
||||
#endif /* __AES__ || __PCLMUL__ */
|
||||
#endif /* _WMMINTRIN_H */
|
||||
|
|
|
@ -28,54 +28,58 @@
|
|||
|
||||
#include <immintrin.h>
|
||||
|
||||
#ifdef __3dNOW__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__3dNOW__)
|
||||
#include <mm3dnow.h>
|
||||
#endif
|
||||
|
||||
#ifdef __BMI__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
|
||||
#include <bmiintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __BMI2__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
|
||||
#include <bmi2intrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __LZCNT__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
|
||||
#include <lzcntintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __POPCNT__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
|
||||
#include <popcntintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __RDSEED__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
|
||||
#include <rdseedintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __PRFCHW__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PRFCHW__)
|
||||
#include <prfchwintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __SSE4A__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE4A__)
|
||||
#include <ammintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __FMA4__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA4__)
|
||||
#include <fma4intrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __XOP__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XOP__)
|
||||
#include <xopintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __TBM__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__TBM__)
|
||||
#include <tbmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __F16C__
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
|
||||
#include <f16cintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MWAITX__)
|
||||
#include <mwaitxintrin.h>
|
||||
#endif
|
||||
|
||||
/* FIXME: LWP */
|
||||
|
||||
#endif /* __X86INTRIN_H */
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -28,14 +28,10 @@
|
|||
#ifndef __XOPINTRIN_H
|
||||
#define __XOPINTRIN_H
|
||||
|
||||
#ifndef __XOP__
|
||||
# error "XOP instruction set is not enabled"
|
||||
#else
|
||||
|
||||
#include <fma4intrin.h>
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop")))
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
|
||||
|
@ -202,13 +198,13 @@ _mm_hsubq_epi32(__m128i __A)
|
|||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C);
|
||||
return (__m128i)__builtin_ia32_vpcmov((__v2di)__A, (__v2di)__B, (__v2di)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpcmov_256(__A, __B, __C);
|
||||
return (__m256i)__builtin_ia32_vpcmov_256((__v4di)__A, (__v4di)__B, (__v4di)__C);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
|
@ -242,20 +238,16 @@ _mm_rot_epi64(__m128i __A, __m128i __B)
|
|||
}
|
||||
|
||||
#define _mm_roti_epi8(A, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
(__m128i)__builtin_ia32_vprotbi((__v16qi)__A, (N)); })
|
||||
(__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); })
|
||||
|
||||
#define _mm_roti_epi16(A, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
(__m128i)__builtin_ia32_vprotwi((__v8hi)__A, (N)); })
|
||||
(__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); })
|
||||
|
||||
#define _mm_roti_epi32(A, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
(__m128i)__builtin_ia32_vprotdi((__v4si)__A, (N)); })
|
||||
(__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); })
|
||||
|
||||
#define _mm_roti_epi64(A, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
(__m128i)__builtin_ia32_vprotqi((__v2di)__A, (N)); })
|
||||
(__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); })
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||
_mm_shl_epi8(__m128i __A, __m128i __B)
|
||||
|
@ -306,44 +298,36 @@ _mm_sha_epi64(__m128i __A, __m128i __B)
|
|||
}
|
||||
|
||||
#define _mm_com_epu8(A, B, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
__m128i __B = (B); \
|
||||
(__m128i)__builtin_ia32_vpcomub((__v16qi)__A, (__v16qi)__B, (N)); })
|
||||
(__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), (N)); })
|
||||
|
||||
#define _mm_com_epu16(A, B, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
__m128i __B = (B); \
|
||||
(__m128i)__builtin_ia32_vpcomuw((__v8hi)__A, (__v8hi)__B, (N)); })
|
||||
(__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
|
||||
(__v8hi)(__m128i)(B), (N)); })
|
||||
|
||||
#define _mm_com_epu32(A, B, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
__m128i __B = (B); \
|
||||
(__m128i)__builtin_ia32_vpcomud((__v4si)__A, (__v4si)__B, (N)); })
|
||||
(__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
|
||||
(__v4si)(__m128i)(B), (N)); })
|
||||
|
||||
#define _mm_com_epu64(A, B, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
__m128i __B = (B); \
|
||||
(__m128i)__builtin_ia32_vpcomuq((__v2di)__A, (__v2di)__B, (N)); })
|
||||
(__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
|
||||
(__v2di)(__m128i)(B), (N)); })
|
||||
|
||||
#define _mm_com_epi8(A, B, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
__m128i __B = (B); \
|
||||
(__m128i)__builtin_ia32_vpcomb((__v16qi)__A, (__v16qi)__B, (N)); })
|
||||
(__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
|
||||
(__v16qi)(__m128i)(B), (N)); })
|
||||
|
||||
#define _mm_com_epi16(A, B, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
__m128i __B = (B); \
|
||||
(__m128i)__builtin_ia32_vpcomw((__v8hi)__A, (__v8hi)__B, (N)); })
|
||||
(__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
|
||||
(__v8hi)(__m128i)(B), (N)); })
|
||||
|
||||
#define _mm_com_epi32(A, B, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
__m128i __B = (B); \
|
||||
(__m128i)__builtin_ia32_vpcomd((__v4si)__A, (__v4si)__B, (N)); })
|
||||
(__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
|
||||
(__v4si)(__m128i)(B), (N)); })
|
||||
|
||||
#define _mm_com_epi64(A, B, N) __extension__ ({ \
|
||||
__m128i __A = (A); \
|
||||
__m128i __B = (B); \
|
||||
(__m128i)__builtin_ia32_vpcomq((__v2di)__A, (__v2di)__B, (N)); })
|
||||
(__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
|
||||
(__v2di)(__m128i)(B), (N)); })
|
||||
|
||||
#define _MM_PCOMCTRL_LT 0
|
||||
#define _MM_PCOMCTRL_LE 1
|
||||
|
@ -739,32 +723,23 @@ _mm_comtrue_epi64(__m128i __A, __m128i __B)
|
|||
}
|
||||
|
||||
#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
|
||||
__m128d __X = (X); \
|
||||
__m128d __Y = (Y); \
|
||||
__m128i __C = (C); \
|
||||
(__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, \
|
||||
(__v2di)__C, (I)); })
|
||||
(__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
|
||||
(__v2df)(__m128d)(Y), \
|
||||
(__v2di)(__m128i)(C), (I)); })
|
||||
|
||||
#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
|
||||
__m256d __X = (X); \
|
||||
__m256d __Y = (Y); \
|
||||
__m256i __C = (C); \
|
||||
(__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, \
|
||||
(__v4di)__C, (I)); })
|
||||
(__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
|
||||
(__v4df)(__m256d)(Y), \
|
||||
(__v4di)(__m256i)(C), (I)); })
|
||||
|
||||
#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
|
||||
__m128 __X = (X); \
|
||||
__m128 __Y = (Y); \
|
||||
__m128i __C = (C); \
|
||||
(__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, \
|
||||
(__v4si)__C, (I)); })
|
||||
(__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
|
||||
(__v4si)(__m128i)(C), (I)); })
|
||||
|
||||
#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
|
||||
__m256 __X = (X); \
|
||||
__m256 __Y = (Y); \
|
||||
__m256i __C = (C); \
|
||||
(__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, \
|
||||
(__v8si)__C, (I)); })
|
||||
(__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
|
||||
(__v8sf)(__m256)(Y), \
|
||||
(__v8si)(__m256i)(C), (I)); })
|
||||
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
||||
_mm_frcz_ss(__m128 __A)
|
||||
|
@ -804,6 +779,4 @@ _mm256_frcz_pd(__m256d __A)
|
|||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __XOP__ */
|
||||
|
||||
#endif /* __XOPINTRIN_H */
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
/*===---- xsavecintrin.h - XSAVEC intrinsic ------------------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __XSAVECINTRIN_H
|
||||
#define __XSAVECINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsavec")))
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xsavec(void *__p, unsigned long long __m) {
|
||||
__builtin_ia32_xsavec(__p, __m);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xsavec64(void *__p, unsigned long long __m) {
|
||||
__builtin_ia32_xsavec64(__p, __m);
|
||||
}
|
||||
#endif
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -0,0 +1,58 @@
|
|||
/*===---- xsaveintrin.h - XSAVE intrinsic ------------------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __XSAVEINTRIN_H
|
||||
#define __XSAVEINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsave")))
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xsave(void *__p, unsigned long long __m) {
|
||||
return __builtin_ia32_xsave(__p, __m);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xrstor(void *__p, unsigned long long __m) {
|
||||
return __builtin_ia32_xrstor(__p, __m);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xsave64(void *__p, unsigned long long __m) {
|
||||
return __builtin_ia32_xsave64(__p, __m);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xrstor64(void *__p, unsigned long long __m) {
|
||||
return __builtin_ia32_xrstor64(__p, __m);
|
||||
}
|
||||
#endif
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -0,0 +1,48 @@
|
|||
/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ------------------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __XSAVEOPTINTRIN_H
|
||||
#define __XSAVEOPTINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaveopt")))
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xsaveopt(void *__p, unsigned long long __m) {
|
||||
return __builtin_ia32_xsaveopt(__p, __m);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xsaveopt64(void *__p, unsigned long long __m) {
|
||||
return __builtin_ia32_xsaveopt64(__p, __m);
|
||||
}
|
||||
#endif
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
|
@ -0,0 +1,58 @@
|
|||
/*===---- xsavesintrin.h - XSAVES intrinsic ------------------------------------===
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __XSAVESINTRIN_H
|
||||
#define __XSAVESINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaves")))
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xsaves(void *__p, unsigned long long __m) {
|
||||
__builtin_ia32_xsaves(__p, __m);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xrstors(void *__p, unsigned long long __m) {
|
||||
__builtin_ia32_xrstors(__p, __m);
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xrstors64(void *__p, unsigned long long __m) {
|
||||
__builtin_ia32_xrstors64(__p, __m);
|
||||
}
|
||||
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_xsaves64(void *__p, unsigned long long __m) {
|
||||
__builtin_ia32_xsaves64(__p, __m);
|
||||
}
|
||||
#endif
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue