/*===---- pmmintrin.h - Implementation of SSE3 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA
   is a good match for most SIMD operations.  However, the horizontal
   add/sub operations require the data pairs to be permuted into
   separate registers with vertical even/odd alignment before the
   operation, and the addsub operations require that the sign of only
   the even-numbered elements be flipped (XORed with -0.0).
   For larger blocks of code using these intrinsic implementations,
   the compiler should be able to schedule instructions to avoid
   additional latency.

   In the specific case of the monitor and mwait instructions, there is
   no direct equivalent in the PowerISA at this time, so those
   intrinsics are not implemented.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif

#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)

/* We need definitions from the SSE2 and SSE header files.  */
#include <emmintrin.h>

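/* Alternating subtract/add: even-indexed elements of the result are
   __X[i] - __Y[i], odd-indexed elements are __X[i] + __Y[i].  Flipping
   the sign of the even-numbered elements of __Y (XOR with -0.0) turns
   the whole operation into a single vector add.  */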
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_ps (__m128 __X, __m128 __Y)
{
  const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
  __v4sf even_neg_Y = vec_xor(__Y, even_n0);
  return (__m128) vec_add (__X, even_neg_Y);
}

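/* Alternating subtract/add for doubles: {__X[0] - __Y[0], __X[1] + __Y[1]}.  */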
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_pd (__m128d __X, __m128d __Y)
{
  const __v2df even_n0 = {-0.0, 0.0};
  __v2df even_neg_Y = vec_xor(__Y, even_n0);
  return (__m128d) vec_add (__X, even_neg_Y);
}

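/* Horizontal add: {__X[0]+__X[1], __X[2]+__X[3], __Y[0]+__Y[1], __Y[2]+__Y[3]}.
   The two permutes gather the even-numbered and odd-numbered elements of
   each adjacent pair into separate vectors so one vector add forms all
   four sums.  */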
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
      0x00, 0x01, 0x02, 0x03,
      0x08, 0x09, 0x0A, 0x0B,
      0x10, 0x11, 0x12, 0x13,
      0x18, 0x19, 0x1A, 0x1B
    };
  __vector unsigned char xform1 = {
      0x04, 0x05, 0x06, 0x07,
      0x0C, 0x0D, 0x0E, 0x0F,
      0x14, 0x15, 0x16, 0x17,
      0x1C, 0x1D, 0x1E, 0x1F
    };
  return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

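/* Horizontal subtract: {__X[0]-__X[1], __X[2]-__X[3], __Y[0]-__Y[1], __Y[2]-__Y[3]},
   using the same even/odd permutes as _mm_hadd_ps.  */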
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
      0x00, 0x01, 0x02, 0x03,
      0x08, 0x09, 0x0A, 0x0B,
      0x10, 0x11, 0x12, 0x13,
      0x18, 0x19, 0x1A, 0x1B
    };
  __vector unsigned char xform1 = {
      0x04, 0x05, 0x06, 0x07,
      0x0C, 0x0D, 0x0E, 0x0F,
      0x14, 0x15, 0x16, 0x17,
      0x1C, 0x1D, 0x1E, 0x1F
    };
  return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

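/* Horizontal add for doubles: {__X[0]+__X[1], __Y[0]+__Y[1]}.  vec_mergeh
   pairs the element-0 values and vec_mergel the element-1 values of the
   two operands.  */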
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y),
                            vec_mergel ((__v2df) __X, (__v2df)__Y));
}

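/* Horizontal subtract for doubles: {__X[0]-__X[1], __Y[0]-__Y[1]}.  */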
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y),
                            vec_mergel ((__v2df) __X, (__v2df)__Y));
}

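/* Duplicate the odd-indexed elements: {__X[1], __X[1], __X[3], __X[3]}.  */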
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
  return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
}

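/* Duplicate the even-indexed elements: {__X[0], __X[0], __X[2], __X[2]}.  */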
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
  return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
}

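/* Load a double from *__P and splat it into both elements of the result.  */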
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)
{
  return (__m128d) vec_splats (*__P);
}

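/* Duplicate the low element: {__X[0], __X[0]}.  */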
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movedup_pd (__m128d __X)
{
  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
}

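/* Unaligned 128-bit integer load; the VSX vector load used here has no
   alignment requirement.  */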
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lddqu_si128 (__m128i const *__P)
{
  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}

/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_mwait.  */

#else
#include_next <pmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* PMMINTRIN_H_ */