Avoid duplicate definitions of the same wrapper functions
parent
3bdbbe8aa8
commit
ad3095a03d
|
@ -70,6 +70,47 @@ struct PhaseShifterT {
|
|||
|
||||
void process(al::span<float> dst, const float *RESTRICT src) const;
|
||||
void processAccum(al::span<float> dst, const float *RESTRICT src) const;
|
||||
|
||||
private:
|
||||
#if defined(HAVE_NEON)
|
||||
/* De-interleaving ("stipple") shuffles, named after the equivalent SSE
 * _MM_SHUFFLE patterns. NEON's unzip operation (vuzpq_f32) yields both
 * results directly: val[0] holds the even-indexed lanes of a followed by the
 * even-indexed lanes of b, and val[1] holds the odd-indexed lanes of each.
 */

/* Returns {a[0], a[2], b[0], b[2]}. */
static auto shuffle_2020(float32x4_t a, float32x4_t b)
{
    const float32x4x2_t unzipped{vuzpq_f32(a, b)};
    return unzipped.val[0];
}

/* Returns {a[1], a[3], b[1], b[3]}. */
static auto shuffle_3131(float32x4_t a, float32x4_t b)
{
    const float32x4x2_t unzipped{vuzpq_f32(a, b)};
    return unzipped.val[1];
}
|
||||
/* Interleaves the low halves of the two inputs, returning
 * {a[0], b[0], a[1], b[1]}.
 */
static auto unpacklo(float32x4_t a, float32x4_t b)
{
    const float32x2_t a_low{vget_low_f32(a)};
    const float32x2_t b_low{vget_low_f32(b)};
    /* vzip_f32 pairs up matching lanes: val[0] = {a[0], b[0]} and
     * val[1] = {a[1], b[1]}.
     */
    const float32x2x2_t zipped{vzip_f32(a_low, b_low)};
    return vcombine_f32(zipped.val[0], zipped.val[1]);
}
|
||||
/* Interleaves the high halves of the two inputs, returning
 * {a[2], b[2], a[3], b[3]}.
 */
static auto unpackhi(float32x4_t a, float32x4_t b)
{
    const float32x2_t a_high{vget_high_f32(a)};
    const float32x2_t b_high{vget_high_f32(b)};
    /* vzip_f32 pairs up matching lanes: val[0] = {a[2], b[2]} and
     * val[1] = {a[3], b[3]}.
     */
    const float32x2x2_t zipped{vzip_f32(a_high, b_high)};
    return vcombine_f32(zipped.val[0], zipped.val[1]);
}
|
||||
/* Builds the vector {a, b, c, d} from four scalar values. */
static auto load4(float32_t a, float32_t b, float32_t c, float32_t d)
{
    /* Start with a in every lane, then overwrite lanes 1 through 3 in turn. */
    const float32x4_t with_a{vmovq_n_f32(a)};
    const float32x4_t with_ab{vsetq_lane_f32(b, with_a, 1)};
    const float32x4_t with_abc{vsetq_lane_f32(c, with_ab, 2)};
    return vsetq_lane_f32(d, with_abc, 3);
}
|
||||
#endif
|
||||
};
|
||||
|
||||
template<size_t S>
|
||||
|
@ -123,35 +164,6 @@ inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT
|
|||
size_t pos{0};
|
||||
if(size_t todo{dst.size()>>1})
|
||||
{
|
||||
/* There doesn't seem to be NEON intrinsics to do this kind of stipple
|
||||
* shuffling, so there's two custom methods for it.
|
||||
*/
|
||||
auto shuffle_2020 = [](float32x4_t a, float32x4_t b)
|
||||
{
|
||||
float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))};
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1);
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2);
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3);
|
||||
return ret;
|
||||
};
|
||||
auto shuffle_3131 = [](float32x4_t a, float32x4_t b)
|
||||
{
|
||||
float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))};
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1);
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2);
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3);
|
||||
return ret;
|
||||
};
|
||||
auto unpacklo = [](float32x4_t a, float32x4_t b)
|
||||
{
|
||||
float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))};
|
||||
return vcombine_f32(result.val[0], result.val[1]);
|
||||
};
|
||||
auto unpackhi = [](float32x4_t a, float32x4_t b)
|
||||
{
|
||||
float32x2x2_t result{vzip_f32(vget_high_f32(a), vget_high_f32(b))};
|
||||
return vcombine_f32(result.val[0], result.val[1]);
|
||||
};
|
||||
do {
|
||||
float32x4_t r04{vdupq_n_f32(0.0f)};
|
||||
float32x4_t r14{vdupq_n_f32(0.0f)};
|
||||
|
@ -175,14 +187,6 @@ inline void PhaseShifterT<S>::process(al::span<float> dst, const float *RESTRICT
|
|||
}
|
||||
if((dst.size()&1))
|
||||
{
|
||||
auto load4 = [](float32_t a, float32_t b, float32_t c, float32_t d)
|
||||
{
|
||||
float32x4_t ret{vmovq_n_f32(a)};
|
||||
ret = vsetq_lane_f32(b, ret, 1);
|
||||
ret = vsetq_lane_f32(c, ret, 2);
|
||||
ret = vsetq_lane_f32(d, ret, 3);
|
||||
return ret;
|
||||
};
|
||||
float32x4_t r4{vdupq_n_f32(0.0f)};
|
||||
for(size_t j{0};j < mCoeffs.size();j+=4)
|
||||
{
|
||||
|
@ -245,9 +249,6 @@ inline void PhaseShifterT<S>::processAccum(al::span<float> dst, const float *RES
|
|||
for(size_t j{0};j < mCoeffs.size();j+=4)
|
||||
{
|
||||
const __m128 coeffs{_mm_load_ps(&mCoeffs[j])};
|
||||
/* NOTE: This could alternatively be done with two unaligned loads
|
||||
* and a shuffle. Which would be better?
|
||||
*/
|
||||
const __m128 s{_mm_setr_ps(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])};
|
||||
r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs));
|
||||
}
|
||||
|
@ -262,32 +263,6 @@ inline void PhaseShifterT<S>::processAccum(al::span<float> dst, const float *RES
|
|||
size_t pos{0};
|
||||
if(size_t todo{dst.size()>>1})
|
||||
{
|
||||
auto shuffle_2020 = [](float32x4_t a, float32x4_t b)
|
||||
{
|
||||
float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))};
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1);
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2);
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3);
|
||||
return ret;
|
||||
};
|
||||
auto shuffle_3131 = [](float32x4_t a, float32x4_t b)
|
||||
{
|
||||
float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))};
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1);
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2);
|
||||
ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3);
|
||||
return ret;
|
||||
};
|
||||
auto unpacklo = [](float32x4_t a, float32x4_t b)
|
||||
{
|
||||
float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))};
|
||||
return vcombine_f32(result.val[0], result.val[1]);
|
||||
};
|
||||
auto unpackhi = [](float32x4_t a, float32x4_t b)
|
||||
{
|
||||
float32x2x2_t result{vzip_f32(vget_high_f32(a), vget_high_f32(b))};
|
||||
return vcombine_f32(result.val[0], result.val[1]);
|
||||
};
|
||||
do {
|
||||
float32x4_t r04{vdupq_n_f32(0.0f)};
|
||||
float32x4_t r14{vdupq_n_f32(0.0f)};
|
||||
|
@ -311,14 +286,6 @@ inline void PhaseShifterT<S>::processAccum(al::span<float> dst, const float *RES
|
|||
}
|
||||
if((dst.size()&1))
|
||||
{
|
||||
auto load4 = [](float32_t a, float32_t b, float32_t c, float32_t d)
|
||||
{
|
||||
float32x4_t ret{vmovq_n_f32(a)};
|
||||
ret = vsetq_lane_f32(b, ret, 1);
|
||||
ret = vsetq_lane_f32(c, ret, 2);
|
||||
ret = vsetq_lane_f32(d, ret, 3);
|
||||
return ret;
|
||||
};
|
||||
float32x4_t r4{vdupq_n_f32(0.0f)};
|
||||
for(size_t j{0};j < mCoeffs.size();j+=4)
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue