From 28086f6cb7f0e5c35a97ba60c8e28969981a8817 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Fri, 14 Sep 2012 07:01:58 -0700 Subject: [PATCH] Implement an SSE cubic resampler --- Alc/ALu.c | 6 +++- Alc/mixer_defs.h | 1 + Alc/mixer_sse.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 1 deletion(-) diff --git a/Alc/ALu.c b/Alc/ALu.c index 3ab723dd..b94216ff 100644 --- a/Alc/ALu.c +++ b/Alc/ALu.c @@ -66,13 +66,17 @@ static ResamplerFunc SelectResampler(enum Resampler Resampler, ALuint increment) #endif return Resample_lerp32_C; case CubicResampler: +#ifdef HAVE_SSE + if((CPUCapFlags&CPU_CAP_SSE)) + return Resample_cubic32_SSE; +#endif return Resample_cubic32_C; case ResamplerMax: /* Shouldn't happen */ break; } - return NULL; + return Resample_point32_C; } diff --git a/Alc/mixer_defs.h b/Alc/mixer_defs.h index 91ae24d6..f9dbadbb 100644 --- a/Alc/mixer_defs.h +++ b/Alc/mixer_defs.h @@ -17,6 +17,7 @@ void Resample_cubic32_C(const ALfloat *src, ALuint frac, ALuint increment, ALuin /* SSE resamplers */ void Resample_lerp32_SSE(const ALfloat *src, ALuint frac, ALuint increment, ALuint NumChannels, ALfloat *RESTRICT dst, ALuint dstlen); +void Resample_cubic32_SSE(const ALfloat *src, ALuint frac, ALuint increment, ALuint NumChannels, ALfloat *RESTRICT dst, ALuint dstlen); /* C mixers */ diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c index aff0152b..84884de3 100644 --- a/Alc/mixer_sse.c +++ b/Alc/mixer_sse.c @@ -58,6 +58,92 @@ void Resample_lerp32_SSE(const ALfloat *data, ALuint frac, } } +void Resample_cubic32_SSE(const ALfloat *data, ALuint frac, + ALuint increment, ALuint NumChannels, ALfloat *RESTRICT OutBuffer, + ALuint BufferSize) +{ + /* Cubic interpolation mainly consists of a matrix4 * vector4 operation, + * followed by scalars being applied to the resulting elements before all + * four are added together for the final sample. */ + static const __m128 matrix[4] = { + { -0.5, 1.0f, -0.5f, 0.0f }, + { 1.5, -2.5f, 0.0f, 1.0f }, + { -1.5, 2.0f, 0.5f, 0.0f }, + { 0.5, -0.5f, 0.0f, 0.0f }, + }; + ALIGN(16) float value[4]; + ALuint pos = 0; + ALuint i, j; + + for(i = 0;i < BufferSize+1-3;i+=4) + { + __m128 result, final[4]; + + for(j = 0;j < 4;j++) + { + __m128 val4, s; + ALfloat mu; + + val4 = _mm_set_ps(data[(pos-1)*NumChannels], + data[(pos )*NumChannels], + data[(pos+1)*NumChannels], + data[(pos+2)*NumChannels]); + mu = frac * (1.0f/FRACTIONONE); + s = _mm_set_ps(1.0f, mu, mu*mu, mu*mu*mu); + + /* result = matrix * val4 */ + result = _mm_mul_ps(val4, matrix[0]) ; + result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[1])); + result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[2])); + result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[3])); + + /* final[j] = result * { mu^0, mu^1, mu^2, mu^3 } */ + final[j] = _mm_mul_ps(result, s); + + frac += increment; + pos += frac>>FRACTIONBITS; + frac &= FRACTIONMASK; + } + /* Transpose the final "matrix" so adding the rows will give the four + * samples. TODO: Is this faster than doing.. + * _mm_store_ps(value, result); + * OutBuffer[i] = value[0] + value[1] + value[2] + value[3]; + * ..for each sample? + */ + _MM_TRANSPOSE4_PS(final[0], final[1], final[2], final[3]); + result = _mm_add_ps(_mm_add_ps(final[0], final[1]), + _mm_add_ps(final[2], final[3])); + + _mm_store_ps(&OutBuffer[i], result); + } + for(;i < BufferSize+1;i++) + { + __m128 val4, s, result; + ALfloat mu; + + val4 = _mm_set_ps(data[(pos-1)*NumChannels], + data[(pos )*NumChannels], + data[(pos+1)*NumChannels], + data[(pos+2)*NumChannels]); + mu = frac * (1.0f/FRACTIONONE); + s = _mm_set_ps(1.0f, mu, mu*mu, mu*mu*mu); + + /* result = matrix * val4 */ + result = _mm_mul_ps(val4, matrix[0]) ; + result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[1])); + result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[2])); + result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[3])); + + /* value = result * { mu^0, mu^1, mu^2, mu^3 } */ + _mm_store_ps(value, _mm_mul_ps(result, s)); + + OutBuffer[i] = value[0] + value[1] + value[2] + value[3]; + + frac += increment; + pos += frac>>FRACTIONBITS; + frac &= FRACTIONMASK; + } +} static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2],