openal-soft/Alc/mixer_sse.c
Chris Robinson ecdc93f3ca Calculate HRTF stepping params right before mixing
This means we track the current params and the target params, rather than the
target params and the stepping. This closer matches the non-HRTF mixers.
2016-02-14 03:23:06 -08:00

263 lines
8.8 KiB
C

#include "config.h"
#include <xmmintrin.h>
#include "AL/al.h"
#include "AL/alc.h"
#include "alMain.h"
#include "alu.h"
#include "alSource.h"
#include "alAuxEffectSlot.h"
#include "mixer_defs.h"
const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *src, ALuint frac,
ALuint increment, ALfloat *restrict dst, ALuint dstlen)
{
const __m128 sf4 = _mm_set1_ps(state->sf);
const ALuint m = state->m;
const ALint l = state->l;
const ALfloat *fil, *scd, *phd, *spd;
ALuint pi, j_f, i;
ALfloat pf;
ALint j_s;
__m128 r4;
for(i = 0;i < dstlen;i++)
{
// Calculate the phase index and factor.
#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
pi = frac >> FRAC_PHASE_BITDIFF;
pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
#undef FRAC_PHASE_BITDIFF
fil = state->coeffs[pi].filter;
scd = state->coeffs[pi].scDelta;
phd = state->coeffs[pi].phDelta;
spd = state->coeffs[pi].spDelta;
// Apply the scale and phase interpolated filter.
r4 = _mm_setzero_ps();
{
const __m128 pf4 = _mm_set1_ps(pf);
for(j_f = 0,j_s = l;j_f < m;j_f+=4,j_s+=4)
{
const __m128 f4 = _mm_add_ps(
_mm_add_ps(
_mm_load_ps(&fil[j_f]),
_mm_mul_ps(sf4, _mm_load_ps(&scd[j_f]))
),
_mm_mul_ps(
pf4,
_mm_add_ps(
_mm_load_ps(&phd[j_f]),
_mm_mul_ps(sf4, _mm_load_ps(&spd[j_f]))
)
)
);
r4 = _mm_add_ps(r4, _mm_mul_ps(f4, _mm_loadu_ps(&src[j_s])));
}
}
r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
dst[i] = _mm_cvtss_f32(r4);
frac += increment;
src += frac>>FRACTIONBITS;
frac &= FRACTIONMASK;
}
return dst;
}
static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
const ALuint IrSize,
ALfloat (*restrict Coeffs)[2],
const ALfloat (*restrict CoeffStep)[2],
ALfloat left, ALfloat right)
{
const __m128 lrlr = _mm_setr_ps(left, right, left, right);
__m128 coeffs, deltas, imp0, imp1;
__m128 vals = _mm_setzero_ps();
ALuint i;
if((Offset&1))
{
const ALuint o0 = Offset&HRIR_MASK;
const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
coeffs = _mm_load_ps(&Coeffs[0][0]);
deltas = _mm_load_ps(&CoeffStep[0][0]);
vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
imp0 = _mm_mul_ps(lrlr, coeffs);
coeffs = _mm_add_ps(coeffs, deltas);
vals = _mm_add_ps(imp0, vals);
_mm_store_ps(&Coeffs[0][0], coeffs);
_mm_storel_pi((__m64*)&Values[o0][0], vals);
for(i = 1;i < IrSize-1;i += 2)
{
const ALuint o2 = (Offset+i)&HRIR_MASK;
coeffs = _mm_load_ps(&Coeffs[i+1][0]);
deltas = _mm_load_ps(&CoeffStep[i+1][0]);
vals = _mm_load_ps(&Values[o2][0]);
imp1 = _mm_mul_ps(lrlr, coeffs);
coeffs = _mm_add_ps(coeffs, deltas);
imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
vals = _mm_add_ps(imp0, vals);
_mm_store_ps(&Coeffs[i+1][0], coeffs);
_mm_store_ps(&Values[o2][0], vals);
imp0 = imp1;
}
vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
imp0 = _mm_movehl_ps(imp0, imp0);
vals = _mm_add_ps(imp0, vals);
_mm_storel_pi((__m64*)&Values[o1][0], vals);
}
else
{
for(i = 0;i < IrSize;i += 2)
{
const ALuint o = (Offset + i)&HRIR_MASK;
coeffs = _mm_load_ps(&Coeffs[i][0]);
deltas = _mm_load_ps(&CoeffStep[i][0]);
vals = _mm_load_ps(&Values[o][0]);
imp0 = _mm_mul_ps(lrlr, coeffs);
coeffs = _mm_add_ps(coeffs, deltas);
vals = _mm_add_ps(imp0, vals);
_mm_store_ps(&Coeffs[i][0], coeffs);
_mm_store_ps(&Values[o][0], vals);
}
}
}
static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
const ALuint IrSize,
ALfloat (*restrict Coeffs)[2],
ALfloat left, ALfloat right)
{
const __m128 lrlr = _mm_setr_ps(left, right, left, right);
__m128 vals = _mm_setzero_ps();
__m128 coeffs;
ALuint i;
if((Offset&1))
{
const ALuint o0 = Offset&HRIR_MASK;
const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
__m128 imp0, imp1;
coeffs = _mm_load_ps(&Coeffs[0][0]);
vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
imp0 = _mm_mul_ps(lrlr, coeffs);
vals = _mm_add_ps(imp0, vals);
_mm_storel_pi((__m64*)&Values[o0][0], vals);
for(i = 1;i < IrSize-1;i += 2)
{
const ALuint o2 = (Offset+i)&HRIR_MASK;
coeffs = _mm_load_ps(&Coeffs[i+1][0]);
vals = _mm_load_ps(&Values[o2][0]);
imp1 = _mm_mul_ps(lrlr, coeffs);
imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
vals = _mm_add_ps(imp0, vals);
_mm_store_ps(&Values[o2][0], vals);
imp0 = imp1;
}
vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
imp0 = _mm_movehl_ps(imp0, imp0);
vals = _mm_add_ps(imp0, vals);
_mm_storel_pi((__m64*)&Values[o1][0], vals);
}
else
{
for(i = 0;i < IrSize;i += 2)
{
const ALuint o = (Offset + i)&HRIR_MASK;
coeffs = _mm_load_ps(&Coeffs[i][0]);
vals = _mm_load_ps(&Values[o][0]);
vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
_mm_store_ps(&Values[o][0], vals);
}
}
}
#define MixHrtf MixHrtf_SSE
#include "mixer_inc.c"
#undef MixHrtf
void Mix_SSE(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
{
ALfloat gain, step;
__m128 gain4;
ALuint c;
for(c = 0;c < OutChans;c++)
{
ALuint pos = 0;
gain = Gains[c].Current;
step = Gains[c].Step;
if(step != 0.0f && Counter > 0)
{
ALuint minsize = minu(BufferSize, Counter);
/* Mix with applying gain steps in aligned multiples of 4. */
if(minsize-pos > 3)
{
__m128 step4;
gain4 = _mm_setr_ps(
gain,
gain + step,
gain + step + step,
gain + step + step + step
);
step4 = _mm_set1_ps(step + step + step + step);
do {
const __m128 val4 = _mm_load_ps(&data[pos]);
__m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
gain4 = _mm_add_ps(gain4, step4);
_mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
pos += 4;
} while(minsize-pos > 3);
/* NOTE: gain4 now represents the next four gains after the
* last four mixed samples, so the lowest element represents
* the next gain to apply.
*/
gain = _mm_cvtss_f32(gain4);
}
/* Mix with applying left over gain steps that aren't aligned multiples of 4. */
for(;pos < minsize;pos++)
{
OutBuffer[c][OutPos+pos] += data[pos]*gain;
gain += step;
}
if(pos == Counter)
gain = Gains[c].Target;
Gains[c].Current = gain;
/* Mix until pos is aligned with 4 or the mix is done. */
minsize = minu(BufferSize, (pos+3)&~3);
for(;pos < minsize;pos++)
OutBuffer[c][OutPos+pos] += data[pos]*gain;
}
if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
continue;
gain4 = _mm_set1_ps(gain);
for(;BufferSize-pos > 3;pos += 4)
{
const __m128 val4 = _mm_load_ps(&data[pos]);
__m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
_mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
}
for(;pos < BufferSize;pos++)
OutBuffer[c][OutPos+pos] += data[pos]*gain;
}
}