Use SSE to process 4 samples at once (non-HRTF direct mix), instead of using it to apply a matrix row

MaxChannels no longer needs to be a multiple of 4, either.
This commit is contained in:
Chris Robinson 2012-09-09 04:02:54 -07:00
parent f56dddfa73
commit f1ce139997
2 changed files with 52 additions and 32 deletions

View File

@ -65,20 +65,6 @@ static __inline void ApplyCoeffs(ALuint Offset, ALfloat (*RESTRICT Values)[2],
}
static __inline void ApplyValue(ALfloat *RESTRICT Output, ALfloat value, const ALfloat *DrySend)
{
const __m128 val4 = _mm_set1_ps(value);
ALuint c;
for(c = 0;c < MaxChannels;c += 4)
{
const __m128 gains = _mm_load_ps(&DrySend[c]);
__m128 out = _mm_load_ps(&Output[c]);
out = _mm_add_ps(out, _mm_mul_ps(val4, gains));
_mm_store_ps(&Output[c], out);
}
}
void MixDirect_SSE(ALsource *Source, ALCdevice *Device, DirectParams *params,
const ALfloat *RESTRICT data, ALuint srcchan,
ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
@ -86,9 +72,9 @@ void MixDirect_SSE(ALsource *Source, ALCdevice *Device, DirectParams *params,
ALfloat (*RESTRICT DryBuffer)[MaxChannels];
ALfloat *RESTRICT ClickRemoval, *RESTRICT PendingClicks;
ALIGN(16) ALfloat DrySend[MaxChannels];
ALIGN(16) ALfloat value[4];
FILTER *DryFilter;
ALuint pos;
ALfloat value;
ALuint c;
(void)Source;
@ -103,19 +89,54 @@ void MixDirect_SSE(ALsource *Source, ALCdevice *Device, DirectParams *params,
pos = 0;
if(OutPos == 0)
{
value = lpFilter2PC(DryFilter, srcchan, data[pos]);
ApplyValue(ClickRemoval, -value, DrySend);
value[0] = lpFilter2PC(DryFilter, srcchan, data[pos]);
for(c = 0;c < MaxChannels;c++)
ClickRemoval[c] -= value[0]*DrySend[c];
}
for(pos = 0;pos < BufferSize;pos++)
for(pos = 0;pos < BufferSize-3;pos += 4)
{
value = lpFilter2P(DryFilter, srcchan, data[pos]);
ApplyValue(DryBuffer[OutPos], value, DrySend);
__m128 val4;
value[0] = lpFilter2P(DryFilter, srcchan, data[pos ]);
value[1] = lpFilter2P(DryFilter, srcchan, data[pos+1]);
value[2] = lpFilter2P(DryFilter, srcchan, data[pos+2]);
value[3] = lpFilter2P(DryFilter, srcchan, data[pos+3]);
val4 = _mm_load_ps(value);
for(c = 0;c < MaxChannels;c++)
{
const __m128 gain = _mm_set1_ps(DrySend[c]);
__m128 dry4;
value[0] = DryBuffer[OutPos ][c];
value[1] = DryBuffer[OutPos+1][c];
value[2] = DryBuffer[OutPos+2][c];
value[3] = DryBuffer[OutPos+3][c];
dry4 = _mm_load_ps(value);
dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
_mm_store_ps(value, dry4);
DryBuffer[OutPos ][c] = value[0];
DryBuffer[OutPos+1][c] = value[1];
DryBuffer[OutPos+2][c] = value[2];
DryBuffer[OutPos+3][c] = value[3];
}
OutPos += 4;
}
for(;pos < BufferSize;pos++)
{
value[0] = lpFilter2P(DryFilter, srcchan, data[pos]);
for(c = 0;c < MaxChannels;c++)
DryBuffer[OutPos][c] += value[0]*DrySend[c];
OutPos++;
}
if(OutPos == SamplesToDo)
{
value = lpFilter2PC(DryFilter, srcchan, data[pos]);
ApplyValue(PendingClicks, value, DrySend);
value[0] = lpFilter2PC(DryFilter, srcchan, data[pos]);
for(c = 0;c < MaxChannels;c++)
PendingClicks[c] += value[0]*DrySend[c];
}
}
#define NO_MIXDIRECT

View File

@ -128,17 +128,16 @@ enum Resampler {
enum Channel {
FrontLeft = 0,
FrontRight, /* 1 */
FrontCenter, /* 2 */
LFE, /* 3 */
BackLeft, /* 4 */
BackRight, /* 5 */
BackCenter, /* 6 */
SideLeft, /* 7 */
SideRight, /* 8 */
FrontRight,
FrontCenter,
LFE,
BackLeft,
BackRight,
BackCenter,
SideLeft,
SideRight,
/* Must be a multiple of 4 */
MaxChannels = 12,
MaxChannels,
};
enum DistanceModel {