openal-soft/Alc/mixer_neon.c

#include "config.h"

#include <arm_neon.h>

#include "AL/al.h"
#include "AL/alc.h"
#include "alMain.h"
#include "alu.h"
#include "hrtf.h"


/* Neon HRIR accumulation with per-call coefficient stepping.
 *
 * Coefficient rows are consumed two at a time (c = 0, 2, 4, ... while
 * c < IrSize). For each pair:
 *   - the rows of Values at (Offset+c)&HRIR_MASK and at the following wrapped
 *     index each accumulate coefficient*{left, right};
 *   - the two rows of Coeffs are advanced by the matching rows of CoeffStep
 *     and written back.
 * Because rows are handled in pairs, row IrSize is also accessed when IrSize
 * is odd (presumably callers always pass an even IrSize -- TODO confirm).
 */
static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
                                   const ALuint IrSize,
                                   ALfloat (*restrict Coeffs)[2],
                                   const ALfloat (*restrict CoeffStep)[2],
                                   ALfloat left, ALfloat right)
{
    /* Lane layout is { left, right, left, right }, matching two consecutive
     * [2]-element rows loaded back to back.
     */
    const float32x2_t lr_pair = vset_lane_f32(right, vdup_n_f32(left), 1);
    const float32x4_t lr_quad = vcombine_f32(lr_pair, lr_pair);
    ALuint i = 0;

    while(i < IrSize)
    {
        /* The second row index is derived from the already-wrapped first
         * index, so it wraps to 0 at the end of the history buffer.
         */
        const ALuint first = (Offset+i) & HRIR_MASK;
        const ALuint second = (first+1) & HRIR_MASK;
        float32_t *const row0 = (float32_t*)&Values[first][0];
        float32_t *const row1 = (float32_t*)&Values[second][0];
        float32_t *const coef_rows = (float32_t*)&Coeffs[i][0];

        const float32x4_t coef4 = vld1q_f32(coef_rows);
        const float32x4_t delta4 = vld1q_f32(&CoeffStep[i][0]);
        float32x4_t acc4 = vcombine_f32(vld1_f32(row0), vld1_f32(row1));

        /* Accumulate with the current coefficients; the stepped ones are only
         * stored for the next call.
         */
        acc4 = vmlaq_f32(acc4, coef4, lr_quad);

        vst1_f32(row0, vget_low_f32(acc4));
        vst1_f32(row1, vget_high_f32(acc4));
        vst1q_f32(coef_rows, vaddq_f32(coef4, delta4));

        i += 2;
    }
}

/* Neon HRIR accumulation with fixed coefficients.
 *
 * Coefficient rows are consumed two at a time (c = 0, 2, 4, ... while
 * c < IrSize). For each pair, the rows of Values at (Offset+c)&HRIR_MASK and
 * at the following wrapped index each accumulate coefficient*{left, right}.
 * Coeffs is only read here. Because rows are handled in pairs, row IrSize is
 * also read when IrSize is odd (presumably callers always pass an even
 * IrSize -- TODO confirm).
 */
static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
                               const ALuint IrSize,
                               ALfloat (*restrict Coeffs)[2],
                               ALfloat left, ALfloat right)
{
    /* Lane layout is { left, right, left, right }, matching two consecutive
     * [2]-element rows loaded back to back.
     */
    const float32x2_t lr_pair = vset_lane_f32(right, vdup_n_f32(left), 1);
    const float32x4_t lr_quad = vcombine_f32(lr_pair, lr_pair);
    ALuint i = 0;

    while(i < IrSize)
    {
        /* The second row index is derived from the already-wrapped first
         * index, so it wraps to 0 at the end of the history buffer.
         */
        const ALuint first = (Offset+i) & HRIR_MASK;
        const ALuint second = (first+1) & HRIR_MASK;
        float32_t *const row0 = (float32_t*)&Values[first][0];
        float32_t *const row1 = (float32_t*)&Values[second][0];

        const float32x4_t coef4 = vld1q_f32((float32_t*)&Coeffs[i][0]);
        float32x4_t acc4 = vcombine_f32(vld1_f32(row0), vld1_f32(row1));

        acc4 = vmlaq_f32(acc4, coef4, lr_quad);

        vst1_f32(row0, vget_low_f32(acc4));
        vst1_f32(row1, vget_high_f32(acc4));

        i += 2;
    }
}

/* Pull in mixer_inc.c with MixHrtf and MixDirectHrtf renamed to Neon-specific
 * identifiers. mixer_inc.c is not visible from here; presumably it defines
 * functions with those names on top of the ApplyCoeffsStep/ApplyCoeffs helpers
 * above -- confirm against that file.
 * NOTE(review): only MixHrtf is undefined after the include; MixDirectHrtf
 * stays defined for the rest of this translation unit -- confirm that this is
 * intentional.
 */
#define MixHrtf MixHrtf_Neon
#define MixDirectHrtf MixDirectHrtf_Neon
#include "mixer_inc.c"
#undef MixHrtf


/* Mixes one input buffer into several output channels with per-channel gain.
 *
 * data       - input samples; data[0..BufferSize) is read.
 * OutChans   - number of output channels (rows of OutBuffer and entries of
 *              Gains) to process.
 * OutBuffer  - output rows; samples are accumulated into
 *              OutBuffer[c][OutPos .. OutPos+BufferSize).
 * Gains      - per-channel gain state. Current and Step are read; Current is
 *              written back when a ramp was applied; Target is read when the
 *              ramp reaches Counter samples.
 * Counter    - number of samples the gain ramp should last. 0 disables
 *              ramping for this call.
 * OutPos     - sample offset into each output row.
 * BufferSize - number of samples to mix.
 *
 * For each channel, while Step is non-zero and Counter is non-zero, the gain
 * is advanced by Step after every sample for min(BufferSize, Counter) samples.
 * The remaining samples are mixed with a constant gain, and are skipped
 * entirely when |gain| is not above GAIN_SILENCE_THRESHOLD (a NaN gain also
 * fails that comparison and is skipped).
 */
void Mix_Neon(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
              MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
{
    ALfloat gain, step;
    float32x4_t gain4;
    ALuint c;

    for(c = 0;c < OutChans;c++)
    {
        ALuint pos = 0;
        gain = Gains[c].Current;
        step = Gains[c].Step;
        if(step != 0.0f && Counter > 0)
        {
            ALuint minsize = minu(BufferSize, Counter);
            /* Mix with applying gain steps in multiples of 4. */
            if(minsize-pos > 3)
            {
                float32x4_t step4;
                /* Seed all lanes from a defined value first. Inserting lane 0
                 * into a never-written gain4 would read an indeterminate
                 * vector. Lane 0 keeps 'gain'; lanes 1-3 are overwritten with
                 * the next three ramp values.
                 */
                gain4 = vdupq_n_f32(gain);
                gain4 = vsetq_lane_f32(gain + step, gain4, 1);
                gain4 = vsetq_lane_f32(gain + step + step, gain4, 2);
                gain4 = vsetq_lane_f32(gain + step + step + step, gain4, 3);
                /* Each vector iteration covers four samples, so every lane
                 * advances by four steps at a time.
                 */
                step4 = vdupq_n_f32(step + step + step + step);
                do {
                    const float32x4_t val4 = vld1q_f32(&data[pos]);
                    float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
                    dry4 = vmlaq_f32(dry4, val4, gain4);
                    gain4 = vaddq_f32(gain4, step4);
                    vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
                    pos += 4;
                } while(minsize-pos > 3);
                /* NOTE: gain4 now represents the next four gains after the
                 * last four mixed samples, so the lowest element represents
                 * the next gain to apply.
                 */
                gain = vgetq_lane_f32(gain4, 0);
            }
            /* Mix with applying left over gain steps that aren't multiples of 4. */
            for(;pos < minsize;pos++)
            {
                OutBuffer[c][OutPos+pos] += data[pos]*gain;
                gain += step;
            }
            /* When the whole ramp was consumed, use the exact target instead
             * of the accumulated value.
             */
            if(pos == Counter)
                gain = Gains[c].Target;
            Gains[c].Current = gain;

            /* Mix until pos is aligned with 4 or the mix is done. */
            minsize = minu(BufferSize, (pos+3)&~3);
            for(;pos < minsize;pos++)
                OutBuffer[c][OutPos+pos] += data[pos]*gain;
        }

        /* Nothing audible left to add for this channel. */
        if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
            continue;
        /* Constant-gain remainder: four samples at a time, then the tail. */
        gain4 = vdupq_n_f32(gain);
        for(;BufferSize-pos > 3;pos += 4)
        {
            const float32x4_t val4 = vld1q_f32(&data[pos]);
            float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
            dry4 = vmlaq_f32(dry4, val4, gain4);
            vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
        }
        for(;pos < BufferSize;pos++)
            OutBuffer[c][OutPos+pos] += data[pos]*gain;
    }
}

/* Accumulates several input channels into a single output row.
 *
 * OutBuffer  - output samples; OutBuffer[0..BufferSize) is accumulated into.
 * Gains      - one gain per input channel.
 * data       - input rows; data[c][0..BufferSize) is read for each channel
 *              that is mixed.
 * InChans    - number of input channels (rows of data, entries of Gains).
 * BufferSize - number of samples to mix.
 *
 * Channels whose |gain| is not above GAIN_SILENCE_THRESHOLD are skipped (a
 * NaN gain also fails that comparison and is skipped).
 */
void MixRow_Neon(ALfloat *OutBuffer, const ALfloat *Gains, ALfloat (*restrict data)[BUFFERSIZE], ALuint InChans, ALuint BufferSize)
{
    ALuint chan;

    for(chan = 0;chan < InChans;chan++)
    {
        const ALfloat chan_gain = Gains[chan];

        if(fabsf(chan_gain) > GAIN_SILENCE_THRESHOLD)
        {
            const float32x4_t chan_gain4 = vdupq_n_f32(chan_gain);
            ALuint i = 0;

            /* Four samples per iteration while at least four remain. */
            while(BufferSize-i > 3)
            {
                const float32x4_t in4 = vld1q_f32(&data[chan][i]);
                const float32x4_t out4 = vld1q_f32(&OutBuffer[i]);
                vst1q_f32(&OutBuffer[i], vmlaq_f32(out4, in4, chan_gain4));
                i += 4;
            }
            /* Scalar tail for the last 0-3 samples. */
            while(i < BufferSize)
            {
                OutBuffer[i] += data[chan][i]*chan_gain;
                i++;
            }
        }
    }
}
Move mixers into separate source files 2012-08-15 01:01:55 -07:00			`#include "config.h"`

			`#include <arm_neon.h>`

			`#include "AL/al.h"`
			`#include "AL/alc.h"`
			`#include "alMain.h"`
			`#include "alu.h"`
Attempt to restore the Neon-enhanced ApplyCoeffsStep method Unable to test, but it hopefully works. 2014-02-23 21:28:34 -08:00			`#include "hrtf.h"`
Move mixers into separate source files 2012-08-15 01:01:55 -07:00

Partially revert "Use a different method for HRTF mixing" The sound localization with virtual channel mixing was just too poor, so while it's more costly to do per-source HRTF mixing, it's unavoidable if you want good localization. This is only partially reverted because having the virtual channel is still beneficial, particularly with B-Format rendering and effect mixing which otherwise skip HRTF processing. As before, the number of virtual channels can potentially be customized, specifying more or less channels depending on the system's needs. 2014-11-23 10:49:54 -08:00			`static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],`
			`const ALuint IrSize,`
			`ALfloat (*restrict Coeffs)[2],`
			`const ALfloat (*restrict CoeffStep)[2],`
			`ALfloat left, ALfloat right)`
			`{`
			`ALuint c;`
			`float32x4_t leftright4;`
			`{`
			`float32x2_t leftright2 = vdup_n_f32(0.0);`
			`leftright2 = vset_lane_f32(left, leftright2, 0);`
			`leftright2 = vset_lane_f32(right, leftright2, 1);`
			`leftright4 = vcombine_f32(leftright2, leftright2);`
			`}`
			`for(c = 0;c < IrSize;c += 2)`
			`{`
			`const ALuint o0 = (Offset+c)&HRIR_MASK;`
			`const ALuint o1 = (o0+1)&HRIR_MASK;`
			`float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),`
			`vld1_f32((float32_t*)&Values[o1][0]));`
			`float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);`
			`float32x4_t deltas = vld1q_f32(&CoeffStep[c][0]);`

			`vals = vmlaq_f32(vals, coefs, leftright4);`
			`coefs = vaddq_f32(coefs, deltas);`

			`vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));`
			`vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));`
			`vst1q_f32(&Coeffs[c][0], coefs);`
			`}`
			`}`

Use C99's inline instead of __inline 2013-05-28 22:27:07 -07:00			`static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],`
			`const ALuint IrSize,`
			`ALfloat (*restrict Coeffs)[2],`
			`ALfloat left, ALfloat right)`
Move mixers into separate source files 2012-08-15 01:01:55 -07:00			`{`
			`ALuint c;`
			`float32x4_t leftright4;`
			`{`
			`float32x2_t leftright2 = vdup_n_f32(0.0);`
			`leftright2 = vset_lane_f32(left, leftright2, 0);`
			`leftright2 = vset_lane_f32(right, leftright2, 1);`
			`leftright4 = vcombine_f32(leftright2, leftright2);`
			`}`
Update HRTF code This update allows for much more flexibility in the HRTF data. It also allows for HRTF table file names to include "%r" to represent the device's playback rate (e.g. if you set hrtf-%r.mhr, then it will try to use hrtf-44100.mhr or hrtf-48000.mhr depending if the device's output rate is 44100 or 48000, respectively). The makehrtf utility has also been updated to support more options and input file formats, as well as the new mhr format. 2012-09-11 01:59:42 -07:00			`for(c = 0;c < IrSize;c += 2)`
Move mixers into separate source files 2012-08-15 01:01:55 -07:00			`{`
			`const ALuint o0 = (Offset+c)&HRIR_MASK;`
			`const ALuint o1 = (o0+1)&HRIR_MASK;`
			`float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),`
			`vld1_f32((float32_t*)&Values[o1][0]));`
			`float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);`

			`vals = vmlaq_f32(vals, coefs, leftright4);`

			`vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));`
			`vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));`
			`}`
			`}`

Define MixHrtf directly instead of through a SUFFIX macro 2015-08-15 01:37:46 -07:00			`#define MixHrtf MixHrtf_Neon`
Use a more specialized mixer function for B-Format to HRTF 2016-08-12 05:26:36 -07:00			`#define MixDirectHrtf MixDirectHrtf_Neon`
Move mixers into separate source files 2012-08-15 01:01:55 -07:00			`#include "mixer_inc.c"`
Define MixHrtf directly instead of through a SUFFIX macro 2015-08-15 01:37:46 -07:00			`#undef MixHrtf`
Implement dry and wet mixers for Neon Code provided by Philippe Simons <simons.philippe@gmail.com>. 2014-01-26 01:34:39 -08:00

Fix Neon mixer definition 2014-08-31 23:46:43 -07:00			`void Mix_Neon(const ALfloat data, ALuint OutChans, ALfloat (restrict OutBuffer)[BUFFERSIZE],`
			`MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)`
Implement dry and wet mixers for Neon Code provided by Philippe Simons <simons.philippe@gmail.com>. 2014-01-26 01:34:39 -08:00			`{`
Combine the direct and send mixers 2014-06-13 13:34:19 -07:00			`ALfloat gain, step;`
			`float32x4_t gain4;`
Implement dry and wet mixers for Neon Code provided by Philippe Simons <simons.philippe@gmail.com>. 2014-01-26 01:34:39 -08:00			`ALuint c;`

Combine the direct and send mixers 2014-06-13 13:34:19 -07:00			`for(c = 0;c < OutChans;c++)`
Implement dry and wet mixers for Neon Code provided by Philippe Simons <simons.philippe@gmail.com>. 2014-01-26 01:34:39 -08:00			`{`
Step mixing gains per-sample for non-HRTF mixing This fades the dry mixing gains using a logarithmic curve, which should produce a smoother transition than a linear one. It functions similarly to a linear fade except that step = (target - current) / numsteps; ... gain += step; becomes step = powf(target / current, 1.0f / numsteps); ... gain *= step; where 'target' and 'current' are clamped to a lower bound that is greater than 0 (which makes no sense on a logarithmic scale). Consequently, the non-HRTF direct mixers do not do not feed into the click removal and pending click buffers, as this per-sample fading would do an adequate job of stopping clicks and pops caused by extreme gain changes. These buffers should be removed shortly. 2014-03-23 06:57:00 -07:00			`ALuint pos = 0;`
Combine the direct and send mixers 2014-06-13 13:34:19 -07:00			`gain = Gains[c].Current;`
			`step = Gains[c].Step;`
Use linear gain stepping 2014-11-25 02:08:48 -08:00			`if(step != 0.0f && Counter > 0)`
Step mixing gains per-sample for non-HRTF mixing This fades the dry mixing gains using a logarithmic curve, which should produce a smoother transition than a linear one. It functions similarly to a linear fade except that step = (target - current) / numsteps; ... gain += step; becomes step = powf(target / current, 1.0f / numsteps); ... gain *= step; where 'target' and 'current' are clamped to a lower bound that is greater than 0 (which makes no sense on a logarithmic scale). Consequently, the non-HRTF direct mixers do not do not feed into the click removal and pending click buffers, as this per-sample fading would do an adequate job of stopping clicks and pops caused by extreme gain changes. These buffers should be removed shortly. 2014-03-23 06:57:00 -07:00			`{`
Avoid double-checks for the stepping mixer loops 2015-09-30 17:25:28 -07:00			`ALuint minsize = minu(BufferSize, Counter);`
Mix gain steps using SIMD with Neon 2016-08-05 18:47:26 -07:00			`/* Mix with applying gain steps in aligned multiples of 4. */`
			`if(minsize-pos > 3)`
			`{`
			`float32x4_t step4;`
			`gain4 = vsetq_lane_f32(gain, gain4, 0);`
			`gain4 = vsetq_lane_f32(gain + step, gain4, 1);`
			`gain4 = vsetq_lane_f32(gain + step + step, gain4, 2);`
			`gain4 = vsetq_lane_f32(gain + step + step + step, gain4, 3);`
			`step4 = vdupq_n_f32(step + step + step + step);`
			`do {`
			`const float32x4_t val4 = vld1q_f32(&data[pos]);`
			`float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);`
			`dry4 = vmlaq_f32(dry4, val4, gain4);`
			`gain4 = vaddq_f32(gain4, step4);`
			`vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);`
			`pos += 4;`
			`} while(minsize-pos > 3);`
			`/* NOTE: gain4 now represents the next four gains after the`
			`* last four mixed samples, so the lowest element represents`
			`* the next gain to apply.`
			`*/`
			`gain = vgetq_lane_f32(gain4, 0);`
			`}`
			`/* Mix with applying left over gain steps that aren't aligned multiples of 4. */`
Avoid double-checks for the stepping mixer loops 2015-09-30 17:25:28 -07:00			`for(;pos < minsize;pos++)`
Step mixing gains per-sample for non-HRTF mixing This fades the dry mixing gains using a logarithmic curve, which should produce a smoother transition than a linear one. It functions similarly to a linear fade except that step = (target - current) / numsteps; ... gain += step; becomes step = powf(target / current, 1.0f / numsteps); ... gain *= step; where 'target' and 'current' are clamped to a lower bound that is greater than 0 (which makes no sense on a logarithmic scale). Consequently, the non-HRTF direct mixers do not do not feed into the click removal and pending click buffers, as this per-sample fading would do an adequate job of stopping clicks and pops caused by extreme gain changes. These buffers should be removed shortly. 2014-03-23 06:57:00 -07:00			`{`
Combine the direct and send mixers 2014-06-13 13:34:19 -07:00			`OutBuffer[c][OutPos+pos] += data[pos]*gain;`
Use linear gain stepping 2014-11-25 02:08:48 -08:00			`gain += step;`
Step mixing gains per-sample for non-HRTF mixing This fades the dry mixing gains using a logarithmic curve, which should produce a smoother transition than a linear one. It functions similarly to a linear fade except that step = (target - current) / numsteps; ... gain += step; becomes step = powf(target / current, 1.0f / numsteps); ... gain *= step; where 'target' and 'current' are clamped to a lower bound that is greater than 0 (which makes no sense on a logarithmic scale). Consequently, the non-HRTF direct mixers do not do not feed into the click removal and pending click buffers, as this per-sample fading would do an adequate job of stopping clicks and pops caused by extreme gain changes. These buffers should be removed shortly. 2014-03-23 06:57:00 -07:00			`}`
Always use the current gains when mixing The current gain gets explicitly set to the target when the stepping is finished to ensure the target is still used. This way, however, will allow for asynchronously 'canceling' a fade by setting the counter to 0. 2014-05-04 00:13:19 -07:00			`if(pos == Counter)`
Combine the direct and send mixers 2014-06-13 13:34:19 -07:00			`gain = Gains[c].Target;`
			`Gains[c].Current = gain;`
Avoid double-checks for the stepping mixer loops 2015-09-30 17:25:28 -07:00
Make sure all gain steps are applied with the SSE and Neon mixers 2014-05-03 17:24:46 -07:00			`/* Mix until pos is aligned with 4 or the mix is done. */`
Use the correct realignment size for post-stepping mixing 2015-10-18 13:46:52 -07:00			`minsize = minu(BufferSize, (pos+3)&~3);`
Avoid double-checks for the stepping mixer loops 2015-09-30 17:25:28 -07:00			`for(;pos < minsize;pos++)`
Combine the direct and send mixers 2014-06-13 13:34:19 -07:00			`OutBuffer[c][OutPos+pos] += data[pos]*gain;`
Step mixing gains per-sample for non-HRTF mixing This fades the dry mixing gains using a logarithmic curve, which should produce a smoother transition than a linear one. It functions similarly to a linear fade except that step = (target - current) / numsteps; ... gain += step; becomes step = powf(target / current, 1.0f / numsteps); ... gain *= step; where 'target' and 'current' are clamped to a lower bound that is greater than 0 (which makes no sense on a logarithmic scale). Consequently, the non-HRTF direct mixers do not do not feed into the click removal and pending click buffers, as this per-sample fading would do an adequate job of stopping clicks and pops caused by extreme gain changes. These buffers should be removed shortly. 2014-03-23 06:57:00 -07:00			`}`

Check the absolute gain value for silence Future B-Format support will be using negative gains, which still need to be applied. 2014-10-31 16:55:19 -07:00			`if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))`
Implement dry and wet mixers for Neon Code provided by Philippe Simons <simons.philippe@gmail.com>. 2014-01-26 01:34:39 -08:00			`continue;`
Combine the direct and send mixers 2014-06-13 13:34:19 -07:00			`gain4 = vdupq_n_f32(gain);`
Step mixing gains per-sample for non-HRTF mixing This fades the dry mixing gains using a logarithmic curve, which should produce a smoother transition than a linear one. It functions similarly to a linear fade except that step = (target - current) / numsteps; ... gain += step; becomes step = powf(target / current, 1.0f / numsteps); ... gain *= step; where 'target' and 'current' are clamped to a lower bound that is greater than 0 (which makes no sense on a logarithmic scale). Consequently, the non-HRTF direct mixers do not do not feed into the click removal and pending click buffers, as this per-sample fading would do an adequate job of stopping clicks and pops caused by extreme gain changes. These buffers should be removed shortly. 2014-03-23 06:57:00 -07:00			`for(;BufferSize-pos > 3;pos += 4)`
Implement dry and wet mixers for Neon Code provided by Philippe Simons <simons.philippe@gmail.com>. 2014-01-26 01:34:39 -08:00			`{`
			`const float32x4_t val4 = vld1q_f32(&data[pos]);`
			`float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);`
Replace separate vaddq_f32/vmulq_f32 calls with a vmlaq_f32 2015-09-30 13:34:09 -07:00			`dry4 = vmlaq_f32(dry4, val4, gain4);`
Implement dry and wet mixers for Neon Code provided by Philippe Simons <simons.philippe@gmail.com>. 2014-01-26 01:34:39 -08:00			`vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);`
			`}`
			`for(;pos < BufferSize;pos++)`
Combine the direct and send mixers 2014-06-13 13:34:19 -07:00			`OutBuffer[c][OutPos+pos] += data[pos]*gain;`
Add gain stepping to the send mixers 2014-03-23 16:11:21 -07:00			`}`
Implement dry and wet mixers for Neon Code provided by Philippe Simons <simons.philippe@gmail.com>. 2014-01-26 01:34:39 -08:00			`}`
Implement a Neon-enhanced MixRow 2016-06-01 23:39:13 -07:00
Rename MatrixMixerFunc to RowMixerFunc 2016-09-02 00:29:46 -07:00			`void MixRow_Neon(ALfloat OutBuffer, const ALfloat Gains, ALfloat (*restrict data)[BUFFERSIZE], ALuint InChans, ALuint BufferSize)`
Implement a Neon-enhanced MixRow 2016-06-01 23:39:13 -07:00			`{`
			`float32x4_t gain4;`
			`ALuint c;`

			`for(c = 0;c < InChans;c++)`
			`{`
			`ALuint pos = 0;`
Rename MatrixMixerFunc to RowMixerFunc 2016-09-02 00:29:46 -07:00			`ALfloat gain = Gains[c];`
Implement a Neon-enhanced MixRow 2016-06-01 23:39:13 -07:00			`if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))`
			`continue;`

			`gain4 = vdupq_n_f32(gain);`
			`for(;BufferSize-pos > 3;pos += 4)`
			`{`
			`const float32x4_t val4 = vld1q_f32(&data[c][pos]);`
			`float32x4_t dry4 = vld1q_f32(&OutBuffer[pos]);`
			`dry4 = vmlaq_f32(dry4, val4, gain4);`
			`vst1q_f32(&OutBuffer[pos], dry4);`
			`}`
			`for(;pos < BufferSize;pos++)`
			`OutBuffer[pos] += data[c][pos]*gain;`
			`}`
			`}`