Implement dry and wet mixers for Neon

Code provided by Philippe Simons <simons.philippe@gmail.com>.
This commit is contained in:
Chris Robinson 2014-01-26 01:34:39 -08:00
parent 49baa9128d
commit a4bc0a46e9
4 changed files with 91 additions and 5 deletions

View File

@ -1745,8 +1745,8 @@ static ALCenum UpdateDeviceParams(ALCdevice *device, const ALCint *attrList)
device->UpdateSize = (ALuint64)device->UpdateSize * freq /
device->Frequency;
/* SSE does best with the update size being a multiple of 4 */
if((CPUCapFlags&CPU_CAP_SSE))
/* SSE and Neon do best with the update size being a multiple of 4 */
if((CPUCapFlags&(CPU_CAP_SSE|CPU_CAP_NEON)) != 0)
device->UpdateSize = (device->UpdateSize+3)&~3;
device->Frequency = freq;
@ -1861,6 +1861,8 @@ static ALCenum UpdateDeviceParams(ALCdevice *device, const ALCint *attrList)
{
if((CPUCapFlags&CPU_CAP_SSE))
WARN("SSE performs best with multiple of 4 update sizes (%u)\n", device->UpdateSize);
if((CPUCapFlags&CPU_CAP_NEON))
WARN("NEON performs best with multiple of 4 update sizes (%u)\n", device->UpdateSize);
}
SetMixerFPUMode(&oldMode);

View File

@ -118,6 +118,10 @@ static DryMixerFunc SelectDirectMixer(void)
if((CPUCapFlags&CPU_CAP_SSE))
return MixDirect_SSE;
#endif
#ifdef HAVE_NEON
if((CPUCapFlags&CPU_CAP_NEON))
return MixDirect_Neon;
#endif
return MixDirect_C;
}
@ -128,6 +132,10 @@ static WetMixerFunc SelectSendMixer(void)
if((CPUCapFlags&CPU_CAP_SSE))
return MixSend_SSE;
#endif
#ifdef HAVE_NEON
if((CPUCapFlags&CPU_CAP_NEON))
return MixSend_Neon;
#endif
return MixSend_C;
}

View File

@ -27,5 +27,7 @@ void MixSend_SSE(const struct SendParams*,const ALfloat*restrict,ALuint,ALuint,A
/* Neon mixers */
void MixDirect_Hrtf_Neon(const struct DirectParams*,const ALfloat*restrict,ALuint,ALuint,ALuint,ALuint);
void MixDirect_Neon(const struct DirectParams*,const ALfloat*restrict,ALuint,ALuint,ALuint,ALuint);
void MixSend_Neon(const struct SendParams*,const ALfloat*restrict,ALuint,ALuint,ALuint);
#endif /* MIXER_DEFS_H */

View File

@ -14,11 +14,15 @@ static inline void ApplyCoeffsStep(const ALuint IrSize,
ALfloat (*restrict Coeffs)[2],
const ALfloat (*restrict CoeffStep)[2])
{
float32x4_t coeffs, deltas;
ALuint c;
for(c = 0;c < IrSize;c++)
for(c = 0;c < IrSize;c += 2)
{
Coeffs[c][0] += CoeffStep[c][0];
Coeffs[c][1] += CoeffStep[c][1];
coeffs = vld1q_f32(&Coeffs[c][0]);
deltas = vld1q_f32(&CoeffStep[c][0]);
coeffs = vaddq_f32(coeffs, deltas);
vst1q_f32(&Coeffs[c][0], coeffs);
}
}
@ -54,3 +58,73 @@ static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
#define SUFFIX Neon
#include "mixer_inc.c"
#undef SUFFIX
void MixDirect_Neon(const DirectParams *params, const ALfloat *restrict data, ALuint srcchan,
ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
{
ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer;
ALfloat *restrict ClickRemoval = params->ClickRemoval;
ALfloat *restrict PendingClicks = params->PendingClicks;
ALfloat DrySend;
float32x4_t gain;
ALuint pos;
ALuint c;
for(c = 0;c < MaxChannels;c++)
{
DrySend = params->Gains[srcchan][c];
if(!(DrySend > GAIN_SILENCE_THRESHOLD))
continue;
if(OutPos == 0)
ClickRemoval[c] -= data[0]*DrySend;
gain = vdupq_n_f32(DrySend);
for(pos = 0;BufferSize-pos > 3;pos += 4)
{
const float32x4_t val4 = vld1q_f32(&data[pos]);
float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
dry4 = vaddq_f32(dry4, vmulq_f32(val4, gain));
vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
}
for(;pos < BufferSize;pos++)
OutBuffer[c][OutPos+pos] += data[pos]*DrySend;
if(OutPos+pos == SamplesToDo)
PendingClicks[c] += data[pos]*DrySend;
}
}
void MixSend_Neon(const SendParams *params, const ALfloat *restrict data,
ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
{
ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer;
ALfloat *restrict ClickRemoval = params->ClickRemoval;
ALfloat *restrict PendingClicks = params->PendingClicks;
ALfloat WetGain;
float32x4_t gain;
ALuint pos;
WetGain = params->Gain;
if(!(WetGain > GAIN_SILENCE_THRESHOLD))
return;
if(OutPos == 0)
ClickRemoval[0] -= data[0] * WetGain;
gain = vdupq_n_f32(WetGain);
for(pos = 0;BufferSize-pos > 3;pos += 4)
{
const float32x4_t val4 = vld1q_f32(&data[pos]);
float32x4_t wet4 = vld1q_f32(&OutBuffer[0][OutPos+pos]);
wet4 = vaddq_f32(wet4, vmulq_f32(val4, gain));
vst1q_f32(&OutBuffer[0][OutPos+pos], wet4);
}
for(;pos < BufferSize;pos++)
OutBuffer[0][OutPos+pos] += data[pos] * WetGain;
if(OutPos+pos == SamplesToDo)
PendingClicks[0] += data[pos] * WetGain;
}