Avoid some extraneous load calls

This likely makes no difference to the generated code given a working
optimizer, but it cleans up the source somewhat.
Author: Chris Robinson, 2017-08-30 11:30:19 -07:00
parent 2916efee21
commit cd15b1775e
2 changed files with 23 additions and 26 deletions
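
Both files get the same two changes: the four section pointers into the filter
table become vector-typed (float32x4_t for NEON, __m128 for SSE) and are
stepped alongside the loop counter, so each coefficient block is read with a
plain dereference rather than an explicit vld1q_f32()/_mm_load_ps() call. A
minimal standalone sketch of the pattern in SSE terms (hypothetical names, not
code from this repository; assumes data is 16-byte aligned and n is a multiple
of 4):

#include <xmmintrin.h>

/* Before: an explicit aligned-load intrinsic every iteration. */
static __m128 accum_loads(const float *data, int n)
{
    __m128 r4 = _mm_setzero_ps();
    int j;
    for(j = 0;j < n;j += 4)
        r4 = _mm_add_ps(r4, _mm_load_ps(&data[j]));
    return r4;
}

/* After: a vector-typed pointer stepped with the loop; the dereference
 * implies the same 16-byte alignment, and the compiler is free to fold
 * the load straight into the add. */
static __m128 accum_deref(const __m128 *data4, int n)
{
    __m128 r4 = _mm_setzero_ps();
    int j;
    for(j = 0;j < n;j += 4,data4++)
        r4 = _mm_add_ps(r4, *data4);
    return r4;
}

With a working optimizer both forms compile to the same loads, which is why
the message above only claims a cleanup.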

@@ -144,8 +144,8 @@ const ALfloat *Resample_bsinc_Neon(const InterpState *state,
     const ALfloat *const filter = state->bsinc.filter;
     const float32x4_t sf4 = vdupq_n_f32(state->bsinc.sf);
     const ALsizei m = state->bsinc.m;
-    const ALfloat *fil, *scd, *phd, *spd;
-    ALsizei pi, i, j;
+    const float32x4_t *fil, *scd, *phd, *spd;
+    ALsizei pi, i, j, offset;
     float32x4_t r4;
     ALfloat pf;
@@ -158,23 +158,22 @@ const ALfloat *Resample_bsinc_Neon(const InterpState *state,
     pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
 #undef FRAC_PHASE_BITDIFF
-    fil = ASSUME_ALIGNED(filter + m*pi*4, 16);
-    scd = ASSUME_ALIGNED(fil + m, 16);
-    phd = ASSUME_ALIGNED(scd + m, 16);
-    spd = ASSUME_ALIGNED(phd + m, 16);
+    offset = m*pi*4;
+    fil = ASSUME_ALIGNED(filter + offset, 16); offset += m;
+    scd = ASSUME_ALIGNED(filter + offset, 16); offset += m;
+    phd = ASSUME_ALIGNED(filter + offset, 16); offset += m;
+    spd = ASSUME_ALIGNED(filter + offset, 16);
     // Apply the scale and phase interpolated filter.
     r4 = vdupq_n_f32(0.0f);
     {
         const float32x4_t pf4 = vdupq_n_f32(pf);
-        for(j = 0;j < m;j+=4)
+        for(j = 0;j < m;j+=4,fil++,scd++,phd++,spd++)
         {
             /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
-            const float32x4_t f4 = vmlaq_f32(vmlaq_f32(vld1q_f32(&fil[j]),
-                                                       sf4, vld1q_f32(&scd[j])),
-                pf4, vmlaq_f32(vld1q_f32(&phd[j]),
-                               sf4, vld1q_f32(&spd[j])
-                )
+            const float32x4_t f4 = vmlaq_f32(
+                vmlaq_f32(*fil, sf4, *scd),
+                pf4, vmlaq_f32(*phd, sf4, *spd)
             );
             /* r += f*src */
             r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
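
The unchanged comment /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */ is the
heart of the loop: each coefficient is interpolated from a base filter plus
scale and phase deltas, stored in the table as four m-length runs per phase
index (hence the offset += m stepping above). A scalar sketch of what one lane
of f4 computes, with hypothetical names:

/* Interpolate one bsinc filter coefficient by the scale factor sf
 * and the phase fraction pf (scalar equivalent of one f4 lane). */
static float bsinc_coeff(float fil, float scd, float phd, float spd,
                         float sf, float pf)
{
    return (fil + sf*scd) + pf*(phd + sf*spd);
}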

@@ -19,8 +19,8 @@ const ALfloat *Resample_bsinc_SSE(const InterpState *state, const ALfloat *restr
     const ALfloat *const filter = state->bsinc.filter;
     const __m128 sf4 = _mm_set1_ps(state->bsinc.sf);
     const ALsizei m = state->bsinc.m;
-    const ALfloat *fil, *scd, *phd, *spd;
-    ALsizei pi, i, j;
+    const __m128 *fil, *scd, *phd, *spd;
+    ALsizei pi, i, j, offset;
     ALfloat pf;
     __m128 r4;
@@ -33,30 +33,28 @@ const ALfloat *Resample_bsinc_SSE(const InterpState *state, const ALfloat *restr
     pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
 #undef FRAC_PHASE_BITDIFF
-    fil = ASSUME_ALIGNED(filter + m*pi*4, 16);
-    scd = ASSUME_ALIGNED(fil + m, 16);
-    phd = ASSUME_ALIGNED(scd + m, 16);
-    spd = ASSUME_ALIGNED(phd + m, 16);
+    offset = m*pi*4;
+    fil = ASSUME_ALIGNED(filter + offset, 16); offset += m;
+    scd = ASSUME_ALIGNED(filter + offset, 16); offset += m;
+    phd = ASSUME_ALIGNED(filter + offset, 16); offset += m;
+    spd = ASSUME_ALIGNED(filter + offset, 16);
     // Apply the scale and phase interpolated filter.
     r4 = _mm_setzero_ps();
     {
         const __m128 pf4 = _mm_set1_ps(pf);
-#define LD4(x) _mm_load_ps(x)
-#define ULD4(x) _mm_loadu_ps(x)
 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
-        for(j = 0;j < m;j+=4)
+        for(j = 0;j < m;j+=4,fil++,scd++,phd++,spd++)
         {
             /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
-            const __m128 f4 = MLA4(MLA4(LD4(&fil[j]), sf4, LD4(&scd[j])),
-                pf4, MLA4(LD4(&phd[j]), sf4, LD4(&spd[j]))
+            const __m128 f4 = MLA4(
+                MLA4(*fil, sf4, *scd),
+                pf4, MLA4(*phd, sf4, *spd)
             );
             /* r += f*src */
-            r4 = MLA4(r4, f4, ULD4(&src[j]));
+            r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
         }
 #undef MLA4
-#undef ULD4
-#undef LD4
     }
     r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
     r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
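
The hunk's trailing context lines horizontally sum the four partial sums
accumulated in r4. A standalone sketch (hypothetical helper, assuming plain
SSE) of what those two adds accomplish:

#include <xmmintrin.h>

/* Horizontal sum of r4 = {a, b, c, d}. */
static float hsum_sketch(__m128 r4)
{
    /* add the lane-reversed copy: {a+d, b+c, b+c, a+d} */
    r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
    /* fold the high pair onto the low pair: lane 0 now holds a+b+c+d */
    r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
    return _mm_cvtss_f32(r4);
}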