[MultiVarStore] Add NEON codepath

This change slows things down in my testing.
This commit is contained in:
Behdad Esfahbod 2024-09-26 13:52:12 -06:00
parent 0c215616d8
commit 7cedf7a80b

View file

@ -40,6 +40,8 @@
#ifdef __SSE__
#include <xmmintrin.h>
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#endif
@ -3154,7 +3156,6 @@ struct MultiVarData
{
if (scalar == 1.f)
{
// SSE version
unsigned i = 0;
for (; i + 4 <= count; i += 4)
{
@ -3167,7 +3168,6 @@ struct MultiVarData
}
else
{
// SSE version
unsigned i = 0;
__m128 s = _mm_set1_ps (scalar);
for (; i + 4 <= count; i += 4)
@ -3181,6 +3181,35 @@ struct MultiVarData
}
return;
}
#elif defined(__ARM_NEON)
{
  // NEON version: accumulate `in` (optionally scaled by `scalar`) into `out`,
  // four floats per iteration, then finish the remaining elements one by one.
  unsigned i = 0;
  if (scalar != 1.f)
  {
    // General case: out[k] += in[k] * scalar.
    const float32x4_t factor = vdupq_n_f32 (scalar);
    while (i + 4 <= count)
    {
      const float32x4_t src = vld1q_f32 (in + i);
      const float32x4_t acc = vld1q_f32 (out + i);
      vst1q_f32 (out + i, vmlaq_f32 (acc, src, factor));
      i += 4;
    }
    while (i < count)
    {
      out[i] += in[i] * scalar;
      i++;
    }
  }
  else
  {
    // Unit scalar: the multiply is skipped, plain addition only.
    while (i + 4 <= count)
    {
      const float32x4_t src = vld1q_f32 (in + i);
      const float32x4_t acc = vld1q_f32 (out + i);
      vst1q_f32 (out + i, vaddq_f32 (src, acc));
      i += 4;
    }
    while (i < count)
    {
      out[i] += in[i];
      i++;
    }
  }
  return;
}
#endif
// Fallback