mirror of
https://github.com/harfbuzz/harfbuzz.git
synced 2025-04-05 05:25:05 +00:00
[MultiVarStore] Add NEON codepath
Slows down in my testing.
This commit is contained in:
parent
0c215616d8
commit
7cedf7a80b
1 changed file with 31 additions and 2 deletions
|
@@ -40,6 +40,8 @@
|
|||
|
||||
#ifdef __SSE__
|
||||
#include <xmmintrin.h>
|
||||
#elif defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
|
||||
|
@@ -3154,7 +3156,6 @@ struct MultiVarData
|
|||
{
|
||||
if (scalar == 1.f)
|
||||
{
|
||||
// SSE version
|
||||
unsigned i = 0;
|
||||
for (; i + 4 <= count; i += 4)
|
||||
{
|
||||
|
@@ -3167,7 +3168,6 @@ struct MultiVarData
|
|||
}
|
||||
else
|
||||
{
|
||||
// SSE version
|
||||
unsigned i = 0;
|
||||
__m128 s = _mm_set1_ps (scalar);
|
||||
for (; i + 4 <= count; i += 4)
|
||||
|
@@ -3181,6 +3181,35 @@ struct MultiVarData
|
|||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(__ARM_NEON)
|
||||
{
|
||||
if (scalar == 1.f)
|
||||
{
|
||||
unsigned i = 0;
|
||||
for (; i + 4 <= count; i += 4)
|
||||
{
|
||||
float32x4_t a = vld1q_f32 (in + i);
|
||||
float32x4_t b = vld1q_f32 (out + i);
|
||||
vst1q_f32 (out + i, vaddq_f32 (a, b));
|
||||
}
|
||||
for (; i < count; i++)
|
||||
out[i] += in[i];
|
||||
}
|
||||
else
|
||||
{
|
||||
unsigned i = 0;
|
||||
float32x4_t s = vdupq_n_f32 (scalar);
|
||||
for (; i + 4 <= count; i += 4)
|
||||
{
|
||||
float32x4_t a = vld1q_f32 (in + i);
|
||||
float32x4_t b = vld1q_f32 (out + i);
|
||||
vst1q_f32 (out + i, vmlaq_f32 (b, a, s));
|
||||
}
|
||||
for (; i < count; i++)
|
||||
out[i] += in[i] * scalar;
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Fallback
|
||||
|
|
Loading…
Add table
Reference in a new issue