From 7cedf7a80b2858d8a210234e86984013fdda6505 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Thu, 26 Sep 2024 13:52:12 -0600 Subject: [PATCH] [MultiVarStore] Add NEON codepath Slows down in my testing. --- src/hb-ot-layout-common.hh | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/hb-ot-layout-common.hh b/src/hb-ot-layout-common.hh index eb9276150..accd977d2 100644 --- a/src/hb-ot-layout-common.hh +++ b/src/hb-ot-layout-common.hh @@ -40,6 +40,8 @@ #ifdef __SSE__ #include <xmmintrin.h> +#elif defined(__ARM_NEON) +#include <arm_neon.h> #endif @@ -3154,7 +3156,6 @@ struct MultiVarData { if (scalar == 1.f) { - // SSE version unsigned i = 0; for (; i + 4 <= count; i += 4) { @@ -3167,7 +3168,6 @@ struct MultiVarData } else { - // SSE version unsigned i = 0; __m128 s = _mm_set1_ps (scalar); for (; i + 4 <= count; i += 4) @@ -3181,6 +3181,35 @@ struct MultiVarData } return; } +#elif defined(__ARM_NEON) + { + if (scalar == 1.f) + { + unsigned i = 0; + for (; i + 4 <= count; i += 4) + { + float32x4_t a = vld1q_f32 (in + i); + float32x4_t b = vld1q_f32 (out + i); + vst1q_f32 (out + i, vaddq_f32 (a, b)); + } + for (; i < count; i++) + out[i] += in[i]; + } + else + { + unsigned i = 0; + float32x4_t s = vdupq_n_f32 (scalar); + for (; i + 4 <= count; i += 4) + { + float32x4_t a = vld1q_f32 (in + i); + float32x4_t b = vld1q_f32 (out + i); + vst1q_f32 (out + i, vmlaq_f32 (b, a, s)); + } + for (; i < count; i++) + out[i] += in[i] * scalar; + } + return; + } #endif // Fallback