From 0eaf833366ea049b6b6b578ded0e25168ea26f6a Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Sun, 29 Sep 2024 09:30:11 -0600 Subject: [PATCH] [MultiVarStore] Reinstate SSE / NEON paths Code is both smaller and slightly faster. --- src/hb-ot-layout-common.hh | 55 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/src/hb-ot-layout-common.hh b/src/hb-ot-layout-common.hh index 0aee4bb38..82f88086c 100644 --- a/src/hb-ot-layout-common.hh +++ b/src/hb-ot-layout-common.hh @@ -38,6 +38,12 @@ #include "OT/Layout/Common/Coverage.hh" #include "OT/Layout/types.hh" +#ifdef __SSE__ +#include +#elif defined(__ARM_NEON) +#include +#endif + // TODO(garretrieger): cleanup these after migration. using OT::Layout::Common::Coverage; @@ -3147,11 +3153,56 @@ struct MultiVarData values_iter += count; if (scalar == 1.f) - for (unsigned i = 0; i < count; i++) + { + unsigned i = 0; + for (; i + 4 <= count; i += 4) + { +#ifdef __SSE__ + __m128 a = _mm_loadu_ps (in + i); + __m128 b = _mm_loadu_ps (out + i); + _mm_storeu_ps (out + i, _mm_add_ps (a, b)); +#elif defined(__ARM_NEON) + float32x4_t a = vld1q_f32 (in + i); + float32x4_t b = vld1q_f32 (out + i); + vst1q_f32 (out + i, vaddq_f32 (a, b)); +#else + out[i + 0] += in[i + 0]; + out[i + 1] += in[i + 1]; + out[i + 2] += in[i + 2]; + out[i + 3] += in[i + 3]; +#endif + } + for (; i < count; i++) out[i] += in[i]; + } else - for (unsigned i = 0; i < count; i++) + { + unsigned i = 0; +#ifdef __SSE__ + __m128 s = _mm_set1_ps (scalar); +#elif defined(__ARM_NEON) + float32x4_t s = vdupq_n_f32 (scalar); +#endif + for (; i + 4 <= count; i += 4) + { +#ifdef __SSE__ + __m128 a = _mm_loadu_ps (in + i); + __m128 b = _mm_loadu_ps (out + i); + _mm_storeu_ps (out + i, _mm_add_ps (_mm_mul_ps (a, s), b)); +#elif defined(__ARM_NEON) + float32x4_t a = vld1q_f32 (in + i); + float32x4_t b = vld1q_f32 (out + i); + vst1q_f32 (out + i, vmlaq_f32 (b, a, s)); +#else + out[i + 0] += in[i + 0] * scalar; + out[i + 1] += in[i + 1] * scalar; + out[i + 2] += in[i + 2] * scalar; + out[i + 3] += in[i + 3] * scalar; +#endif + } + for (; i < count; i++) out[i] += in[i] * scalar; + } } template