diff --git a/src/hb-ot-layout.cc b/src/hb-ot-layout.cc
index 97f3473af..5d6f0cc13 100644
--- a/src/hb-ot-layout.cc
+++ b/src/hb-ot-layout.cc
@@ -30,7 +30,9 @@
 
 #include "hb.hh"
 
-#include <arm_neon.h>
+#pragma GCC target("avx2")
+
+#include <immintrin.h>
 
 #ifndef HB_NO_OT_LAYOUT
 
@@ -1854,11 +1856,9 @@ struct GPOSProxy
   const GPOS::accelerator_t &accel;
 };
 
-inline bool uint32x4_is_not_zero(uint32x4_t v)
+static inline bool m256_is_not_zero(__m256i v)
 {
-  // https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics
-  uint32x2_t tmp = vorr_u32(vget_low_u32(v), vget_high_u32(v));
-  return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
+  return !_mm256_testz_si256 (v, v);
 }
 
 static inline bool
@@ -1868,26 +1868,25 @@ apply_forward (OT::hb_ot_apply_context_t *c,
 {
   bool use_cache = accel.cache_enter (c);
 
+  static const __m256i indices = _mm256_set_epi32 (35, 30, 25, 20, 15, 10, 5, 0);
+
   bool ret = false;
   hb_buffer_t *buffer = c->buffer;
   while (buffer->idx < buffer->len && buffer->successful)
   {
-    uint32x4_t lookup_masksv = vdupq_n_u32 (c->lookup_mask);
+    __m256i lookup_masksv = _mm256_set1_epi32 (c->lookup_mask);
     const auto *in = buffer->info;
     unsigned i = buffer->idx;
-    while (i + 4 < buffer->len)
+    while (i + 8 < buffer->len)
     {
-      const uint32_t codepoints[4] = {in[i+0].codepoint, in[i+1].codepoint, in[i+2].codepoint, in[i+3].codepoint};
-      uint32x4_t codepointsv = vld1q_u32 (codepoints);
-
-      const uint32_t masks[4] = {in[i+0].mask, in[i+1].mask, in[i+2].mask, in[i+3].mask};
-      uint32x4_t masksv = vld1q_u32 (masks);
+      __m256i codepointsv = _mm256_i32gather_epi32 ((const int *) &in[i].codepoint, indices, 4);
+      __m256i masksv = _mm256_i32gather_epi32 ((const int *) &in[i].mask, indices, 4);
 
       if (accel.may_have (codepointsv) &&
-          uint32x4_is_not_zero (vandq_u32 (lookup_masksv, masksv)))
+          m256_is_not_zero (_mm256_and_si256 (lookup_masksv, masksv)))
         break;
-      i += 4;
+      i += 8;
     }
     (void) buffer->next_glyphs (i - buffer->idx);
diff --git a/src/hb-set-digest.hh b/src/hb-set-digest.hh
index 2a2749edc..78ace6157 100644
--- a/src/hb-set-digest.hh
+++ b/src/hb-set-digest.hh
@@ -30,7 +30,9 @@
 #include "hb.hh"
 #include "hb-machinery.hh"
 
-#include <arm_neon.h>
+#pragma GCC target("avx2")
+
+#include <immintrin.h>
 
 /*
  * The set-digests here implement various "filters" that support
@@ -126,15 +128,13 @@ struct hb_set_digest_bits_pattern_t
   bool may_have (hb_codepoint_t g) const
   { return mask & mask_for (g); }
 
-  static inline bool uint32x4_is_not_zero(uint32x4_t v)
+  static inline bool m256_is_not_zero(__m256i v)
   {
-    // https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics
-    uint32x2_t tmp = vorr_u32(vget_low_u32(v), vget_high_u32(v));
-    return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
+    return !_mm256_testz_si256 (v, v);
   }
 
-  bool may_have (const uint32x4_t &g) const
-  { return uint32x4_is_not_zero (vandq_u32 (vdupq_n_u32 (mask), mask_for (g))); }
+  bool may_have (const __m256i &g) const
+  { return m256_is_not_zero (_mm256_and_si256 (_mm256_set1_epi32 (mask), mask_for (g))); }
 
   private:
 
@@ -143,15 +143,15 @@ struct hb_set_digest_bits_pattern_t
 
   template
-  static uint32x4_t shifted (uint32x4_t v) { return v; }
+  static __m256i shifted (__m256i v) { return v; }
   template
-  static uint32x4_t shifted (uint32x4_t v) { return vshrq_n_u32 (v, shift); }
+  static __m256i shifted (__m256i v) { return _mm256_srli_epi32 (v, shift); }
 
-  static uint32x4_t mask_for (const uint32x4_t &g)
+  static __m256i mask_for (const __m256i &g)
   {
-    uint32x4_t a = vandq_u32 (shifted (g), vdupq_n_u32 (mask_bits - 1));
-    return vshlq_u32 (vdupq_n_u32 (1), a);
+    __m256i a = _mm256_and_si256 (shifted (g), _mm256_set1_epi32 (mask_bits - 1));
+    return _mm256_sllv_epi32 (_mm256_set1_epi32 (1), a);
   }
 
   mask_t mask;
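
For context on the two AVX2 tricks in the patch, here is a minimal standalone sketch, not HarfBuzz code: glyph_info_stub and the 0x2 lookup mask are invented for illustration, and the assumption is that hb_glyph_info_t holds five 32-bit fields (20 bytes per glyph). It shows why the gather indices are multiples of 5: with a 4-byte scale they step over whole records, so one _mm256_i32gather_epi32 fetches the codepoint (or mask) of eight consecutive glyphs, and _mm256_testz_si256 then tests all eight mask ANDs for zero in a single instruction. Build with: g++ -O2 -mavx2 gather_demo.cc

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for hb_glyph_info_t: five 32-bit fields,
// so consecutive records are 20 bytes (five ints) apart.
struct glyph_info_stub
{
  uint32_t codepoint;
  uint32_t mask;
  uint32_t cluster;
  uint32_t var1;
  uint32_t var2;
};

int main ()
{
  glyph_info_stub in[8];
  for (unsigned i = 0; i < 8; i++)
  {
    in[i].codepoint = 0x41 + i;               /* arbitrary test glyphs */
    in[i].mask      = (i == 6) ? 0x2 : 0x0;   /* one glyph carries the bit */
  }

  /* Element offsets 0, 5, ..., 35 with a scale of 4 bytes step over whole
   * 20-byte records, so a single gather pulls the same field from eight
   * consecutive glyphs. */
  const __m256i indices = _mm256_set_epi32 (35, 30, 25, 20, 15, 10, 5, 0);
  __m256i codepointsv = _mm256_i32gather_epi32 ((const int *) &in[0].codepoint, indices, 4);
  __m256i masksv      = _mm256_i32gather_epi32 ((const int *) &in[0].mask, indices, 4);

  /* _mm256_testz_si256 returns 1 iff (a & b) is all zero, so negating it
   * answers "does any of the eight glyphs carry a bit of the lookup mask?" */
  __m256i lookup_maskv = _mm256_set1_epi32 (0x2);   /* hypothetical lookup mask */
  int any_match = !_mm256_testz_si256 (lookup_maskv, masksv);

  uint32_t out[8];
  _mm256_storeu_si256 ((__m256i *) out, codepointsv);
  printf ("gathered codepoints %u..%u, any mask match: %d\n", out[0], out[7], any_match);
  return 0;
}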