// Patch overview (commentary only; patch tools skip text before the first "diff --git" header).
//
// Purpose: adds an ARM NEON pre-scan to apply_forward() that looks at buffer glyphs four at a
// time and advances past groups that cannot match the lookup, before the existing per-glyph test.
//
// What the hunks do:
//  * hb-ot-layout-gsubgpos.hh: hb_ot_layout_lookup_accelerator_t::may_have() becomes a template
//    taking `const T &g` and forwards to digest.may_have (g).
//  * hb-ot-layout.cc: adds uint32x4_is_not_zero() (ORs the low and high halves of the vector, then
//    takes a pairwise max, so the result is non-zero iff any lane is non-zero) and the four-wide
//    skip loop in apply_forward(). The loop gathers four codepoints and four masks into local
//    arrays, loads them as vectors, and breaks when the digest may contain some lane AND some
//    lane's mask intersects c->lookup_mask; otherwise it skips the group. The skipped glyphs are
//    then consumed with buffer->next_glyphs (i - buffer->idx).
//  * hb-set-digest.hh: adds a uint32x4_t overload of hb_set_digest_bits_pattern_t::may_have() and
//    a vector mask_for(); hb_set_digest_combiner_t::may_have() becomes a template; the
//    hb_set_digest_t alias lines are replaced.
//
// The vector test is conservative: each digest and the mask test report "any lane non-zero"
// independently, so a group can break out of the skip loop even when no single glyph passes every
// test. The pre-existing scalar check on buffer->cur() that follows still decides per glyph.
//
// NOTE(review): both added `#include` lines carry no header name and the added `template` lines
//   carry no parameter list in this copy; the hb_set_digest_t hunk likewise shows identical -/+
//   lines. Presumably angle-bracketed text (e.g. <arm_neon.h>, template arguments) was lost in
//   extraction -- confirm against the original patch.
// NOTE(review): no architecture guard is visible around the NEON include or intrinsics in these
//   hunks -- confirm how non-ARM builds are handled.
// NOTE(review): uint32x4_is_not_zero() is added twice with the same body (namespace scope in
//   hb-ot-layout.cc, static member in hb_set_digest_bits_pattern_t) -- consider sharing one.
// NOTE(review): the skip loop runs while `i + 4 < buffer->len`, so a final group ending exactly at
//   buffer->len is left to the scalar path; `<=` looks in-bounds since only in[i..i+3] is read --
//   confirm intent.
// NOTE(review): lookup_masksv is rebuilt from c->lookup_mask on every outer-loop iteration; it
//   looks loop-invariant and hoistable -- confirm lookup_mask cannot change during the loop.
// NOTE(review): vector mask_for() passes a uint32x4_t as the shift count of vshlq_u32, which the
//   NEON intrinsic declares as int32x4_t; this may rely on lax vector conversions -- consider
//   vreinterpretq_s32_u32 and confirm with the target compilers.
// NOTE(review): vdupq_n_u32 (mask) and vdupq_n_u32 (1) assume mask_t fits in 32 bits -- confirm
//   against the (not visible) template arguments of hb_set_digest_t.
diff --git a/src/hb-ot-layout-gsubgpos.hh b/src/hb-ot-layout-gsubgpos.hh index aae2bf9ce..7717cadb3 100644 --- a/src/hb-ot-layout-gsubgpos.hh +++ b/src/hb-ot-layout-gsubgpos.hh @@ -4085,7 +4085,8 @@ struct hb_ot_layout_lookup_accelerator_t return thiz; } - bool may_have (hb_codepoint_t g) const + template + bool may_have (const T &g) const { return digest.may_have (g); } bool apply (hb_ot_apply_context_t *c, unsigned subtables_count, bool use_cache) const diff --git a/src/hb-ot-layout.cc b/src/hb-ot-layout.cc index c66ee8cfd..97f3473af 100644 --- a/src/hb-ot-layout.cc +++ b/src/hb-ot-layout.cc @@ -30,6 +30,8 @@ #include "hb.hh" +#include + #ifndef HB_NO_OT_LAYOUT #ifdef HB_NO_OT_TAG @@ -1852,6 +1854,12 @@ struct GPOSProxy const GPOS::accelerator_t &accel; }; +inline bool uint32x4_is_not_zero(uint32x4_t v) +{ + // https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics + uint32x2_t tmp = vorr_u32(vget_low_u32(v), vget_high_u32(v)); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0); +} static inline bool apply_forward (OT::hb_ot_apply_context_t *c, @@ -1864,6 +1872,25 @@ apply_forward (OT::hb_ot_apply_context_t *c, hb_buffer_t *buffer = c->buffer; while (buffer->idx < buffer->len && buffer->successful) { + uint32x4_t lookup_masksv = vdupq_n_u32 (c->lookup_mask); + const auto *in = buffer->info; + unsigned i = buffer->idx; + while (i + 4 < buffer->len) + { + const uint32_t codepoints[4] = {in[i+0].codepoint, in[i+1].codepoint, in[i+2].codepoint, in[i+3].codepoint}; + uint32x4_t codepointsv = vld1q_u32 (codepoints); + + const uint32_t masks[4] = {in[i+0].mask, in[i+1].mask, in[i+2].mask, in[i+3].mask}; + uint32x4_t masksv = vld1q_u32 (masks); + + if (accel.may_have (codepointsv) && + uint32x4_is_not_zero (vandq_u32 (lookup_masksv, masksv))) + break; + + i += 4; + } + (void) buffer->next_glyphs (i - buffer->idx); + bool applied = false; if (accel.digest.may_have (buffer->cur().codepoint) && 
(buffer->cur().mask & c->lookup_mask) && diff --git a/src/hb-set-digest.hh b/src/hb-set-digest.hh index dab713729..2a2749edc 100644 --- a/src/hb-set-digest.hh +++ b/src/hb-set-digest.hh @@ -30,6 +30,8 @@ #include "hb.hh" #include "hb-machinery.hh" +#include + /* * The set-digests here implement various "filters" that support * "approximate member query". Conceptually these are like Bloom @@ -124,10 +126,34 @@ struct hb_set_digest_bits_pattern_t bool may_have (hb_codepoint_t g) const { return mask & mask_for (g); } + static inline bool uint32x4_is_not_zero(uint32x4_t v) + { + // https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics + uint32x2_t tmp = vorr_u32(vget_low_u32(v), vget_high_u32(v)); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0); + } + + bool may_have (const uint32x4_t &g) const + { return uint32x4_is_not_zero (vandq_u32 (vdupq_n_u32 (mask), mask_for (g))); } + private: static mask_t mask_for (hb_codepoint_t g) { return ((mask_t) 1) << ((g >> shift) & (mask_bits - 1)); } + + template + static uint32x4_t shifted (uint32x4_t v) { return v; } + template + static uint32x4_t shifted (uint32x4_t v) { return vshrq_n_u32 (v, shift); } + + static uint32x4_t mask_for (const uint32x4_t &g) + { + uint32x4_t a = vandq_u32 (shifted (g), vdupq_n_u32 (mask_bits - 1)); + return vshlq_u32 (vdupq_n_u32 (1), a); + } + mask_t mask; }; @@ -179,7 +205,8 @@ struct hb_set_digest_combiner_t return head.may_have (o.head) && tail.may_have (o.tail); } - bool may_have (hb_codepoint_t g) const + template + bool may_have (const T &g) const { return head.may_have (g) && tail.may_have (g); } @@ -200,11 +227,11 @@ struct hb_set_digest_combiner_t using hb_set_digest_t = hb_set_digest_combiner_t < - hb_set_digest_bits_pattern_t, + hb_set_digest_bits_pattern_t, hb_set_digest_combiner_t < - hb_set_digest_bits_pattern_t, - hb_set_digest_bits_pattern_t + hb_set_digest_bits_pattern_t, + hb_set_digest_bits_pattern_t > > ;