diff --git a/src/hb-ot-layout-gsubgpos.hh b/src/hb-ot-layout-gsubgpos.hh
index aae2bf9ce..7717cadb3 100644
--- a/src/hb-ot-layout-gsubgpos.hh
+++ b/src/hb-ot-layout-gsubgpos.hh
@@ -4085,7 +4085,8 @@ struct hb_ot_layout_lookup_accelerator_t
     return thiz;
   }
 
-  bool may_have (hb_codepoint_t g) const
+  template <typename T> /* T is whatever key type the digest's may_have() overloads accept. */
+  bool may_have (const T &g) const /* Forwards g unchanged to digest.may_have(). */
   { return digest.may_have (g); }
 
   bool apply (hb_ot_apply_context_t *c, unsigned subtables_count, bool use_cache) const
diff --git a/src/hb-ot-layout.cc b/src/hb-ot-layout.cc
index c66ee8cfd..97f3473af 100644
--- a/src/hb-ot-layout.cc
+++ b/src/hb-ot-layout.cc
@@ -30,6 +30,8 @@
 
 #include "hb.hh"
 
+#include <arm_neon.h>
+
 #ifndef HB_NO_OT_LAYOUT
 
 #ifdef HB_NO_OT_TAG
@@ -1852,6 +1854,12 @@ struct GPOSProxy
   const GPOS::accelerator_t &accel;
 };
 
+static inline bool uint32x4_is_not_zero (uint32x4_t v) /* True iff any of the four 32-bit lanes of v is non-zero. */
+{ /* NOTE(review): NEON-only; <arm_neon.h> is included unconditionally above, so non-NEON builds need a guard and a scalar path. */
+  // https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics
+  uint32x2_t halves = vorr_u32 (vget_low_u32 (v), vget_high_u32 (v)); /* OR the low and high 64-bit halves together. */
+  return vget_lane_u32 (vpmax_u32 (halves, halves), 0) != 0; /* Pairwise max leaves max(halves[0], halves[1]) in lane 0. */
+}
 
 static inline bool
 apply_forward (OT::hb_ot_apply_context_t *c,
@@ -1864,6 +1872,25 @@ apply_forward (OT::hb_ot_apply_context_t *c,
   hb_buffer_t *buffer = c->buffer;
   while (buffer->idx < buffer->len && buffer->successful)
   {
+    uint32x4_t lookup_masksv = vdupq_n_u32 (c->lookup_mask);
+    const auto *in = buffer->info;
+    unsigned i = buffer->idx;
+    while (i + 4 < buffer->len)
+    {
+      const uint32_t codepoints[4] = {in[i+0].codepoint, in[i+1].codepoint, in[i+2].codepoint, in[i+3].codepoint};
+      uint32x4_t codepointsv = vld1q_u32 (codepoints);
+
+      const uint32_t masks[4] = {in[i+0].mask, in[i+1].mask, in[i+2].mask, in[i+3].mask};
+      uint32x4_t masksv = vld1q_u32 (masks);
+
+      if (accel.may_have (codepointsv) &&
+          uint32x4_is_not_zero (vandq_u32 (lookup_masksv, masksv)))
+	break;
+
+      i += 4;
+    }
+    (void) buffer->next_glyphs (i - buffer->idx);
+
     bool applied = false;
     if (accel.digest.may_have (buffer->cur().codepoint) &&
 	(buffer->cur().mask & c->lookup_mask) &&
diff --git a/src/hb-set-digest.hh b/src/hb-set-digest.hh
index dab713729..2a2749edc 100644
--- a/src/hb-set-digest.hh
+++ b/src/hb-set-digest.hh
@@ -30,6 +30,8 @@
 #include "hb.hh"
 #include "hb-machinery.hh"
 
+#include <arm_neon.h>
+
 /*
  * The set-digests here implement various "filters" that support
  * "approximate member query".  Conceptually these are like Bloom
@@ -124,10 +126,34 @@ struct hb_set_digest_bits_pattern_t
   bool may_have (hb_codepoint_t g) const
   { return mask & mask_for (g); }
 
+  static inline bool uint32x4_is_not_zero (uint32x4_t v) /* Reports whether at least one 32-bit lane of v is non-zero. */
+  {
+    // https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics
+    const uint32x2_t folded = vorr_u32 (vget_low_u32 (v), vget_high_u32 (v)); /* Fold four lanes into two by OR-ing the halves. */
+    return vget_lane_u32 (vpmax_u32 (folded, folded), 0) != 0; /* Lane 0 of the pairwise max is the larger of the two folded lanes. */
+  }
+
+  bool may_have (const uint32x4_t &g) const /* True if the bit for at least one of the four lanes of g is set in mask; a lane-agnostic "any" test. */
+  { static_assert (sizeof (mask_t) <= sizeof (uint32_t), "NEON may_have() needs mask_t to fit in a 32-bit lane"); return uint32x4_is_not_zero (vandq_u32 (vdupq_n_u32 (mask), mask_for (g))); }
+
   private:
 
   static mask_t mask_for (hb_codepoint_t g)
   { return ((mask_t) 1) << ((g >> shift) & (mask_bits - 1)); }
+
+  template <int s = shift, hb_enable_if (s == 0)>
+  static uint32x4_t shifted (uint32x4_t v) /* shift == 0: return v untouched; vshrq_n_u32 does not accept a zero immediate. */
+  { return v; }
+  template <int s = shift, hb_enable_if (s != 0)>
+  static uint32x4_t shifted (uint32x4_t v) /* shift != 0: logical right shift of every lane by the compile-time `shift`. */
+  { return vshrq_n_u32 (v, shift); }
+
+  static uint32x4_t mask_for (const uint32x4_t &g) /* Per lane: 1 << ((g >> shift) & (mask_bits - 1)), mirroring the scalar mask_for(). */
+  {
+    uint32x4_t bit_index = vandq_u32 (shifted (g), vdupq_n_u32 (mask_bits - 1)); /* Bit position selected by each lane. */
+    return vshlq_u32 (vdupq_n_u32 (1), vreinterpretq_s32_u32 (bit_index)); /* vshlq_u32 takes its per-lane shift counts as int32x4_t. */
+  }
+
   mask_t mask;
 };
 
@@ -179,7 +205,8 @@ struct hb_set_digest_combiner_t
     return head.may_have (o.head) && tail.may_have (o.tail);
   }
 
-  bool may_have (hb_codepoint_t g) const
+  template <typename T> /* T: any key type that both head.may_have() and tail.may_have() accept. */
+  bool may_have (const T &g) const /* True only when head and tail both report a possible match for g. */
   {
     return head.may_have (g) && tail.may_have (g);
   }
@@ -200,11 +227,11 @@ struct hb_set_digest_combiner_t
 using hb_set_digest_t =
   hb_set_digest_combiner_t
   <
-    hb_set_digest_bits_pattern_t<unsigned long, 4>,
+    hb_set_digest_bits_pattern_t<uint32_t, 4>, /* uint32_t matches the 32-bit lanes used by the uint32x4_t may_have() overload. */
     hb_set_digest_combiner_t
     <
-      hb_set_digest_bits_pattern_t<unsigned long, 0>,
-      hb_set_digest_bits_pattern_t<unsigned long, 9>
+      hb_set_digest_bits_pattern_t<uint32_t, 0>, /* NOTE(review): where unsigned long was 64 bits this presumably halves each mask's width; confirm the extra false positives are acceptable. */
+      hb_set_digest_bits_pattern_t<uint32_t, 9>
     >
   >
 ;