mirror of
https://github.com/harfbuzz/harfbuzz.git
synced 2025-04-04 13:05:04 +00:00
[layout] Use NEON SIMD instructions for apply_forward()
To skip four glyphs at a time. Unfortunately slows down benchmark-shape by up to 25% for Amiri, instead of speeding anything up. This is due to the overhead of switching back and forth between the ARM and NEON processors. To be experimented with: Same architecture but with AVX on x86_64 to skip 8 glyphs at a time. https://github.com/harfbuzz/harfbuzz/issues/566
This commit is contained in:
parent
a10fad7cc2
commit
641ec17a1d
3 changed files with 65 additions and 2 deletions
|
@ -4085,7 +4085,8 @@ struct hb_ot_layout_lookup_accelerator_t
|
|||
return thiz;
|
||||
}
|
||||
|
||||
bool may_have (hb_codepoint_t g) const
|
||||
template <typename T>
|
||||
bool may_have (const T &g) const
|
||||
{ return digest.may_have (g); }
|
||||
|
||||
bool apply (hb_ot_apply_context_t *c, unsigned subtables_count, bool use_cache) const
|
||||
|
|
|
@ -30,6 +30,8 @@
|
|||
|
||||
#include "hb.hh"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#ifndef HB_NO_OT_LAYOUT
|
||||
|
||||
#ifdef HB_NO_OT_TAG
|
||||
|
@ -1852,6 +1854,12 @@ struct GPOSProxy
|
|||
const GPOS::accelerator_t &accel;
|
||||
};
|
||||
|
||||
/* Return true iff any 32-bit lane of `v` is non-zero.
 * `static` added: this is a file-local helper and must not get external
 * linkage (an identically-named helper exists in the set-digest header).
 * https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics */
static inline bool uint32x4_is_not_zero (uint32x4_t v)
{
  /* OR the two 64-bit halves together, then pairwise-max; any non-zero
   * lane survives into lane 0 of the result. */
  uint32x2_t tmp = vorr_u32 (vget_low_u32 (v), vget_high_u32 (v));
  return vget_lane_u32 (vpmax_u32 (tmp, tmp), 0) != 0;
}
|
||||
|
||||
static inline bool
|
||||
apply_forward (OT::hb_ot_apply_context_t *c,
|
||||
|
@ -1864,6 +1872,25 @@ apply_forward (OT::hb_ot_apply_context_t *c,
|
|||
hb_buffer_t *buffer = c->buffer;
|
||||
while (buffer->idx < buffer->len && buffer->successful)
|
||||
{
|
||||
uint32x4_t lookup_masksv = vdupq_n_u32 (c->lookup_mask);
|
||||
const auto *in = buffer->info;
|
||||
unsigned i = buffer->idx;
|
||||
while (i + 4 < buffer->len)
|
||||
{
|
||||
const uint32_t codepoints[4] = {in[i+0].codepoint, in[i+1].codepoint, in[i+2].codepoint, in[i+3].codepoint};
|
||||
uint32x4_t codepointsv = vld1q_u32 (codepoints);
|
||||
|
||||
const uint32_t masks[4] = {in[i+0].mask, in[i+1].mask, in[i+2].mask, in[i+3].mask};
|
||||
uint32x4_t masksv = vld1q_u32 (masks);
|
||||
|
||||
if (accel.may_have (codepointsv) &&
|
||||
uint32x4_is_not_zero (vandq_u32 (lookup_masksv, masksv)))
|
||||
break;
|
||||
|
||||
i += 4;
|
||||
}
|
||||
(void) buffer->next_glyphs (i - buffer->idx);
|
||||
|
||||
bool applied = false;
|
||||
if (accel.digest.may_have (buffer->cur().codepoint) &&
|
||||
(buffer->cur().mask & c->lookup_mask) &&
|
||||
|
|
|
@ -30,6 +30,8 @@
|
|||
#include "hb.hh"
|
||||
#include "hb-machinery.hh"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
/*
|
||||
* The set-digests here implement various "filters" that support
|
||||
* "approximate member query". Conceptually these are like Bloom
|
||||
|
@ -124,10 +126,34 @@ struct hb_set_digest_bits_pattern_t
|
|||
bool may_have (hb_codepoint_t g) const
|
||||
{ return mask & mask_for (g); }
|
||||
|
||||
/* True iff any 32-bit lane of `v` is non-zero.
 * https://stackoverflow.com/questions/15389539/fastest-way-to-test-a-128-bit-neon-register-for-a-value-of-0-using-intrinsics */
static inline bool uint32x4_is_not_zero (uint32x4_t v)
{
  /* Fold 128 bits down to 64 (OR the halves), then to 32 (pairwise max);
   * lane 0 is then non-zero iff any input lane was. */
  uint32x2_t folded  = vorr_u32 (vget_low_u32 (v), vget_high_u32 (v));
  uint32x2_t reduced = vpmax_u32 (folded, folded);
  return vget_lane_u32 (reduced, 0) != 0;
}
|
||||
|
||||
/* SIMD approximate membership test: four codepoints at once.
 * Returns true iff ANY of the four lanes of `g` may be in the set
 * (false positives possible, no false negatives). */
bool may_have (const uint32x4_t &g) const
{ return uint32x4_is_not_zero (vandq_u32 (vdupq_n_u32 (mask), mask_for (g))); }
|
||||
|
||||
private:
|
||||
|
||||
/* Digest bit for codepoint `g`: selects one of `mask_bits` bits using
 * the bits of `g` starting at `shift`. */
static mask_t mask_for (hb_codepoint_t g)
{ return ((mask_t) 1) << ((g >> shift) & (mask_bits - 1)); }
|
||||
|
||||
/* Per-lane right-shift of `v` by the compile-time `shift` amount.
 * vshrq_n_u32 requires an immediate shift count >= 1, so the shift==0
 * case is dispatched to an identity overload via hb_enable_if (SFINAE). */
template <int u = shift,
	  hb_enable_if (u == 0)>
static uint32x4_t shifted (uint32x4_t v) { return v; }
template <int u = shift,
	  hb_enable_if (u != 0)>
static uint32x4_t shifted (uint32x4_t v) { return vshrq_n_u32 (v, shift); }
|
||||
|
||||
/* SIMD variant of mask_for: computes the digest bit for each of the
 * four codepoint lanes of `g` in parallel. */
static uint32x4_t mask_for (const uint32x4_t &g)
{
  /* Per lane: 1 << ((g >> shift) & (mask_bits - 1)). */
  uint32x4_t bit_index = vandq_u32 (shifted (g), vdupq_n_u32 (mask_bits - 1));
  uint32x4_t ones = vdupq_n_u32 (1);
  return vshlq_u32 (ones, bit_index);
}
|
||||
|
||||
mask_t mask;
|
||||
};
|
||||
|
||||
|
@ -179,7 +205,8 @@ struct hb_set_digest_combiner_t
|
|||
return head.may_have (o.head) && tail.may_have (o.tail);
|
||||
}
|
||||
|
||||
bool may_have (hb_codepoint_t g) const
|
||||
template <typename T>
|
||||
bool may_have (const T &g) const
|
||||
{
|
||||
return head.may_have (g) && tail.may_have (g);
|
||||
}
|
||||
|
@ -200,11 +227,19 @@ struct hb_set_digest_combiner_t
|
|||
using hb_set_digest_t =
|
||||
hb_set_digest_combiner_t
|
||||
<
|
||||
<<<<<<< HEAD
|
||||
hb_set_digest_bits_pattern_t<unsigned long, 4>,
|
||||
hb_set_digest_combiner_t
|
||||
<
|
||||
hb_set_digest_bits_pattern_t<unsigned long, 0>,
|
||||
hb_set_digest_bits_pattern_t<unsigned long, 9>
|
||||
=======
|
||||
hb_set_digest_lowest_bits_t<uint32_t, 4>,
|
||||
hb_set_digest_combiner_t
|
||||
<
|
||||
hb_set_digest_lowest_bits_t<uint32_t, 0>,
|
||||
hb_set_digest_lowest_bits_t<uint32_t, 9>
|
||||
>>>>>>> 65ed2e570 ([layout] Use NEON SIMD instructions for apply_forward())
|
||||
>
|
||||
>
|
||||
;
|
||||
|
|
Loading…
Add table
Reference in a new issue