[subset] close unicodes over bidi mirror variants during subsetting.
Some checks failed
Scorecard supply-chain security / Scorecard analysis (push) Failing after 3s
arm / arm-none-eabi (push) Has been cancelled
configs-ci / build (push) Has been cancelled
macos-ci / build (push) Has been cancelled
fontations / build (push) Has been cancelled
linux-ci / build (push) Has been cancelled
msvc / msvc-2019-amd64 (push) Has been cancelled
msvc / msvc-2019-x86 (push) Has been cancelled
msys2 / CLANG64 (push) Has been cancelled
msys2 / MINGW32 (push) Has been cancelled
msys2 / MINGW64 (push) Has been cancelled

Fixes #5281. Does the closure by default, but I've introduced a new flag and option to disable this behaviour since some users may want to get the minimal set if they know they don't need the mirrored variants.
This commit is contained in:
Garret Rieger 2025-04-11 23:34:06 +00:00
parent 5afbd187b6
commit efcb7d3de1
12 changed files with 63 additions and 12 deletions

View file

@ -29,6 +29,8 @@
#include "hb-map.hh"
#include "hb-multimap.hh"
#include "hb-set.hh"
#include "hb-subset.h"
#include "hb-unicode.h"
#include "hb-ot-cmap-table.hh"
#include "hb-ot-glyf-table.hh"
@ -209,15 +211,46 @@ _fill_unicode_and_glyph_map(hb_subset_plan_t *plan,
_fill_unicode_and_glyph_map(plan, unicode_iterator, unicode_to_gid_for_iterator, unicode_to_gid_for_iterator);
}
/*
 * Computes the set of unicode codepoints to subset for, starting from the
 * caller supplied set and expanding it with codepoints reachable from it.
 *
 * The only expansion currently performed is the bidi one: for every input
 * codepoint whose mirrored counterpart differs from the codepoint itself,
 * that counterpart is added to the result.
 *
 * @unicodes: the input codepoints; not modified, the result is a copy.
 * @bidi_closure: when false no expansion happens and a plain copy of
 *                @unicodes is returned.
 */
static hb_set_t
_unicode_closure (const hb_set_t* unicodes, bool bidi_closure)
{
  // TODO: we may want to also consider pulling in reachable unicode composition and decompositions.
  // see: https://github.com/harfbuzz/harfbuzz/issues/2283
  hb_set_t closure = *unicodes;

  // Inverted sets are deliberately left untouched: the caller is asking to
  // exclude specific codepoints, and everything else is already included.
  if (bidi_closure && !closure.is_inverted ())
  {
    auto ufuncs = hb_unicode_funcs_get_default ();

    // Walk the original input rather than `closure` so that the set being
    // iterated is never the one being grown.
    for (hb_codepoint_t codepoint : *unicodes)
    {
      hb_codepoint_t mirrored = hb_unicode_mirroring (ufuncs, codepoint);
      if (unlikely (mirrored != codepoint))
        closure.add (mirrored);
    }
  }

  return closure;
}
static void
_populate_unicodes_to_retain (const hb_set_t *unicodes,
_populate_unicodes_to_retain (const hb_set_t *unicodes_in,
const hb_set_t *glyphs,
hb_subset_plan_t *plan)
{
hb_set_t unicodes = _unicode_closure(unicodes_in,
!(plan->flags & HB_SUBSET_FLAGS_NO_BIDI_CLOSURE));
OT::cmap::accelerator_t cmap (plan->source);
unsigned size_threshold = plan->source->get_num_glyphs ();
if (glyphs->is_empty () && unicodes->get_population () < size_threshold)
if (glyphs->is_empty () && unicodes.get_population () < size_threshold)
{
const hb_map_t* unicode_to_gid = nullptr;
@ -227,9 +260,9 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
// This approach to collection is faster, but can only be used if glyphs
// are not being explicitly added to the subset and the input unicodes set is
// not excessively large (eg. an inverted set).
plan->unicode_to_new_gid_list.alloc (unicodes->get_population ());
plan->unicode_to_new_gid_list.alloc (unicodes.get_population ());
if (!unicode_to_gid) {
_fill_unicode_and_glyph_map(plan, unicodes->iter(), [&] (hb_codepoint_t cp) {
_fill_unicode_and_glyph_map(plan, unicodes.iter(), [&] (hb_codepoint_t cp) {
hb_codepoint_t gid;
if (!cmap.get_nominal_glyph (cp, &gid)) {
return HB_MAP_VALUE_INVALID;
@ -241,7 +274,7 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
// the map. This code is mostly duplicated from above to avoid doing
// conditionals on the presence of the unicode_to_gid map each
// iteration.
_fill_unicode_and_glyph_map(plan, unicodes->iter(), [&] (hb_codepoint_t cp) {
_fill_unicode_and_glyph_map(plan, unicodes.iter(), [&] (hb_codepoint_t cp) {
return unicode_to_gid->get (cp);
});
}
@ -258,7 +291,7 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
if (!plan->accelerator) {
cmap.collect_mapping (&cmap_unicodes_storage, &unicode_glyphid_map_storage);
plan->unicode_to_new_gid_list.alloc (hb_min(unicodes->get_population ()
plan->unicode_to_new_gid_list.alloc (hb_min(unicodes.get_population ()
+ glyphs->get_population (),
cmap_unicodes->get_population ()));
} else {
@ -267,10 +300,10 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
}
if (plan->accelerator &&
unicodes->get_population () < cmap_unicodes->get_population () &&
unicodes.get_population () < cmap_unicodes->get_population () &&
glyphs->get_population () < cmap_unicodes->get_population ())
{
plan->codepoint_to_glyph->alloc (unicodes->get_population () + glyphs->get_population ());
plan->codepoint_to_glyph->alloc (unicodes.get_population () + glyphs->get_population ());
auto &gid_to_unicodes = plan->accelerator->gid_to_unicodes;
@ -285,7 +318,7 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
});
}
_fill_unicode_and_glyph_map(plan, unicodes->iter(), [&] (hb_codepoint_t cp) {
_fill_unicode_and_glyph_map(plan, unicodes.iter(), [&] (hb_codepoint_t cp) {
/* Don't double-add entry. */
if (plan->codepoint_to_glyph->has (cp))
return HB_MAP_VALUE_INVALID;
@ -306,7 +339,7 @@ _populate_unicodes_to_retain (const hb_set_t *unicodes,
{
_fill_unicode_and_glyph_map(plan, hb_range(first, last + 1), [&] (hb_codepoint_t cp) {
hb_codepoint_t gid = (*unicode_glyphid_map)[cp];
if (!unicodes->has (cp) && !glyphs->has (gid))
if (!unicodes.has (cp) && !glyphs->has (gid))
return HB_MAP_VALUE_INVALID;
return gid;
},

View file

@ -71,10 +71,12 @@ typedef struct hb_subset_plan_t hb_subset_plan_t;
* in the final subset.
* @HB_SUBSET_FLAGS_NO_PRUNE_UNICODE_RANGES: If set then the unicode ranges in
* OS/2 will not be recalculated.
* @HB_SUBSET_FLAGS_NO_LAYOUT_CLOSURE: If set don't perform glyph closure on layout
* @HB_SUBSET_FLAGS_NO_LAYOUT_CLOSURE: If set do not perform glyph closure on layout
* substitution rules (GSUB). Since: 7.2.0.
* @HB_SUBSET_FLAGS_OPTIMIZE_IUP_DELTAS: If set perform IUP delta optimization on the
* remaining gvar table's deltas. Since: 8.5.0
* @HB_SUBSET_FLAGS_NO_BIDI_CLOSURE: If set do not pull mirrored versions of input
* codepoints into the subset. Since: REPLACEME
* @HB_SUBSET_FLAGS_IFTB_REQUIREMENTS: If set enforce requirements on the output subset
* to allow it to be used with incremental font transfer IFTB patches. Primarily,
* this forces all outline data to use long (32 bit) offsets. Since: EXPERIMENTAL
@ -96,8 +98,9 @@ typedef enum { /*< flags >*/
HB_SUBSET_FLAGS_NO_PRUNE_UNICODE_RANGES = 0x00000100u,
HB_SUBSET_FLAGS_NO_LAYOUT_CLOSURE = 0x00000200u,
HB_SUBSET_FLAGS_OPTIMIZE_IUP_DELTAS = 0x00000400u,
HB_SUBSET_FLAGS_NO_BIDI_CLOSURE = 0x00000800u,
#ifdef HB_EXPERIMENTAL_API
HB_SUBSET_FLAGS_IFTB_REQUIREMENTS = 0x00000800u,
HB_SUBSET_FLAGS_IFTB_REQUIREMENTS = 0x00001000u,
#endif
} hb_subset_flags_t;

View file

@ -1,3 +1,4 @@
--layout-features=*
--notdef-outline
--no-bidi-closure
--retain-gids

View file

@ -1,2 +1,3 @@
--layout-features=*
--notdef-outline
--no-bidi-closure

View file

@ -0,0 +1 @@
--no-bidi-closure

View file

@ -0,0 +1,10 @@
FONTS:
Roboto-Regular.ttf
PROFILES:
default.txt
no_bidi_closure.txt
SUBSETS:
abc<
abc>

View file

@ -1,5 +1,6 @@
tests = [
'basics',
'bidi',
'cmap',
'cmap14',
'preprocess',

View file

@ -985,6 +985,7 @@ subset_main_t::add_options ()
{"notdef-outline", 0, G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, (gpointer) &set_flag<HB_SUBSET_FLAGS_NOTDEF_OUTLINE>, "Keep the outline of \'.notdef\' glyph", nullptr},
{"no-prune-unicode-ranges", 0, G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, (gpointer) &set_flag<HB_SUBSET_FLAGS_NO_PRUNE_UNICODE_RANGES>, "Don't change the 'OS/2 ulUnicodeRange*' bits.", nullptr},
{"no-layout-closure", 0, G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, (gpointer) &set_flag<HB_SUBSET_FLAGS_NO_LAYOUT_CLOSURE>, "Don't perform glyph closure for layout substitution (GSUB).", nullptr},
{"no-bidi-closure", 0, G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, (gpointer) &set_flag<HB_SUBSET_FLAGS_NO_BIDI_CLOSURE>, "Don't perform bidi closure (adding mirrored variants) for input codepoints.", nullptr},
{"glyph-names", 0, G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, (gpointer) &set_flag<HB_SUBSET_FLAGS_GLYPH_NAMES>, "Keep PS glyph names in TT-flavored fonts. ", nullptr},
{"passthrough-tables", 0, G_OPTION_FLAG_NO_ARG, G_OPTION_ARG_CALLBACK, (gpointer) &set_flag<HB_SUBSET_FLAGS_PASSTHROUGH_UNRECOGNIZED>, "Do not drop tables that the tool does not know how to subset.", nullptr},
{"preprocess-face", 0, G_OPTION_FLAG_HIDDEN, G_OPTION_ARG_NONE, &this->preprocess,