Merge pull request #5069 from harfbuzz/cluster-level-graphemes

[buffer] Add HB_BUFFER_CLUSTER_LEVEL_GRAPHEMES
This commit is contained in:
Behdad Esfahbod 2025-03-10 02:52:26 -06:00 committed by GitHub
commit 9c0ac9aec4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 97 additions and 35 deletions

View file

@ -56,7 +56,6 @@ jobs:
-Dchafa=disabled \
-Dgraphite=enabled \
-Doptimization=2 \
-Ddoc_tests=true \
-Dfontations=enabled
- name: Build
run: meson compile -Cbuild

View file

@ -114,6 +114,9 @@ hb_glyph_position_t
hb_buffer_content_type_t
hb_buffer_flags_t
hb_buffer_cluster_level_t
HB_BUFFER_CLUSTER_LEVEL_IS_CHARACTERS
HB_BUFFER_CLUSTER_LEVEL_IS_GRAPHEMES
HB_BUFFER_CLUSTER_LEVEL_IS_MONOTONE
hb_segment_properties_t
hb_buffer_serialize_format_t
hb_buffer_serialize_flags_t

View file

@ -63,24 +63,25 @@ static bool
buffer_verify_monotone (hb_buffer_t *buffer,
hb_font_t *font)
{
/* Check that clusters are monotone. */
if (buffer->cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES ||
buffer->cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS)
if (!HB_BUFFER_CLUSTER_LEVEL_IS_MONOTONE (buffer->cluster_level))
{
bool is_forward = HB_DIRECTION_IS_FORWARD (hb_buffer_get_direction (buffer));
unsigned int num_glyphs;
hb_glyph_info_t *info = hb_buffer_get_glyph_infos (buffer, &num_glyphs);
for (unsigned int i = 1; i < num_glyphs; i++)
if (info[i-1].cluster != info[i].cluster &&
(info[i-1].cluster < info[i].cluster) != is_forward)
{
buffer_verify_error (buffer, font, BUFFER_VERIFY_ERROR "clusters are not monotone.");
return false;
}
/* Cannot perform this check without monotone clusters. */
return true;
}
bool is_forward = HB_DIRECTION_IS_FORWARD (hb_buffer_get_direction (buffer));
unsigned int num_glyphs;
hb_glyph_info_t *info = hb_buffer_get_glyph_infos (buffer, &num_glyphs);
for (unsigned int i = 1; i < num_glyphs; i++)
if (info[i-1].cluster != info[i].cluster &&
(info[i-1].cluster < info[i].cluster) != is_forward)
{
buffer_verify_error (buffer, font, BUFFER_VERIFY_ERROR "clusters are not monotone.");
return false;
}
return true;
}
@ -92,8 +93,7 @@ buffer_verify_unsafe_to_break (hb_buffer_t *buffer,
unsigned int num_features,
const char * const *shapers)
{
if (buffer->cluster_level != HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES &&
buffer->cluster_level != HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS)
if (!HB_BUFFER_CLUSTER_LEVEL_IS_MONOTONE (buffer->cluster_level))
{
/* Cannot perform this check without monotone clusters. */
return true;
@ -207,8 +207,7 @@ buffer_verify_unsafe_to_concat (hb_buffer_t *buffer,
unsigned int num_features,
const char * const *shapers)
{
if (buffer->cluster_level != HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES &&
buffer->cluster_level != HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS)
if (!HB_BUFFER_CLUSTER_LEVEL_IS_MONOTONE (buffer->cluster_level))
{
/* Cannot perform this check without monotone clusters. */
return true;

View file

@ -518,7 +518,7 @@ void
hb_buffer_t::merge_clusters_impl (unsigned int start,
unsigned int end)
{
if (cluster_level == HB_BUFFER_CLUSTER_LEVEL_CHARACTERS)
if (!HB_BUFFER_CLUSTER_LEVEL_IS_MONOTONE (cluster_level))
{
unsafe_to_break (start, end);
return;
@ -551,7 +551,7 @@ void
hb_buffer_t::merge_out_clusters (unsigned int start,
unsigned int end)
{
if (cluster_level == HB_BUFFER_CLUSTER_LEVEL_CHARACTERS)
if (!HB_BUFFER_CLUSTER_LEVEL_IS_MONOTONE (cluster_level))
return;
if (unlikely (end - start < 2))

View file

@ -422,18 +422,34 @@ hb_buffer_get_flags (const hb_buffer_t *buffer);
* @HB_BUFFER_CLUSTER_LEVEL_CHARACTERS: Don't group cluster values.
* @HB_BUFFER_CLUSTER_LEVEL_DEFAULT: Default cluster level,
* equal to @HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES.
*
* @HB_BUFFER_CLUSTER_LEVEL_GRAPHEMES: Only group clusters, but don't enforce monotone order.
*
* Data type for holding HarfBuzz's clustering behavior options. The cluster level
* dictates one aspect of how HarfBuzz will treat non-base characters
* dictates one aspect of how HarfBuzz will treat non-base characters
* during shaping.
*
* In @HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES, non-base
* characters are merged into the cluster of the base character that precedes them.
* There is also cluster merging every time the clusters will otherwise become non-monotone.
*
* In @HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS, non-base characters are initially
* assigned their own cluster values, which are not merged into preceding base
* clusters. This allows HarfBuzz to perform additional operations like reorder
* sequences of adjacent marks.
* sequences of adjacent marks. The output is still monotone, but the cluster
* values are more granular.
*
* In @HB_BUFFER_CLUSTER_LEVEL_CHARACTERS, non-base characters are assigned their
* own cluster values, which are not merged into preceding base clusters. Moreover,
* the cluster values are not merged into monotone order. This is the most granular
* cluster level, and it is useful for clients that need to know the exact cluster
* values of each character, but is harder to use for clients, since clusters
* might appear in any order.
*
* In @HB_BUFFER_CLUSTER_LEVEL_GRAPHEMES, non-base characters are merged into the
* cluster of the base character that precedes them. This is similar to the Unicode
* Grapheme Cluster algorithm, but it is not exactly the same. The output is
* not forced to be monotone. This is useful for clients that want to use HarfBuzz
* as a cheap implementation of the Unicode Grapheme Cluster algorithm.
*
* @HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES is the default, because it maintains
* backward compatibility with older versions of HarfBuzz. New client programs that
@ -446,9 +462,52 @@ typedef enum {
HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES = 0,
HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS = 1,
HB_BUFFER_CLUSTER_LEVEL_CHARACTERS = 2,
HB_BUFFER_CLUSTER_LEVEL_GRAPHEMES = 3,
HB_BUFFER_CLUSTER_LEVEL_DEFAULT = HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES
} hb_buffer_cluster_level_t;
/**
 * HB_BUFFER_CLUSTER_LEVEL_IS_MONOTONE:
 * @level: #hb_buffer_cluster_level_t to test
 *
 * Evaluates to true when @level is one of the cluster levels that put
 * cluster values into monotone order, namely
 * @HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES or
 * @HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS. The level passed in
 * must be a valid one.
 *
 * XSince: REPLACEME
 */
#define HB_BUFFER_CLUSTER_LEVEL_IS_MONOTONE(level) \
	((bool) (((1u << HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS) | \
		  (1u << HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES)) & \
		 (1u << (unsigned) (level))))
/**
 * HB_BUFFER_CLUSTER_LEVEL_IS_GRAPHEMES:
 * @level: #hb_buffer_cluster_level_t to test
 *
 * Evaluates to true when @level is one of the cluster levels that group
 * cluster values by graphemes, namely
 * @HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES or
 * @HB_BUFFER_CLUSTER_LEVEL_GRAPHEMES. The level passed in must be a
 * valid one.
 *
 * XSince: REPLACEME
 */
#define HB_BUFFER_CLUSTER_LEVEL_IS_GRAPHEMES(level) \
	((bool) (((1u << HB_BUFFER_CLUSTER_LEVEL_GRAPHEMES) | \
		  (1u << HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES)) & \
		 (1u << (unsigned) (level))))
/**
 * HB_BUFFER_CLUSTER_LEVEL_IS_CHARACTERS:
 * @level: #hb_buffer_cluster_level_t to test
 *
 * Tests whether a cluster level does not group cluster values by graphemes,
 * i.e. whether it is @HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS or
 * @HB_BUFFER_CLUSTER_LEVEL_CHARACTERS. Requires that the level be valid.
 *
 * XSince: REPLACEME
 */
#define HB_BUFFER_CLUSTER_LEVEL_IS_CHARACTERS(level) \
	((bool) ((1u << (unsigned) (level)) & \
		 ((1u << HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS) | \
		  (1u << HB_BUFFER_CLUSTER_LEVEL_CHARACTERS))))
HB_EXTERN void
hb_buffer_set_cluster_level (hb_buffer_t *buffer,
hb_buffer_cluster_level_t cluster_level);

View file

@ -646,7 +646,7 @@ _hb_coretext_shape (hb_shape_plan_t *shape_plan,
* B1 M1 B2 M2, and B1-B2 form a ligature, M2's cluster will
* continue pointing to B2 even though B2 was merged into B1's
* cluster... */
if (buffer->cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES)
if (HB_BUFFER_CLUSTER_LEVEL_IS_GRAPHEMES (buffer->cluster_level))
{
hb_unicode_funcs_t *unicode = buffer->unicode;
unsigned int count = buffer->len;
@ -1292,7 +1292,7 @@ resize_and_retry:
* or the native OT backend, only that the cluster indices will be
* monotonic in the output buffer. */
if (count > 1 && (status_or & kCTRunStatusNonMonotonic) &&
buffer->cluster_level != HB_BUFFER_CLUSTER_LEVEL_CHARACTERS)
HB_BUFFER_CLUSTER_LEVEL_IS_MONOTONE (buffer->cluster_level))
{
hb_glyph_info_t *info = buffer->info;
if (HB_DIRECTION_IS_FORWARD (buffer->props.direction))

View file

@ -387,6 +387,8 @@ _hb_grapheme_group_func (const hb_glyph_info_t& a HB_UNUSED,
static inline void
_hb_ot_layout_reverse_graphemes (hb_buffer_t *buffer)
{
  /* For MONOTONE_GRAPHEMES the merging has already been applied and
   * _hb_grapheme_group_func takes care of it, so the flag handed to
   * reverse_groups() is only set for MONOTONE_CHARACTERS. */
  const bool is_monotone_characters =
    buffer->cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS;

  buffer->reverse_groups (_hb_grapheme_group_func, is_monotone_characters);
}

View file

@ -551,7 +551,7 @@ hb_form_clusters (hb_buffer_t *buffer)
if (!(buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_NON_ASCII))
return;
if (buffer->cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES)
if (HB_BUFFER_CLUSTER_LEVEL_IS_GRAPHEMES (buffer->cluster_level))
foreach_grapheme (buffer, start, end)
buffer->merge_clusters (start, end);
else
@ -609,7 +609,7 @@ hb_ensure_native_direction (hb_buffer_t *buffer)
* Ogham fonts are supposed to be implemented BTT or not. Need to research that
* first. */
if ((HB_DIRECTION_IS_HORIZONTAL (direction) &&
direction != horiz_dir && horiz_dir != HB_DIRECTION_INVALID) ||
direction != horiz_dir && HB_DIRECTION_IS_VALID (horiz_dir)) ||
(HB_DIRECTION_IS_VERTICAL (direction) &&
direction != HB_DIRECTION_TTB))
{

View file

@ -298,8 +298,7 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED,
end = start + 2;
if (unlikely (!buffer->successful))
break;
if (buffer->cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES)
buffer->merge_out_clusters (start, end);
buffer->merge_out_clusters (start, end);
continue;
}
}
@ -372,8 +371,7 @@ preprocess_text_hangul (const hb_ot_shape_plan_t *plan HB_UNUSED,
if (i < end)
info[i++].hangul_shaping_feature() = TJMO;
if (buffer->cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES)
buffer->merge_out_clusters (start, end);
buffer->merge_out_clusters (start, end);
continue;
}
else if ((!tindex && buffer->idx + 1 < count && isT (buffer->cur(+1).codepoint)))

View file

@ -360,7 +360,7 @@ preprocess_text_thai (const hb_ot_shape_plan_t *plan,
{
/* Since we decomposed, and NIKHAHIT is combining, merge clusters with the
* previous cluster. */
if (start && buffer->cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES)
if (start)
buffer->merge_out_clusters (start - 1, end);
}
}

View file

@ -1,3 +1,5 @@
# Our cluster-level=3 doesn't really test anything here.
../fonts/4fac3929fc3332834e93673780ec0fe94342d193.ttf;--cluster-level=3;U+0078,U+030A,U+0058,U+030A;[gid2=0+1083|gid3=1@-1132,-8+0|gid1=2+1200|gid3=3@-1190,349+0]
../fonts/4fac3929fc3332834e93673780ec0fe94342d193.ttf;--cluster-level=2;U+0078,U+030A,U+0058,U+030A;[gid2=0+1083|gid3=1@-1132,-8+0|gid1=2+1200|gid3=3@-1190,349+0]
../fonts/43ef465752be9af900745f72fe29cb853a1401a5.ttf;--cluster-level=1;U+05D4,U+05B7,U+05E9,U+05BC,U+05C1,U+05B8,U+05DE,U+05B4,U+05DD;[uni05DD=8+1359|uni05B4=7@111,0+0|uni05DE=6+1391|uni05B8=5+0|uni05BC=3+0|uni05C1=3+0|uni05E9=2+1451|uni05B7=1@28,0+0|uni05D4=0+1338]
../fonts/6f36d056bad6d478fc0bf7397bd52dc3bd197d5f.ttf;--cluster-level=1;U+099B,U+09CB,U+09C8,U+09C2,U+09CB,U+098C;[evowelsigninibeng=0+346|aivowelsignbeng=0+346|evowelsignbeng=0+346|chabeng=0+687|uuvowelsignlongbeng=0@-96,0+0|aavowelsignbeng=0+266|aavowelsignbeng=4+266|lvocalicbeng=5+639]

View file

@ -383,7 +383,7 @@ shape_options_t::add_options (option_parser_t *parser)
0, 0, G_OPTION_ARG_INT, &this->not_found_variation_selector_glyph,
"Glyph value to replace not-found variation-selector characters with", nullptr},
{"utf8-clusters", 0, 0, G_OPTION_ARG_NONE, &this->utf8_clusters, "Use UTF8 byte indices, not char indices", nullptr},
{"cluster-level", 0, 0, G_OPTION_ARG_INT, &this->cluster_level, "Cluster merging level (default: 0)", "0/1/2"},
{"cluster-level", 0, 0, G_OPTION_ARG_INT, &this->cluster_level, "Cluster merging level (default: 0)", "0/1/2/3"},
{"normalize-glyphs",0, 0, G_OPTION_ARG_NONE, &this->normalize_glyphs, "Rearrange glyph clusters in nominal order", nullptr},
{"unsafe-to-concat",0, 0, G_OPTION_ARG_NONE, &this->unsafe_to_concat, "Produce unsafe-to-concat glyph flag", nullptr},
{"safe-to-insert-tatweel",0, 0, G_OPTION_ARG_NONE, &this->safe_to_insert_tatweel, "Produce safe-to-insert-tatweel glyph flag", nullptr},