ICU-22707 fix hst=V: hst=NA for Kirat Rai

2025-04-10 15:42:14 +00:00 · 2024-06-04 16:41:28 -07:00 · 2024-06-04 16:41:28 -07:00 · 6543634649
commit 6543634649
parent 47e9389b8e
4 changed files with 29 additions and 2 deletions
--- a/icu4c/source/common/uprops.cpp
+++ b/icu4c/source/common/uprops.cpp
@@ -590,7 +590,11 @@ static int32_t scriptGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*

 /*
 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
- * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
+ * Hangul_Syllable_Type is redundant with a subset of Grapheme_Cluster_Break.
+ *
+ * Starting with Unicode 16, there is an exception:
+ * Some Kirat Rai vowels are given GCB=V for proper grapheme clustering, but
+ * they are of course not related to Hangul syllables.
 */
 static const UHangulSyllableType gcbToHst[]={
    U_HST_NOT_APPLICABLE,   /* U_GCB_OTHER */
@@ -610,6 +614,11 @@ static const UHangulSyllableType gcbToHst[]={
 };

 static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
+    // Ignore supplementary code points: They all have HST=NA.
+    // This is a simple way to handle the GCB!=hst cases since Unicode 16 (Kirat Rai vowels).
+    if(c>0xffff) {
+        return U_HST_NOT_APPLICABLE;
+    }
    /* see comments on gcbToHst[] above */
    int32_t gcb=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT;
    if(gcb<UPRV_LENGTHOF(gcbToHst)) {
--- a/icu4c/source/test/cintltst/cucdtst.c
+++ b/icu4c/source/test/cintltst/cucdtst.c
@@ -2699,6 +2699,10 @@ TestAdditionalProperties(void) {

        { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },

+        // GCB=V but hst=NA (exception to GCB=hst for relevant values)
+        { 0x16D67, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_NOT_APPLICABLE },
+        { 0x16D6A, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_NOT_APPLICABLE },
+
        { -1, 0x410, 0 }, /* version break for Unicode 4.1 */

        { 0x00d7, UCHAR_PATTERN_SYNTAX, true },
--- a/icu4j/main/core/src/main/java/com/ibm/icu/impl/UCharacterProperty.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/impl/UCharacterProperty.java
@@ -648,7 +648,11 @@ public final class UCharacterProperty

    /*
     * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
-     * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
+     * Hangul_Syllable_Type is redundant with a subset of Grapheme_Cluster_Break.
+     *
+     * Starting with Unicode 16, there is an exception:
+     * Some Kirat Rai vowels are given GCB=V for proper grapheme clustering, but
+     * they are of course not related to Hangul syllables.
     */
    private static final int /* UHangulSyllableType */ gcbToHst[]={
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
@@ -809,6 +813,12 @@ public final class UCharacterProperty
        new IntProperty(SRC_PROPSVEC) {  // HANGUL_SYLLABLE_TYPE
            @Override
            int getValue(int c) {
+                // Ignore supplementary code points: They all have HST=NA.
+                // This is a simple way to handle the GCB!=hst cases since Unicode 16
+                // (Kirat Rai vowels).
+                if(c>0xffff) {
+                    return HangulSyllableType.NOT_APPLICABLE;
+                }
                /* see comments on gcbToHst[] above */
                int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT;
                if(gcb<gcbToHst.length) {
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/lang/UCharacterTest.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/lang/UCharacterTest.java
@@ -2109,6 +2109,10 @@ public final class UCharacterTest extends CoreTestFmwk

            { 0xd7a4, UProperty.HANGUL_SYLLABLE_TYPE, 0 },

+            // GCB=V but hst=NA (exception to GCB=hst for relevant values)
+            { 0x16D67, UProperty.HANGUL_SYLLABLE_TYPE, UCharacter.HangulSyllableType.NOT_APPLICABLE },
+            { 0x16D6A, UProperty.HANGUL_SYLLABLE_TYPE, UCharacter.HangulSyllableType.NOT_APPLICABLE },
+
            { -1, 0x410, 0 }, /* version break for Unicode 4.1 */

            { 0x00d7, UProperty.PATTERN_SYNTAX, 1 },