ICU-22707 fix hst=V: hst=NA for Kirat Rai

This commit is contained in:
Markus Scherer 2024-06-04 16:41:28 -07:00
parent 47e9389b8e
commit 6543634649
4 changed files with 29 additions and 2 deletions

View file

@@ -590,7 +590,11 @@ static int32_t scriptGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*
/*
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
* Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
* Hangul_Syllable_Type is redundant with a subset of Grapheme_Cluster_Break.
*
* Starting with Unicode 16, there is an exception:
* Some Kirat Rai vowels are given GCB=V for proper grapheme clustering, but
* they are of course not related to Hangul syllables.
*/
static const UHangulSyllableType gcbToHst[]={
U_HST_NOT_APPLICABLE, /* U_GCB_OTHER */
@@ -610,6 +614,11 @@ static const UHangulSyllableType gcbToHst[]={
};
static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
// Ignore supplementary code points: They all have HST=NA.
// This is a simple way to handle the GCB!=hst cases since Unicode 16 (Kirat Rai vowels).
if(c>0xffff) {
return U_HST_NOT_APPLICABLE;
}
/* see comments on gcbToHst[] above */
int32_t gcb=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT;
if(gcb<UPRV_LENGTHOF(gcbToHst)) {

View file

@@ -2699,6 +2699,10 @@ TestAdditionalProperties(void) {
{ 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
// GCB=V but hst=NA (exception to GCB=hst for relevant values)
{ 0x16D67, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_NOT_APPLICABLE },
{ 0x16D6A, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_NOT_APPLICABLE },
{ -1, 0x410, 0 }, /* version break for Unicode 4.1 */
{ 0x00d7, UCHAR_PATTERN_SYNTAX, true },

View file

@@ -648,7 +648,11 @@ public final class UCharacterProperty
/*
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
* Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
* Hangul_Syllable_Type is redundant with a subset of Grapheme_Cluster_Break.
*
* Starting with Unicode 16, there is an exception:
* Some Kirat Rai vowels are given GCB=V for proper grapheme clustering, but
* they are of course not related to Hangul syllables.
*/
private static final int /* UHangulSyllableType */ gcbToHst[]={
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */
@@ -809,6 +813,12 @@ public final class UCharacterProperty
new IntProperty(SRC_PROPSVEC) { // HANGUL_SYLLABLE_TYPE
@Override
int getValue(int c) {
// Ignore supplementary code points: They all have HST=NA.
// This is a simple way to handle the GCB!=hst cases since Unicode 16
// (Kirat Rai vowels).
if(c>0xffff) {
return HangulSyllableType.NOT_APPLICABLE;
}
/* see comments on gcbToHst[] above */
int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT;
if(gcb<gcbToHst.length) {

View file

@@ -2109,6 +2109,10 @@ public final class UCharacterTest extends CoreTestFmwk
{ 0xd7a4, UProperty.HANGUL_SYLLABLE_TYPE, 0 },
// GCB=V but hst=NA (exception to GCB=hst for relevant values)
{ 0x16D67, UProperty.HANGUL_SYLLABLE_TYPE, UCharacter.HangulSyllableType.NOT_APPLICABLE },
{ 0x16D6A, UProperty.HANGUL_SYLLABLE_TYPE, UCharacter.HangulSyllableType.NOT_APPLICABLE },
{ -1, 0x410, 0 }, /* version break for Unicode 4.1 */
{ 0x00d7, UProperty.PATTERN_SYNTAX, 1 },