ICU-12725 Update u_isIDStart and u_isIDPart to TR31

ICU-12725 move to uprops.cpp ICU-12725 change dependency ICU-12725 Fix Java implementation
2025-04-07 22:44:49 +00:00 · 2023-01-18 18:00:35 -08:00 · 2023-01-18 18:00:35 -08:00 · de9cb9a133
commit de9cb9a133
parent bb0e745e25
7 changed files with 47 additions and 73 deletions
--- a/icu4c/source/common/uchar.cpp
+++ b/icu4c/source/common/uchar.cpp
@ -304,30 +304,6 @@ u_ispunct(UChar32 c) {
    return (UBool)((CAT_MASK(props)&U_GC_P_MASK)!=0);
 }

-/* Checks if the Unicode character can start a Unicode identifier.*/
-U_CAPI UBool U_EXPORT2
-u_isIDStart(UChar32 c) {
-    /* same as u_isalpha() */
-    uint32_t props;
-    GET_PROPS(c, props);
-    return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_NL_MASK))!=0);
-}
-
-/* Checks if the Unicode character can be a Unicode identifier part other than starting the
- identifier.*/
-U_CAPI UBool U_EXPORT2
-u_isIDPart(UChar32 c) {
-    uint32_t props;
-    GET_PROPS(c, props);
-    return (UBool)(
-           (CAT_MASK(props)&
-            (U_GC_ND_MASK|U_GC_NL_MASK|
-             U_GC_L_MASK|
-             U_GC_PC_MASK|U_GC_MC_MASK|U_GC_MN_MASK)
-           )!=0 ||
-           u_isIDIgnorable(c));
-}
-
 /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/
 U_CAPI UBool U_EXPORT2
 u_isIDIgnorable(UChar32 c) {
--- a/icu4c/source/common/unicode/uchar.h
+++ b/icu4c/source/common/unicode/uchar.h
@ -3837,9 +3837,8 @@ u_getPropertyValueEnum(UProperty property,

 /**
 * Determines if the specified character is permissible as the
- * first character in an identifier according to Unicode
- * (The Unicode Standard, Version 3.0, chapter 5.16 Identifiers).
- * True for characters with general categories "L" (letters) and "Nl" (letter numbers).
+ * first character in an identifier as ID_Start according to
+ * Unicode® Standard Annex #31 UNICODE IDENTIFIER AND PATTERN SYNTAX
 *
 * Same as java.lang.Character.isUnicodeIdentifierStart().
 * Same as UCHAR_ID_START
@ -3856,12 +3855,9 @@ U_CAPI UBool U_EXPORT2
 u_isIDStart(UChar32 c);

 /**
- * Determines if the specified character is permissible
- * in an identifier according to Java.
- * True for characters with general categories "L" (letters),
- * "Nl" (letter numbers), "Nd" (decimal digits),
- * "Mc" and "Mn" (combining marks), "Pc" (connecting punctuation), and
- * u_isIDIgnorable(c).
+ * Determines if the specified character is permissible as a
+ * character other than the first character in an identifier as ID_Continue
+ * according to Unicode® Standard Annex #31 UNICODE IDENTIFIER AND PATTERN SYNTAX
 *
 * Same as java.lang.Character.isUnicodeIdentifierPart().
 * Almost the same as Unicode's ID_Continue (UCHAR_ID_CONTINUE)
@ -3869,7 +3865,8 @@ u_isIDStart(UChar32 c);
 * u_isIDIgnorable(c).
 *
 * @param c the code point to be tested
- * @return true if the code point may occur in an identifier according to Java
+ * @return true if the code point may occur in an identifier other than the
+ * first character.
 *
 * @see UCHAR_ID_CONTINUE
 * @see u_isIDStart
--- a/icu4c/source/common/uprops.cpp
+++ b/icu4c/source/common/uprops.cpp
@ -423,6 +423,19 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
    }
 }

+/* Checks if the Unicode character can start a Unicode identifier.*/
+U_CAPI UBool U_EXPORT2
+u_isIDStart(UChar32 c) {
+    return u_hasBinaryProperty(c, UCHAR_ID_START);
+}
+
+/* Checks if the Unicode character can be a Unicode identifier part other than starting the
+ identifier.*/
+U_CAPI UBool U_EXPORT2
+u_isIDPart(UChar32 c) {
+    return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE);
+}
+
 U_CAPI UBool U_EXPORT2
 u_stringHasBinaryProperty(const UChar *s, int32_t length, UProperty which) {
    if (s == nullptr && length != 0) { return false; }
--- a/icu4c/source/test/cintltst/cucdtst.c
+++ b/icu4c/source/test/cintltst/cucdtst.c
@ -895,10 +895,10 @@ static void TestIdentifier()
    const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
    const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
    const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
-    const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
-    const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
-    const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
-    const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
+    const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061, 0x1885, 0x212e, 0x309b};
+    const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019, 0x2e2f};
+    const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045, 0x1886, 0x212e, 0x309c};
+    const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020, 0x2019, 0x2e2f};
    const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
    const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};

--- a/icu4c/source/test/depstest/dependencies.txt
+++ b/icu4c/source/test/depstest/dependencies.txt
@ -351,7 +351,7 @@ group: uniset_core
 group: icu_utility_with_props
    util_props.o
  deps
-    icu_utility uchar ucase
+    icu_utility uchar ucase uprops

 group: icu_utility
    util.o
--- a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java
@ -4550,20 +4550,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
     */
    public static boolean isUnicodeIdentifierPart(int ch)
    {
-        // if props == 0, it will just fall through and return false
-        // cat == format
-        return ((1 << getType(ch))
-                & ((1 << UCharacterCategory.UPPERCASE_LETTER)
-                        | (1 << UCharacterCategory.LOWERCASE_LETTER)
-                        | (1 << UCharacterCategory.TITLECASE_LETTER)
-                        | (1 << UCharacterCategory.MODIFIER_LETTER)
-                        | (1 << UCharacterCategory.OTHER_LETTER)
-                        | (1 << UCharacterCategory.LETTER_NUMBER)
-                        | (1 << UCharacterCategory.CONNECTOR_PUNCTUATION)
-                        | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER)
-                        | (1 << UCharacterCategory.COMBINING_SPACING_MARK)
-                        | (1 << UCharacterCategory.NON_SPACING_MARK))) != 0
-                        || isIdentifierIgnorable(ch);
+        return hasBinaryProperty(ch, UProperty.ID_CONTINUE);  // single code point
    }

    /**
@ -4588,15 +4575,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
     */
    public static boolean isUnicodeIdentifierStart(int ch)
    {
-        /*int cat = getType(ch);*/
-        // if props == 0, it will just fall through and return false
-        return ((1 << getType(ch))
-                & ((1 << UCharacterCategory.UPPERCASE_LETTER)
-                        | (1 << UCharacterCategory.LOWERCASE_LETTER)
-                        | (1 << UCharacterCategory.TITLECASE_LETTER)
-                        | (1 << UCharacterCategory.MODIFIER_LETTER)
-                        | (1 << UCharacterCategory.OTHER_LETTER)
-                        | (1 << UCharacterCategory.LETTER_NUMBER))) != 0;
+        return hasBinaryProperty(ch, UProperty.ID_START);  // single code point
    }

    /**
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java
@ -623,15 +623,14 @@ public final class UCharacterTest extends TestFmwk
    @Test
    public void TestIdentifier()
    {
-        int unicodeidstart[] = {0x0250, 0x0000e2, 0x000061};
-        int nonunicodeidstart[] = {0x2000, 0x00000a, 0x002019};
-        int unicodeidpart[] = {0x005f, 0x000032, 0x000045};
-        int nonunicodeidpart[] = {0x2030, 0x0000a3, 0x000020};
+        int unicodeidstart[] = {0x0250, 0x0000e2, 0x000061, 0x001885, 0x00212e, 0x00309b};
+        int nonunicodeidstart[] = {0x2000, 0x00000a, 0x002019, 0x002e2f};
+        int unicodeidpart[] = {0x005f, 0x000032, 0x000045, 0x001886, 0x00212e, 0x00309c};
+        int nonunicodeidpart[] = {0x2030, 0x0000a3, 0x000020, 0x002019, 0x002e2f};
        int idignore[] = {0x0006, 0x0010, 0x206b};
        int nonidignore[] = {0x0075, 0x0000a3, 0x000061};

-        int size = unicodeidstart.length;
-        for (int i = 0; i < size; i ++)
+        for (int i = 0; i < unicodeidstart.length; i ++)
        {
            if (!UCharacter.isUnicodeIdentifierStart(unicodeidstart[i]))
            {
@ -639,6 +638,9 @@ public final class UCharacterTest extends TestFmwk
                    " expected to be a unicode identifier start character");
                break;
            }
+        }
+        for (int i = 0; i < nonunicodeidstart.length; i ++)
+        {
            if (UCharacter.isUnicodeIdentifierStart(nonunicodeidstart[i]))
            {
                errln("FAIL \\u" + hex(nonunicodeidstart[i]) +
@ -646,12 +648,18 @@ public final class UCharacterTest extends TestFmwk
                        "character");
                break;
            }
+        }
+        for (int i = 0; i < unicodeidpart.length; i ++)
+        {
            if (!UCharacter.isUnicodeIdentifierPart(unicodeidpart[i]))
            {
                errln("FAIL \\u" + hex(unicodeidpart[i]) +
                    " expected to be a unicode identifier part character");
                break;
            }
+        }
+        for (int i = 0; i < nonunicodeidpart.length; i ++)
+        {
            if (UCharacter.isUnicodeIdentifierPart(nonunicodeidpart[i]))
            {
                errln("FAIL \\u" + hex(nonunicodeidpart[i]) +
@ -659,23 +667,24 @@ public final class UCharacterTest extends TestFmwk
                        "character");
                break;
            }
+         }
+        for (int i = 0; i < idignore.length; i ++)
+        {
            if (!UCharacter.isIdentifierIgnorable(idignore[i]))
            {
                errln("FAIL \\u" + hex(idignore[i]) +
                        " expected to be a ignorable unicode character");
                break;
            }
+        }
+        for (int i = 0; i < nonidignore.length; i ++)
+        {
            if (UCharacter.isIdentifierIgnorable(nonidignore[i]))
            {
                errln("FAIL \\u" + hex(nonidignore[i]) +
                    " expected not to be a ignorable unicode character");
                break;
            }
-            logln("Ok    \\u" + hex(unicodeidstart[i]) + " and \\u" +
-                    hex(nonunicodeidstart[i]) + " and \\u" +
-                    hex(unicodeidpart[i]) + " and \\u" +
-                    hex(nonunicodeidpart[i]) + " and \\u" +
-                    hex(idignore[i]) + " and \\u" + hex(nonidignore[i]));
        }
    }