ICU-4199 enum/name API support for C/POSIX character classes, and UnicodeSet support for [:Assigned:]

X-SVN-Rev: 17730
2025-04-13 08:53:20 +00:00 · 2005-05-28 22:54:36 +00:00 · 2005-05-28 22:54:36 +00:00 · e6a0df52ee
commit e6a0df52ee
parent 291516499b
11 changed files with 1299 additions and 1175 deletions
--- a/icu4c/source/common/uchar.c
+++ b/icu4c/source/common/uchar.c
@ -504,7 +504,7 @@ u_isUAlphabetic(UChar32 c) {
    return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0;
 }

-/* Checks if ch is a letter or a decimal digit */
+/* Checks if c is a letter or a decimal digit */
 U_CAPI UBool U_EXPORT2
 u_isalnum(UChar32 c) {
    uint32_t props;
@ -512,6 +512,15 @@ u_isalnum(UChar32 c) {
    return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0);
 }

+/**
+ * Checks if c is alphabetic (u_isUAlphabetic) or a decimal digit (u_isdigit); implements UCHAR_POSIX_ALNUM.
+ * @internal
+ */
+U_CFUNC UBool
+u_isalnumPOSIX(UChar32 c) {
+    return (UBool)(u_isUAlphabetic(c) || u_isdigit(c)); /* unlike u_isalnum(), which tests the general categories L and Nd */
+}
+
 /* Checks if ch is a unicode character with assigned character type.*/
 U_CAPI UBool U_EXPORT2
 u_isdefined(UChar32 c) {
@ -577,8 +586,10 @@ u_isblank(UChar32 c) {
    if((uint32_t)c<=0x9f) {
        return c==9 || c==0x20; /* TAB or SPACE */
    } else {
-        /* White_Space but not LS (Zl) or PS (Zp) */
-        return u_isUWhiteSpace(c) && ((c&0xfffffffe)!=0x2028);
+        /* Zs */
+        uint32_t props;
+        GET_PROPS(c, props);
+        return (UBool)(GET_CATEGORY(props)==U_SPACE_SEPARATOR);
    }
 }

@ -596,6 +607,22 @@ u_isprint(UChar32 c) {
    return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0);
 }

+/**
+ * Checks if c is in \p{graph}\p{blank} - \p{cntrl}, that is, graph or blank but not a control.
+ * Implements UCHAR_POSIX_PRINT.
+ * @internal
+ */
+U_CFUNC UBool
+u_isprintPOSIX(UChar32 c) {
+    uint32_t props;
+    GET_PROPS(c, props);
+    /*
+     * The only cntrl character in graph+blank is TAB (in blank); graph itself excludes Cc.
+     * Here we implement (blank-TAB)=Zs as a category test instead of calling u_isblank().
+     */
+    return (UBool)((GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c));
+}
+
 U_CAPI UBool U_EXPORT2
 u_isgraph(UChar32 c) {
    uint32_t props;
@ -606,6 +633,24 @@ u_isgraph(UChar32 c) {
                   ==0);
 }

+/**
+ * Checks if c is in
+ * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
+ * with space=\p{Whitespace} and Control=Cc.
+ * Implements UCHAR_POSIX_GRAPH.
+ * @internal
+ */
+U_CFUNC UBool
+u_isgraphPOSIX(UChar32 c) {
+    uint32_t props;
+    GET_PROPS(c, props);
+    /* the union [\p{space}\p{gc=Control}] equals [\p{gc=Z}\p{gc=Control}], so gc=Z is tested instead of White_Space */
+    /* comparing ==0 returns FALSE for the masked categories (Cc, Cs, Cn, Z) and TRUE for all others */
+    return (UBool)((CAT_MASK(props)&
+                    (U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
+                   ==0);
+}
+
 U_CAPI UBool U_EXPORT2
 u_ispunct(UChar32 c) {
    uint32_t props;
@ -1003,9 +1048,11 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {

    /* add code points with hardcoded properties, plus the ones following them */

+    /* add for u_isblank() */
+    USET_ADD_CP_AND_NEXT(sa, TAB);
+
    /* add for IS_THAT_CONTROL_SPACE() */
-    sa->add(sa->set, TAB); /* range TAB..CR */
-    sa->add(sa->set, CR+1);
+    sa->add(sa->set, CR+1); /* range TAB..CR */
    sa->add(sa->set, 0x1c);
    sa->add(sa->set, 0x1f+1);
    USET_ADD_CP_AND_NEXT(sa, NL);
--- a/icu4c/source/common/unicode/uchar.h
+++ b/icu4c/source/common/unicode/uchar.h
@ -77,12 +77,31 @@ U_CDECL_BEGIN
 * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
 * Another example: There is no "istitle()" class for titlecase characters.
 *
- * A summary of the behavior of some C/POSIX character classification implementations
- * for Unicode is available at http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/posix_classes.html
+ * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
+ * ICU implements them according to the Standard Recommendations in
+ * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
+ * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
 *
- * <strong>Important</strong>:
- * The behavior of the ICU C/POSIX-style character classification
- * functions is subject to change according to discussion of the above summary.
+ * API access for C/POSIX character classes is as follows:
+ * - alpha:     u_isUAlphabetic(c) or u_hasBinaryProperty(c, UCHAR_ALPHABETIC)
+ * - lower:     u_isULowercase(c) or u_hasBinaryProperty(c, UCHAR_LOWERCASE)
+ * - upper:     u_isUUppercase(c) or u_hasBinaryProperty(c, UCHAR_UPPERCASE)
+ * - punct:     u_ispunct(c)
+ * - digit:     u_charType(c)==U_DECIMAL_DIGIT_NUMBER
+ * - xdigit:    u_isxdigit(c) or u_hasBinaryProperty(c, UCHAR_POSIX_XDIGIT)
+ * - alnum:     u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM)
+ * - space:     u_isUWhiteSpace(c) or u_hasBinaryProperty(c, UCHAR_WHITE_SPACE)
+ * - blank:     u_isblank(c) or u_hasBinaryProperty(c, UCHAR_POSIX_BLANK)
+ * - cntrl:     u_charType(c)==U_CONTROL_CHAR
+ * - graph:     u_hasBinaryProperty(c, UCHAR_POSIX_GRAPH)
+ * - print:     u_hasBinaryProperty(c, UCHAR_POSIX_PRINT)
+ *
+ * Note: Some of the u_isxyz() functions in uchar.h predate, and do not match,
+ * the Standard Recommendations in UTS #18. Instead, they match Java
+ * functions according to their API documentation.
+ *
+ * The C/POSIX character classes are also available in UnicodeSet patterns,
+ * using patterns like [:graph:] or \p{graph}.
 *
 * Note: There are several ICU whitespace functions.
 * Comparison:
@ -368,6 +387,31 @@ typedef enum UProperty {
        (http://www.unicode.org/reports/tr31/)
        @draft ICU 3.4 */
    UCHAR_PATTERN_WHITE_SPACE,
+    /** Binary property alnum (a C/POSIX character class).
+        Implemented according to the UTS #18 Annex C Standard Recommendation.
+        See the uchar.h file documentation.
+        @draft ICU 3.4 */
+    UCHAR_POSIX_ALNUM,
+    /** Binary property blank (a C/POSIX character class).
+        Implemented according to the UTS #18 Annex C Standard Recommendation.
+        See the uchar.h file documentation.
+        @draft ICU 3.4 */
+    UCHAR_POSIX_BLANK,
+    /** Binary property graph (a C/POSIX character class).
+        Implemented according to the UTS #18 Annex C Standard Recommendation.
+        See the uchar.h file documentation.
+        @draft ICU 3.4 */
+    UCHAR_POSIX_GRAPH,
+    /** Binary property print (a C/POSIX character class).
+        Implemented according to the UTS #18 Annex C Standard Recommendation.
+        See the uchar.h file documentation.
+        @draft ICU 3.4 */
+    UCHAR_POSIX_PRINT,
+    /** Binary property xdigit (a C/POSIX character class).
+        Implemented according to the UTS #18 Annex C Standard Recommendation.
+        See the uchar.h file documentation.
+        @draft ICU 3.4 */
+    UCHAR_POSIX_XDIGIT,
    /** One more than the last constant for binary Unicode properties. @stable ICU 2.1 */
    UCHAR_BINARY_LIMIT,

@ -1739,7 +1783,6 @@ u_getNumericValue(UChar32 c);
 * @see UCHAR_LOWERCASE
 * @see u_isupper
 * @see u_istitle
- * @see u_islower
 * @stable ICU 2.0
 */
 U_STABLE UBool U_EXPORT2
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@ -569,7 +569,8 @@ public:
     * correspond to the following sets:
     *
     * "ANY" = [\\u0000-\\U0010FFFF],
-     * "ASCII" = [\\u0000-\\u007F].
+     * "ASCII" = [\\u0000-\\u007F],
+     * "Assigned" = [:^Cn:].
     *
     * @param value a value alias, either short or long.  The name is matched
     * loosely.  See PropertyValueAliases.txt for names and a description of
--- a/icu4c/source/common/unicode/uset.h
+++ b/icu4c/source/common/unicode/uset.h
@ -265,7 +265,8 @@ uset_applyIntPropertyValue(USet* set,
 * matched loosely and correspond to the following sets:
 *
 * "ANY" = [\\u0000-\\U0010FFFF],
- * "ASCII" = [\\u0000-\\u007F].
+ * "ASCII" = [\\u0000-\\u007F],
+ * "Assigned" = [:^Cn:].
 *
 * @param propLength the length of the prop, or -1 if NULL
 *
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@ -77,42 +77,12 @@ static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
 // Special property set IDs
 static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
+static const char ASSIGNED[] = "Assigned"; // [:^Cn:]

 // Unicode name property alias
 #define NAME_PROP "na"
 #define NAME_PROP_LENGTH 2

-// TODO: Remove the following special-case code when
-// these four C99-compatibility properties are implemented
-// as enums/names.
-U_CDECL_BEGIN
-    typedef UBool (U_CALLCONV *C99_Property_Function)(UChar32);
-U_CDECL_END
-static const struct C99_Map {
-    const char* name;
-    C99_Property_Function func;
-    UPropertySource src;
-} C99_DISPATCH[] = {
-    // These three entries omitted; they clash with PropertyAliases
-    // names for Unicode properties, so UnicodeSet already maps them
-    // to those properties.
-    //{ "alpha", u_isalpha, UPROPS_SRC_PROPSVEC },
-    //{ "lower", u_islower, UPROPS_SRC_CASE },
-    //{ "upper", u_isupper, UPROPS_SRC_CASE },
-
-    // MUST be in SORTED order
-    { "alnum", u_isalnum, UPROPS_SRC_CHAR },
-    { "blank", u_isblank, UPROPS_SRC_PROPSVEC },
-    // new alias in Unicode 4.1 { "cntrl", u_iscntrl, UPROPS_SRC_CHAR },
-    // new alias in Unicode 4.1 { "digit", u_isdigit, UPROPS_SRC_CHAR },
-    { "graph", u_isgraph, UPROPS_SRC_CHAR },
-    { "print", u_isprint, UPROPS_SRC_CHAR },
-    // new alias in Unicode 4.1 { "punct", u_ispunct, UPROPS_SRC_CHAR },
-    // new alias in Unicode 4.1 { "space", u_isspace, UPROPS_SRC_CHAR },
-    { "title", u_istitle, UPROPS_SRC_CHAR },
-    { "xdigit", u_isxdigit, UPROPS_SRC_CHAR }
-};
-
 // TEMPORARY: Remove when deprecated category code constructor is removed.
 static const UChar CATEGORY_NAMES[] = {
    // Must be kept in sync with uchar.h/UCharCategory
@ -931,14 +901,6 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {

 #define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}

-// TODO: Remove the following special-case code when
-// these four C99-compatibility properties are implemented
-// as enums/names.
-static UBool c99Filter(UChar32 ch, void* context) {
-    struct C99_Map* m = (struct C99_Map*) context;
-    return m->func(ch);
-}
-
 UnicodeSet&
 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
    if (U_FAILURE(ec)) return *this;
@ -974,7 +936,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,

    UProperty p;
    int32_t v;
-    UBool mustNotBeEmpty = FALSE;
+    UBool mustNotBeEmpty = FALSE, invert = FALSE;

    if (value.length() > 0) {
        p = u_getPropertyEnum(pname);
@ -1081,22 +1043,12 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                } else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
                    set(0, 0x7F);
                    return *this;
+                } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
+                    // [:Assigned:]=[:^Cn:]
+                    p = UCHAR_GENERAL_CATEGORY_MASK;
+                    v = U_GC_CN_MASK;
+                    invert = TRUE;
                } else {
-
-                    // TODO: Remove the following special-case code when
-                    // these four C99-compatibility properties are implemented
-                    // as enums/names.
-                    for (int32_t i=0; i<LENGTHOF(C99_DISPATCH); ++i) {
-                        int32_t c = uprv_comparePropertyNames(pname, C99_DISPATCH[i].name);
-                        if (c == 0) {
-                            applyFilter(c99Filter, (void*) &C99_DISPATCH[i], C99_DISPATCH[i].src, ec);
-                            return *this;
-                        } else if (c < 0) {
-                            // Further entries will not match; bail out
-                            break;
-                        }
-                    }
-
                    FAIL(ec);
                }
            }
@ -1104,6 +1056,9 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
    }
    
    applyIntPropertyValue(p, v, ec);
+    if(invert) {
+        complement();
+    }

    if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
        // mustNotBeEmpty is set to true if an empty set indicates
@ -1342,6 +1297,10 @@ const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
            case UPROPS_SRC_PROPSVEC:
                upropsvec_addPropertyStarts(&sa, &status);
                break;
+            case UPROPS_SRC_CHAR_AND_PROPSVEC:
+                uchar_addPropertyStarts(&sa, &status);
+                upropsvec_addPropertyStarts(&sa, &status);
+                break;
            case UPROPS_SRC_HST:
                uhst_addPropertyStarts(&sa, &status);
                break;
--- a/icu4c/source/common/uprops.c
+++ b/icu4c/source/common/uprops.c
@ -239,7 +239,12 @@ static const struct {
    { UPROPS_SRC_NORM,  0 },                                    /* UCHAR_NFKC_INERT */
    { UPROPS_SRC_NORM,  0 },                                    /* UCHAR_SEGMENT_STARTER */
    {  2,               U_MASK(UPROPS_V2_PATTERN_SYNTAX) },
-    {  2,               U_MASK(UPROPS_V2_PATTERN_WHITE_SPACE) }
+    {  2,               U_MASK(UPROPS_V2_PATTERN_WHITE_SPACE) },
+    { UPROPS_SRC_CHAR_AND_PROPSVEC,  0 },                       /* UCHAR_POSIX_ALNUM */
+    { UPROPS_SRC_CHAR,  0 },                                    /* UCHAR_POSIX_BLANK */
+    { UPROPS_SRC_CHAR,  0 },                                    /* UCHAR_POSIX_GRAPH */
+    { UPROPS_SRC_CHAR,  0 },                                    /* UCHAR_POSIX_PRINT */
+    { UPROPS_SRC_CHAR,  0 }                                     /* UCHAR_POSIX_XDIGIT */
 };

 U_CAPI UBool U_EXPORT2
@ -305,6 +310,26 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
                default:
                    break;
                }
+            } else if(column==UPROPS_SRC_CHAR) {
+                switch(which) {
+                case UCHAR_POSIX_BLANK:
+                    return u_isblank(c);
+                case UCHAR_POSIX_GRAPH:
+                    return u_isgraphPOSIX(c);
+                case UCHAR_POSIX_PRINT:
+                    return u_isprintPOSIX(c);
+                case UCHAR_POSIX_XDIGIT:
+                    return u_isxdigit(c);
+                default:
+                    break;
+                }
+            } else if(column==UPROPS_SRC_CHAR_AND_PROPSVEC) {
+                switch(which) {
+                case UCHAR_POSIX_ALNUM:
+                    return u_isalnumPOSIX(c);
+                default:
+                    break;
+                }
            }
        }
    }
--- a/icu4c/source/common/uprops.h
+++ b/icu4c/source/common/uprops.h
@ -224,6 +224,31 @@ uprv_getMaxValues(int32_t column);
 U_CFUNC UHangulSyllableType
 uchar_getHST(UChar32 c);

+/**
+ * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM.
+ * @internal
+ */
+U_CFUNC UBool
+u_isalnumPOSIX(UChar32 c);
+
+/**
+ * Checks if c is in
+ * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
+ * with space=\p{Whitespace} and Control=Cc.
+ * Implements UCHAR_POSIX_GRAPH.
+ * @internal
+ */
+U_CFUNC UBool
+u_isgraphPOSIX(UChar32 c);
+
+/**
+ * Checks if c is in \p{graph}\p{blank} - \p{cntrl}.
+ * Implements UCHAR_POSIX_PRINT.
+ * @internal
+ */
+U_CFUNC UBool
+u_isprintPOSIX(UChar32 c);
+
 /** Turn a bit index into a bit flag. @internal */
 #define FLAG(n) ((uint32_t)1<<(n))

@ -359,6 +384,8 @@ enum UPropertySource {
    UPROPS_SRC_CASE,
    /** From ubidi_props.c/ubidi.icu */
    UPROPS_SRC_BIDI,
+    /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
+    UPROPS_SRC_CHAR_AND_PROPSVEC,
    /** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
    UPROPS_SRC_COUNT
 };
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@ -889,6 +889,9 @@ void UnicodeSetTest::TestPropertySet() {
        "\\u0F73\\u0F75\\u0F81",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",

+        "[:Assigned:]",
+        "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
+        "\\u0888\\uFDD3\\uFFFE\\U00050005"
    };

    static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
@ -946,24 +949,20 @@ void UnicodeSetTest::TestPosixClasses() {
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(s1==s2);
    }
-    UVersionInfo ICU_34 = {3, 4, 0, 0};  // Time Bomb for bug 4199
    {
-        if (isICUVersionAtLeast(ICU_34)) {  // Time Bomb Test
-            UErrorCode status = U_ZERO_ERROR;
-            UnicodeSet s1("[:alnum:]", status);
-            UnicodeSet s2("[\\p{Alphabetic}\\p{DecimalNumber}]", status);
-            TEST_ASSERT_SUCCESS(status);
-            TEST_ASSERT(s1==s2);
-        }
+        UErrorCode status = U_ZERO_ERROR;
+        UnicodeSet s1("[:alnum:]", status);
+        UnicodeSet s2("[\\p{Alphabetic}\\p{DecimalNumber}]", status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(s1==s2);
    }
    {
-        if (isICUVersionAtLeast(ICU_34)) {  // Time Bomb Test
-            UErrorCode status = U_ZERO_ERROR;
-            UnicodeSet s1("[:space:]", status);
-            UnicodeSet s2("\\p{Whitespace}", status);
-            TEST_ASSERT_SUCCESS(status);
-            TEST_ASSERT(s1==s2);
-        }    }
+        UErrorCode status = U_ZERO_ERROR;
+        UnicodeSet s1("[:space:]", status);
+        UnicodeSet s2("\\p{Whitespace}", status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(s1==s2);
+    }
    {
        UErrorCode status = U_ZERO_ERROR;
        UnicodeSet s1("[:blank:]", status);
@ -974,39 +973,29 @@ void UnicodeSetTest::TestPosixClasses() {
        TEST_ASSERT(s1==s2);
    }
    {
-        if (isICUVersionAtLeast(ICU_34)) {  // Time Bomb Test
-            UErrorCode status = U_ZERO_ERROR;
-            UnicodeSet s1("[:cntrl:]", status);
-            TEST_ASSERT_SUCCESS(status);
-            UnicodeSet s2("\\p{Control}", status);
-            TEST_ASSERT_SUCCESS(status);
-            TEST_ASSERT(s1==s2);
-        }
+        UErrorCode status = U_ZERO_ERROR;
+        UnicodeSet s1("[:cntrl:]", status);
+        TEST_ASSERT_SUCCESS(status);
+        UnicodeSet s2("\\p{Control}", status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(s1==s2);
    }
    {
        UErrorCode status = U_ZERO_ERROR;
        UnicodeSet s1("[:graph:]", status);
        TEST_ASSERT_SUCCESS(status);
-        UnicodeSet s2("[^\\p{Whitespace}\\p{Control}\\p{Format}"
-            "\\p{Surrogate}\\p{Unassigned}]", status);
+        UnicodeSet s2("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(s1==s2);
    }
    {
-        if (isICUVersionAtLeast(ICU_34)) {  // Time Bomb Test
-            UErrorCode status = U_ZERO_ERROR;
-            UnicodeSet s1("[:print:]", status);
-            TEST_ASSERT_SUCCESS(status);
-            UnicodeSet s2(
-                "[[^\\p{Whitespace}\\p{Control}\\p{Format}\\p{Surrogate}\\p{Unassigned}]"
-                "[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}]]"
-                "-[\\p{Control}]]"
-                , status);
-            TEST_ASSERT_SUCCESS(status);
-            TEST_ASSERT(s1==s2);
-        }
+        UErrorCode status = U_ZERO_ERROR;
+        UnicodeSet s1("[:print:]", status);
+        TEST_ASSERT_SUCCESS(status);
+        UnicodeSet s2("[[:graph:][:blank:]-[\\p{Control}]]" ,status);
+        TEST_ASSERT_SUCCESS(status);
+        TEST_ASSERT(s1==s2);
    }
-
 }
 /**
 * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
--- a/icu4c/source/tools/genpname/SyntheticPropertyAliases.txt
+++ b/icu4c/source/tools/genpname/SyntheticPropertyAliases.txt
@ -1,5 +1,5 @@
 ######################################################################
-# Copyright (c) 2003-2004, International Business Machines
+# Copyright (c) 2003-2005, International Business Machines
 # Corporation and others.  All Rights Reserved.
 ######################################################################
 # Author: Alan Liu
@ -42,3 +42,11 @@ nfcinert; NFC_Inert
 nfkcinert; NFKC_Inert

 segstart; Segment_Starter
+
+# C/POSIX character classes that do not have Unicode property [value] aliases
+# see uchar.h
+n/a; alnum
+n/a; blank
+n/a; graph
+n/a; print
+n/a; xdigit
--- a/icu4c/source/tools/genpname/data.h
+++ b/icu4c/source/tools/genpname/data.h
--- a/icu4c/source/tools/genpname/preparse.pl
+++ b/icu4c/source/tools/genpname/preparse.pl
@ -65,6 +65,9 @@ my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";
 # Get the current year from the system
 my $YEAR = 1900+@{[localtime]}[5]; # Get the current year

+# Used to make "n/a" property aliases (Unicode or Synthetic) unique
+my $propNA = 0;
+
 #----------------------------------------------------------------------
 # Top level property keys for binary, enumerated, string, and double props
 my @TOP     = qw( _bp _ep _sp _dp _mp );
@ -304,7 +307,7 @@ END
                $i = $groupToInt{$groupString};
            } else {
                my @names = split(/\|/, $groupString);
-                die "Error: Wrong number of names in " . $groupString if (@names < 2);
+                die "Error: Wrong number of names in " . $groupString if (@names < 1);
                $i = @nameGroups; # index of group we are making 
                $groupToInt{$groupString} = $i; # Cache for reuse
                push @nameGroups, map { $stringToID{$_} } @names;
@ -589,7 +592,12 @@ sub merge_PropertyAliases {
                die "Error: Property $long_name not found (or used more than once)";
            }

-            my $value = $pa->{$long_name} . "|" . $long_name;
+            my $value;
+            if($pa->{$long_name} =~ m|^n/a\d*$|) {
+                $value = $long_name;
+            } else {
+                $value = $pa->{$long_name} . "|" . $long_name;
+            }
            if (exists $additional_property_aliases{$long_name}) {
                $value .= "|" . $additional_property_aliases{$long_name};
            }
@ -689,8 +697,8 @@ sub merge_PropertyValueAliases {
            my $l = $n;
            my $r = $pva->{$n};
-            # convert |n/a\d+| to blank
+            # convert |n/a\d*| to blank
-            $l = '' if ($l =~ m|^n/a\d+$|);
-            $r = '' if ($r =~ m|^n/a\d+$|);
+            $l = '' if ($l =~ m|^n/a\d*$|);
+            $r = '' if ($r =~ m|^n/a\d*$|);

            $hh->{$enum} = "$l|$r";
            # Don't delete the 'gc' properties because we need to share
@ -766,8 +774,6 @@ sub read_PropertyAliases {
    my $in = new FileHandle($filename, 'r');
    die "Error: Cannot open $filename" if (!defined $in);

-    my $sym = 0; # Used to make "n/a" strings unique
-
    while (<$in>) {

        # Read version (embedded in a comment)
@ -795,9 +801,12 @@ sub read_PropertyAliases {
            }

            # Make "n/a" strings unique
+            if ($short eq 'n/a') {
+                $short .= sprintf("%03d", $propNA++);
+            }
            my $long = $fields[0];
            if ($long eq 'n/a') {
-                $long .= sprintf("%03d", $sym++);
+                $long .= sprintf("%03d", $propNA++);
            }

            # Add long name->short name to the hash=pa hash table
@ -847,7 +856,7 @@ sub read_PropertyValueAliases {
    my $in = new FileHandle($filename, 'r');
    die "Error: Cannot open $filename" if (!defined $in);

-    my $sym = 0; # Used to make "n/a" strings unique
+    my $valueNA = 0; # Used to make "n/a" strings unique

    while (<$in>) {

@ -868,7 +877,7 @@ sub read_PropertyValueAliases {
            die "Error: Wrong number of fields in $filename"
                if (@fields < 2 || @fields > 3);
            # Make "n/a" strings unique
-            $fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a');
+            $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
            # Squash extra fields together
            while (@fields > 2) {
                my $f = pop @fields;