diff --git a/icu4c/source/common/uchar.c b/icu4c/source/common/uchar.c
index 5f3a656bb2a..5b00a493e6c 100644
--- a/icu4c/source/common/uchar.c
+++ b/icu4c/source/common/uchar.c
@@ -504,7 +504,7 @@ u_isUAlphabetic(UChar32 c) {
return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0;
}
-/* Checks if ch is a letter or a decimal digit */
+/* Checks if c is a letter or a decimal digit */
U_CAPI UBool U_EXPORT2
u_isalnum(UChar32 c) {
uint32_t props;
@@ -512,6 +512,15 @@ u_isalnum(UChar32 c) {
return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0);
}
+/**
+ * POSIX alnum: TRUE if c is alphabetic (u_isUAlphabetic) or a decimal digit (u_isdigit); implements UCHAR_POSIX_ALNUM.
+ * @internal
+ */
+U_CFUNC UBool
+u_isalnumPOSIX(UChar32 c) {
+ return (UBool)(u_isUAlphabetic(c) || u_isdigit(c));
+}
+
/* Checks if ch is a unicode character with assigned character type.*/
U_CAPI UBool U_EXPORT2
u_isdefined(UChar32 c) {
@@ -577,8 +586,10 @@ u_isblank(UChar32 c) {
if((uint32_t)c<=0x9f) {
return c==9 || c==0x20; /* TAB or SPACE */
} else {
- /* White_Space but not LS (Zl) or PS (Zp) */
- return u_isUWhiteSpace(c) && ((c&0xfffffffe)!=0x2028);
+ /* Zs */
+ uint32_t props;
+ GET_PROPS(c, props);
+ return (UBool)(GET_CATEGORY(props)==U_SPACE_SEPARATOR);
}
}
@@ -596,6 +607,22 @@ u_isprint(UChar32 c) {
return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0);
}
+/**
+ * Checks if c is in the set [\p{graph}\p{blank}] minus \p{cntrl}: TRUE if c is Zs or u_isgraphPOSIX(c).
+ * Implements UCHAR_POSIX_PRINT.
+ * @internal
+ */
+U_CFUNC UBool
+u_isprintPOSIX(UChar32 c) {
+ uint32_t props;
+ GET_PROPS(c, props);
+ /*
+ * The only cntrl character in graph+blank is TAB (in blank), so TAB is the only one to exclude.
+ * Here we implement (blank-TAB)=Zs directly instead of calling u_isblank() and removing TAB.
+ */
+ return (UBool)((GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c));
+}
+
U_CAPI UBool U_EXPORT2
u_isgraph(UChar32 c) {
uint32_t props;
@@ -606,6 +633,24 @@ u_isgraph(UChar32 c) {
==0);
}
+/**
+ * Checks if c is in the complemented set
+ * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
+ * with space=\p{Whitespace} and Control=Cc.
+ * Implements UCHAR_POSIX_GRAPH: TRUE if none of the Cc, Cs, Cn, Z category bits is set for c.
+ * @internal
+ */
+U_CFUNC UBool
+u_isgraphPOSIX(UChar32 c) {
+ uint32_t props;
+ GET_PROPS(c, props);
+ /* the union [\p{space}\p{gc=Control}] == the union [\p{gc=Z}\p{gc=Control}] */
+ /* comparing ==0 returns FALSE for the categories in the mask (Cc, Cs, Cn, Z), TRUE otherwise */
+ return (UBool)((CAT_MASK(props)&
+ (U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
+ ==0);
+}
+
U_CAPI UBool U_EXPORT2
u_ispunct(UChar32 c) {
uint32_t props;
@@ -1003,9 +1048,11 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
/* add code points with hardcoded properties, plus the ones following them */
+ /* add for u_isblank() */
+ USET_ADD_CP_AND_NEXT(sa, TAB);
+
/* add for IS_THAT_CONTROL_SPACE() */
- sa->add(sa->set, TAB); /* range TAB..CR */
- sa->add(sa->set, CR+1);
+ sa->add(sa->set, CR+1); /* range TAB..CR */
sa->add(sa->set, 0x1c);
sa->add(sa->set, 0x1f+1);
USET_ADD_CP_AND_NEXT(sa, NL);
diff --git a/icu4c/source/common/unicode/uchar.h b/icu4c/source/common/unicode/uchar.h
index c868b9ec670..16a64342acb 100644
--- a/icu4c/source/common/unicode/uchar.h
+++ b/icu4c/source/common/unicode/uchar.h
@@ -77,12 +77,31 @@ U_CDECL_BEGIN
* (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
* Another example: There is no "istitle()" class for titlecase characters.
*
- * A summary of the behavior of some C/POSIX character classification implementations
- * for Unicode is available at http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/posix_classes.html
+ * ICU 3.4 and later provide API access for all twelve C/POSIX character classes.
+ * ICU implements them according to the Standard Recommendations in
+ * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
+ * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
*
- * Important:
- * The behavior of the ICU C/POSIX-style character classification
- * functions is subject to change according to discussion of the above summary.
+ * API access for C/POSIX character classes is as follows:
+ * - alpha: u_isUAlphabetic(c) or u_hasBinaryProperty(c, UCHAR_ALPHABETIC)
+ * - lower: u_isULowercase(c) or u_hasBinaryProperty(c, UCHAR_LOWERCASE)
+ * - upper: u_isUUppercase(c) or u_hasBinaryProperty(c, UCHAR_UPPERCASE)
+ * - punct: u_ispunct(c)
+ * - digit: u_charType(c)==U_DECIMAL_DIGIT_NUMBER
+ * - xdigit: u_isxdigit(c) or u_hasBinaryProperty(c, UCHAR_POSIX_XDIGIT)
+ * - alnum: u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM)
+ * - space: u_isUWhiteSpace(c) or u_hasBinaryProperty(c, UCHAR_WHITE_SPACE)
+ * - blank: u_isblank(c) or u_hasBinaryProperty(c, UCHAR_POSIX_BLANK)
+ * - cntrl: u_charType(c)==U_CONTROL_CHAR
+ * - graph: u_hasBinaryProperty(c, UCHAR_POSIX_GRAPH)
+ * - print: u_hasBinaryProperty(c, UCHAR_POSIX_PRINT)
+ *
+ * Note: Some of the u_isxyz() functions in uchar.h predate, and do not match,
+ * the Standard Recommendations in UTS #18. Instead, they match Java
+ * functions according to their API documentation.
+ *
+ * The C/POSIX character classes are also available in UnicodeSet patterns,
+ * using patterns like [:graph:] or \p{graph}.
*
* Note: There are several ICU whitespace functions.
* Comparison:
@@ -368,6 +387,31 @@ typedef enum UProperty {
(http://www.unicode.org/reports/tr31/)
@draft ICU 3.4 */
UCHAR_PATTERN_WHITE_SPACE,
+ /** Binary property alnum (a C/POSIX character class).
+ Implemented according to the UTS #18 Annex C Standard Recommendation.
+ See the uchar.h file documentation.
+ @draft ICU 3.4 */
+ UCHAR_POSIX_ALNUM,
+ /** Binary property blank (a C/POSIX character class).
+ Implemented according to the UTS #18 Annex C Standard Recommendation.
+ See the uchar.h file documentation.
+ @draft ICU 3.4 */
+ UCHAR_POSIX_BLANK,
+ /** Binary property graph (a C/POSIX character class).
+ Implemented according to the UTS #18 Annex C Standard Recommendation.
+ See the uchar.h file documentation.
+ @draft ICU 3.4 */
+ UCHAR_POSIX_GRAPH,
+ /** Binary property print (a C/POSIX character class).
+ Implemented according to the UTS #18 Annex C Standard Recommendation.
+ See the uchar.h file documentation.
+ @draft ICU 3.4 */
+ UCHAR_POSIX_PRINT,
+ /** Binary property xdigit (a C/POSIX character class).
+ Implemented according to the UTS #18 Annex C Standard Recommendation.
+ See the uchar.h file documentation.
+ @draft ICU 3.4 */
+ UCHAR_POSIX_XDIGIT,
/** One more than the last constant for binary Unicode properties. @stable ICU 2.1 */
UCHAR_BINARY_LIMIT,
@@ -1739,7 +1783,6 @@ u_getNumericValue(UChar32 c);
* @see UCHAR_LOWERCASE
* @see u_isupper
* @see u_istitle
- * @see u_islower
* @stable ICU 2.0
*/
U_STABLE UBool U_EXPORT2
diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index 9e45b52f1cb..639305e7c38 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -569,7 +569,8 @@ public:
* correspond to the following sets:
*
* "ANY" = [\\u0000-\\U0010FFFF],
- * "ASCII" = [\\u0000-\\u007F].
+ * "ASCII" = [\\u0000-\\u007F],
+ * "Assigned" = [:^Cn:].
*
* @param value a value alias, either short or long. The name is matched
* loosely. See PropertyValueAliases.txt for names and a description of
diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h
index 6a243d7fbe4..9a2066c9435 100644
--- a/icu4c/source/common/unicode/uset.h
+++ b/icu4c/source/common/unicode/uset.h
@@ -265,7 +265,8 @@ uset_applyIntPropertyValue(USet* set,
* matched loosely and correspond to the following sets:
*
* "ANY" = [\\u0000-\\U0010FFFF],
- * "ASCII" = [\\u0000-\\u007F].
+ * "ASCII" = [\\u0000-\\u007F],
+ * "Assigned" = [:^Cn:].
*
* @param propLength the length of the prop, or -1 if NULL
*
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 069c59fe2f8..590c5a84b13 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -77,42 +77,12 @@ static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
// Special property set IDs
static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
+static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
// Unicode name property alias
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
-// TODO: Remove the following special-case code when
-// these four C99-compatibility properties are implemented
-// as enums/names.
-U_CDECL_BEGIN
- typedef UBool (U_CALLCONV *C99_Property_Function)(UChar32);
-U_CDECL_END
-static const struct C99_Map {
- const char* name;
- C99_Property_Function func;
- UPropertySource src;
-} C99_DISPATCH[] = {
- // These three entries omitted; they clash with PropertyAliases
- // names for Unicode properties, so UnicodeSet already maps them
- // to those properties.
- //{ "alpha", u_isalpha, UPROPS_SRC_PROPSVEC },
- //{ "lower", u_islower, UPROPS_SRC_CASE },
- //{ "upper", u_isupper, UPROPS_SRC_CASE },
-
- // MUST be in SORTED order
- { "alnum", u_isalnum, UPROPS_SRC_CHAR },
- { "blank", u_isblank, UPROPS_SRC_PROPSVEC },
- // new alias in Unicode 4.1 { "cntrl", u_iscntrl, UPROPS_SRC_CHAR },
- // new alias in Unicode 4.1 { "digit", u_isdigit, UPROPS_SRC_CHAR },
- { "graph", u_isgraph, UPROPS_SRC_CHAR },
- { "print", u_isprint, UPROPS_SRC_CHAR },
- // new alias in Unicode 4.1 { "punct", u_ispunct, UPROPS_SRC_CHAR },
- // new alias in Unicode 4.1 { "space", u_isspace, UPROPS_SRC_CHAR },
- { "title", u_istitle, UPROPS_SRC_CHAR },
- { "xdigit", u_isxdigit, UPROPS_SRC_CHAR }
-};
-
// TEMPORARY: Remove when deprecated category code constructor is removed.
static const UChar CATEGORY_NAMES[] = {
// Must be kept in sync with uchar.h/UCharCategory
@@ -931,14 +901,6 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
-// TODO: Remove the following special-case code when
-// these four C99-compatibility properties are implemented
-// as enums/names.
-static UBool c99Filter(UChar32 ch, void* context) {
- struct C99_Map* m = (struct C99_Map*) context;
- return m->func(ch);
-}
-
UnicodeSet&
UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
if (U_FAILURE(ec)) return *this;
@@ -974,7 +936,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
UProperty p;
int32_t v;
- UBool mustNotBeEmpty = FALSE;
+ UBool mustNotBeEmpty = FALSE, invert = FALSE;
if (value.length() > 0) {
p = u_getPropertyEnum(pname);
@@ -1081,22 +1043,12 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
} else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
set(0, 0x7F);
return *this;
+ } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
+ // [:Assigned:]=[:^Cn:]
+ p = UCHAR_GENERAL_CATEGORY_MASK;
+ v = U_GC_CN_MASK;
+ invert = TRUE;
} else {
-
- // TODO: Remove the following special-case code when
- // these four C99-compatibility properties are implemented
- // as enums/names.
- for (int32_t i=0; i{$long_name} . "|" . $long_name;
+ my $value;
+ if($pa->{$long_name} =~ m|^n/a\d*$|) {
+ $value = $long_name;
+ } else {
+ $value = $pa->{$long_name} . "|" . $long_name;
+ }
if (exists $additional_property_aliases{$long_name}) {
$value .= "|" . $additional_property_aliases{$long_name};
}
@@ -689,8 +697,8 @@ sub merge_PropertyValueAliases {
my $l = $n;
my $r = $pva->{$n};
# convert |n/a\d+| to blank
- $l = '' if ($l =~ m|^n/a\d+$|);
- $r = '' if ($r =~ m|^n/a\d+$|);
+ $l = '' if ($l =~ m|^n/a\d*$|);
+ $r = '' if ($r =~ m|^n/a\d*$|);
$hh->{$enum} = "$l|$r";
# Don't delete the 'gc' properties because we need to share
@@ -766,8 +774,6 @@ sub read_PropertyAliases {
my $in = new FileHandle($filename, 'r');
die "Error: Cannot open $filename" if (!defined $in);
- my $sym = 0; # Used to make "n/a" strings unique
-
while (<$in>) {
# Read version (embedded in a comment)
@@ -795,9 +801,12 @@ sub read_PropertyAliases {
}
# Make "n/a" strings unique
+ if ($short eq 'n/a') {
+ $short .= sprintf("%03d", $propNA++);
+ }
my $long = $fields[0];
if ($long eq 'n/a') {
- $long .= sprintf("%03d", $sym++);
+ $long .= sprintf("%03d", $propNA++);
}
# Add long name->short name to the hash=pa hash table
@@ -847,7 +856,7 @@ sub read_PropertyValueAliases {
my $in = new FileHandle($filename, 'r');
die "Error: Cannot open $filename" if (!defined $in);
- my $sym = 0; # Used to make "n/a" strings unique
+ my $valueNA = 0; # Used to make "n/a" strings unique
while (<$in>) {
@@ -868,7 +877,7 @@ sub read_PropertyValueAliases {
die "Error: Wrong number of fields in $filename"
if (@fields < 2 || @fields > 3);
# Make "n/a" strings unique
- $fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a');
+ $fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
# Squash extra fields together
while (@fields > 2) {
my $f = pop @fields;