mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-4199 enum/name API support for C/POSIX character classes, and UnicodeSet support for [:Assigned:]
X-SVN-Rev: 17730
This commit is contained in:
parent
291516499b
commit
e6a0df52ee
11 changed files with 1299 additions and 1175 deletions
|
@ -504,7 +504,7 @@ u_isUAlphabetic(UChar32 c) {
|
|||
return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0;
|
||||
}
|
||||
|
||||
/* Checks if ch is a letter or a decimal digit */
|
||||
/* Checks if c is a letter or a decimal digit */
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isalnum(UChar32 c) {
|
||||
uint32_t props;
|
||||
|
@ -512,6 +512,15 @@ u_isalnum(UChar32 c) {
|
|||
return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
u_isalnumPOSIX(UChar32 c) {
|
||||
return (UBool)(u_isUAlphabetic(c) || u_isdigit(c));
|
||||
}
|
||||
|
||||
/* Checks if c is a Unicode character with an assigned character type. */
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isdefined(UChar32 c) {
|
||||
|
@ -577,8 +586,10 @@ u_isblank(UChar32 c) {
|
|||
if((uint32_t)c<=0x9f) {
|
||||
return c==9 || c==0x20; /* TAB or SPACE */
|
||||
} else {
|
||||
/* White_Space but not LS (Zl) or PS (Zp) */
|
||||
return u_isUWhiteSpace(c) && ((c&0xfffffffe)!=0x2028);
|
||||
/* Zs */
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(GET_CATEGORY(props)==U_SPACE_SEPARATOR);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -596,6 +607,22 @@ u_isprint(UChar32 c) {
|
|||
return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if c is in \p{graph}\p{blank} - \p{cntrl}.
|
||||
* Implements UCHAR_POSIX_PRINT.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
u_isprintPOSIX(UChar32 c) {
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
/*
|
||||
* The only cntrl character in graph+blank is TAB (in blank).
|
||||
* Here we implement (blank-TAB)=Zs instead of calling u_isblank().
|
||||
*/
|
||||
return (UBool)((GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c));
|
||||
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isgraph(UChar32 c) {
|
||||
uint32_t props;
|
||||
|
@ -606,6 +633,24 @@ u_isgraph(UChar32 c) {
|
|||
==0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if c is in
|
||||
* [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
|
||||
* with space=\p{Whitespace} and Control=Cc.
|
||||
* Implements UCHAR_POSIX_GRAPH.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
u_isgraphPOSIX(UChar32 c) {
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
/* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
|
||||
/* comparing ==0 returns FALSE for the categories mentioned */
|
||||
return (UBool)((CAT_MASK(props)&
|
||||
(U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
|
||||
==0);
|
||||
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_ispunct(UChar32 c) {
|
||||
uint32_t props;
|
||||
|
@ -1003,9 +1048,11 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
|
|||
|
||||
/* add code points with hardcoded properties, plus the ones following them */
|
||||
|
||||
/* add for u_isblank() */
|
||||
USET_ADD_CP_AND_NEXT(sa, TAB);
|
||||
|
||||
/* add for IS_THAT_CONTROL_SPACE() */
|
||||
sa->add(sa->set, TAB); /* range TAB..CR */
|
||||
sa->add(sa->set, CR+1);
|
||||
sa->add(sa->set, CR+1); /* range TAB..CR */
|
||||
sa->add(sa->set, 0x1c);
|
||||
sa->add(sa->set, 0x1f+1);
|
||||
USET_ADD_CP_AND_NEXT(sa, NL);
|
||||
|
|
|
@ -77,12 +77,31 @@ U_CDECL_BEGIN
|
|||
* (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
|
||||
* Another example: There is no "istitle()" class for titlecase characters.
|
||||
*
|
||||
* A summary of the behavior of some C/POSIX character classification implementations
|
||||
* for Unicode is available at http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/posix_classes.html
|
||||
* ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
|
||||
* ICU implements them according to the Standard Recommendations in
|
||||
* Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
|
||||
* (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
|
||||
*
|
||||
* <strong>Important</strong>:
|
||||
* The behavior of the ICU C/POSIX-style character classification
|
||||
* functions is subject to change according to discussion of the above summary.
|
||||
* API access for C/POSIX character classes is as follows:
|
||||
* - alpha: u_isUAlphabetic(c) or u_hasBinaryProperty(c, UCHAR_ALPHABETIC)
|
||||
* - lower: u_isULowercase(c) or u_hasBinaryProperty(c, UCHAR_LOWERCASE)
|
||||
* - upper: u_isUUppercase(c) or u_hasBinaryProperty(c, UCHAR_UPPERCASE)
|
||||
* - punct: u_ispunct(c)
|
||||
* - digit: u_charType(c)==U_DECIMAL_DIGIT_NUMBER
|
||||
* - xdigit: u_isxdigit(c) or u_hasBinaryProperty(c, UCHAR_POSIX_XDIGIT)
|
||||
* - alnum: u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM)
|
||||
* - space: u_isUWhiteSpace(c) or u_hasBinaryProperty(c, UCHAR_WHITE_SPACE)
|
||||
* - blank: u_isblank(c) or u_hasBinaryProperty(c, UCHAR_POSIX_BLANK)
|
||||
* - cntrl: u_charType(c)==U_CONTROL_CHAR
|
||||
* - graph: u_hasBinaryProperty(c, UCHAR_POSIX_GRAPH)
|
||||
* - print: u_hasBinaryProperty(c, UCHAR_POSIX_PRINT)
|
||||
*
|
||||
* Note: Some of the u_isxyz() functions in uchar.h predate, and do not match,
|
||||
* the Standard Recommendations in UTS #18. Instead, they match Java
|
||||
* functions according to their API documentation.
|
||||
*
|
||||
* The C/POSIX character classes are also available in UnicodeSet patterns,
|
||||
* using patterns like [:graph:] or \p{graph}.
|
||||
*
|
||||
* Note: There are several ICU whitespace functions.
|
||||
* Comparison:
|
||||
|
@ -368,6 +387,31 @@ typedef enum UProperty {
|
|||
(http://www.unicode.org/reports/tr31/)
|
||||
@draft ICU 3.4 */
|
||||
UCHAR_PATTERN_WHITE_SPACE,
|
||||
/** Binary property alnum (a C/POSIX character class).
|
||||
Implemented according to the UTS #18 Annex C Standard Recommendation.
|
||||
See the uchar.h file documentation.
|
||||
@draft ICU 3.4 */
|
||||
UCHAR_POSIX_ALNUM,
|
||||
/** Binary property blank (a C/POSIX character class).
|
||||
Implemented according to the UTS #18 Annex C Standard Recommendation.
|
||||
See the uchar.h file documentation.
|
||||
@draft ICU 3.4 */
|
||||
UCHAR_POSIX_BLANK,
|
||||
/** Binary property graph (a C/POSIX character class).
|
||||
Implemented according to the UTS #18 Annex C Standard Recommendation.
|
||||
See the uchar.h file documentation.
|
||||
@draft ICU 3.4 */
|
||||
UCHAR_POSIX_GRAPH,
|
||||
/** Binary property print (a C/POSIX character class).
|
||||
Implemented according to the UTS #18 Annex C Standard Recommendation.
|
||||
See the uchar.h file documentation.
|
||||
@draft ICU 3.4 */
|
||||
UCHAR_POSIX_PRINT,
|
||||
/** Binary property xdigit (a C/POSIX character class).
|
||||
Implemented according to the UTS #18 Annex C Standard Recommendation.
|
||||
See the uchar.h file documentation.
|
||||
@draft ICU 3.4 */
|
||||
UCHAR_POSIX_XDIGIT,
|
||||
/** One more than the last constant for binary Unicode properties. @stable ICU 2.1 */
|
||||
UCHAR_BINARY_LIMIT,
|
||||
|
||||
|
@ -1739,7 +1783,6 @@ u_getNumericValue(UChar32 c);
|
|||
* @see UCHAR_LOWERCASE
|
||||
* @see u_isupper
|
||||
* @see u_istitle
|
||||
* @see u_islower
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_STABLE UBool U_EXPORT2
|
||||
|
|
|
@ -569,7 +569,8 @@ public:
|
|||
* correspond to the following sets:
|
||||
*
|
||||
* "ANY" = [\\u0000-\\U0010FFFF],
|
||||
* "ASCII" = [\\u0000-\\u007F].
|
||||
* "ASCII" = [\\u0000-\\u007F],
|
||||
* "Assigned" = [:^Cn:].
|
||||
*
|
||||
* @param value a value alias, either short or long. The name is matched
|
||||
* loosely. See PropertyValueAliases.txt for names and a description of
|
||||
|
|
|
@ -265,7 +265,8 @@ uset_applyIntPropertyValue(USet* set,
|
|||
* matched loosely and correspond to the following sets:
|
||||
*
|
||||
* "ANY" = [\\u0000-\\U0010FFFF],
|
||||
* "ASCII" = [\\u0000-\\u007F].
|
||||
* "ASCII" = [\\u0000-\\u007F],
|
||||
* "Assigned" = [:^Cn:].
|
||||
*
|
||||
* @param propLength the length of the prop, or -1 if NULL
|
||||
*
|
||||
|
|
|
@ -77,42 +77,12 @@ static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
|
|||
// Special property set IDs
|
||||
static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
|
||||
static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
|
||||
static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
|
||||
|
||||
// Unicode name property alias
|
||||
#define NAME_PROP "na"
|
||||
#define NAME_PROP_LENGTH 2
|
||||
|
||||
// TODO: Remove the following special-case code when
|
||||
// these four C99-compatibility properties are implemented
|
||||
// as enums/names.
|
||||
U_CDECL_BEGIN
|
||||
typedef UBool (U_CALLCONV *C99_Property_Function)(UChar32);
|
||||
U_CDECL_END
|
||||
static const struct C99_Map {
|
||||
const char* name;
|
||||
C99_Property_Function func;
|
||||
UPropertySource src;
|
||||
} C99_DISPATCH[] = {
|
||||
// These three entries omitted; they clash with PropertyAliases
|
||||
// names for Unicode properties, so UnicodeSet already maps them
|
||||
// to those properties.
|
||||
//{ "alpha", u_isalpha, UPROPS_SRC_PROPSVEC },
|
||||
//{ "lower", u_islower, UPROPS_SRC_CASE },
|
||||
//{ "upper", u_isupper, UPROPS_SRC_CASE },
|
||||
|
||||
// MUST be in SORTED order
|
||||
{ "alnum", u_isalnum, UPROPS_SRC_CHAR },
|
||||
{ "blank", u_isblank, UPROPS_SRC_PROPSVEC },
|
||||
// new alias in Unicode 4.1 { "cntrl", u_iscntrl, UPROPS_SRC_CHAR },
|
||||
// new alias in Unicode 4.1 { "digit", u_isdigit, UPROPS_SRC_CHAR },
|
||||
{ "graph", u_isgraph, UPROPS_SRC_CHAR },
|
||||
{ "print", u_isprint, UPROPS_SRC_CHAR },
|
||||
// new alias in Unicode 4.1 { "punct", u_ispunct, UPROPS_SRC_CHAR },
|
||||
// new alias in Unicode 4.1 { "space", u_isspace, UPROPS_SRC_CHAR },
|
||||
{ "title", u_istitle, UPROPS_SRC_CHAR },
|
||||
{ "xdigit", u_isxdigit, UPROPS_SRC_CHAR }
|
||||
};
|
||||
|
||||
// TEMPORARY: Remove when deprecated category code constructor is removed.
|
||||
static const UChar CATEGORY_NAMES[] = {
|
||||
// Must be kept in sync with uchar.h/UCharCategory
|
||||
|
@ -931,14 +901,6 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
|
|||
|
||||
#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
|
||||
|
||||
// TODO: Remove the following special-case code when
|
||||
// these four C99-compatibility properties are implemented
|
||||
// as enums/names.
|
||||
static UBool c99Filter(UChar32 ch, void* context) {
|
||||
struct C99_Map* m = (struct C99_Map*) context;
|
||||
return m->func(ch);
|
||||
}
|
||||
|
||||
UnicodeSet&
|
||||
UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
|
||||
if (U_FAILURE(ec)) return *this;
|
||||
|
@ -974,7 +936,7 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
|
|||
|
||||
UProperty p;
|
||||
int32_t v;
|
||||
UBool mustNotBeEmpty = FALSE;
|
||||
UBool mustNotBeEmpty = FALSE, invert = FALSE;
|
||||
|
||||
if (value.length() > 0) {
|
||||
p = u_getPropertyEnum(pname);
|
||||
|
@ -1081,22 +1043,12 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
|
|||
} else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
|
||||
set(0, 0x7F);
|
||||
return *this;
|
||||
} else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
|
||||
// [:Assigned:]=[:^Cn:]
|
||||
p = UCHAR_GENERAL_CATEGORY_MASK;
|
||||
v = U_GC_CN_MASK;
|
||||
invert = TRUE;
|
||||
} else {
|
||||
|
||||
// TODO: Remove the following special-case code when
|
||||
// these four C99-compatibility properties are implemented
|
||||
// as enums/names.
|
||||
for (int32_t i=0; i<LENGTHOF(C99_DISPATCH); ++i) {
|
||||
int32_t c = uprv_comparePropertyNames(pname, C99_DISPATCH[i].name);
|
||||
if (c == 0) {
|
||||
applyFilter(c99Filter, (void*) &C99_DISPATCH[i], C99_DISPATCH[i].src, ec);
|
||||
return *this;
|
||||
} else if (c < 0) {
|
||||
// Further entries will not match; bail out
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
FAIL(ec);
|
||||
}
|
||||
}
|
||||
|
@ -1104,6 +1056,9 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
|
|||
}
|
||||
|
||||
applyIntPropertyValue(p, v, ec);
|
||||
if(invert) {
|
||||
complement();
|
||||
}
|
||||
|
||||
if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
|
||||
// mustNotBeEmpty is set to true if an empty set indicates
|
||||
|
@ -1342,6 +1297,10 @@ const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
|
|||
case UPROPS_SRC_PROPSVEC:
|
||||
upropsvec_addPropertyStarts(&sa, &status);
|
||||
break;
|
||||
case UPROPS_SRC_CHAR_AND_PROPSVEC:
|
||||
uchar_addPropertyStarts(&sa, &status);
|
||||
upropsvec_addPropertyStarts(&sa, &status);
|
||||
break;
|
||||
case UPROPS_SRC_HST:
|
||||
uhst_addPropertyStarts(&sa, &status);
|
||||
break;
|
||||
|
|
|
@ -239,7 +239,12 @@ static const struct {
|
|||
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_NFKC_INERT */
|
||||
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_SEGMENT_STARTER */
|
||||
{ 2, U_MASK(UPROPS_V2_PATTERN_SYNTAX) },
|
||||
{ 2, U_MASK(UPROPS_V2_PATTERN_WHITE_SPACE) }
|
||||
{ 2, U_MASK(UPROPS_V2_PATTERN_WHITE_SPACE) },
|
||||
{ UPROPS_SRC_CHAR_AND_PROPSVEC, 0 }, /* UCHAR_POSIX_ALNUM */
|
||||
{ UPROPS_SRC_CHAR, 0 }, /* UCHAR_POSIX_BLANK */
|
||||
{ UPROPS_SRC_CHAR, 0 }, /* UCHAR_POSIX_GRAPH */
|
||||
{ UPROPS_SRC_CHAR, 0 }, /* UCHAR_POSIX_PRINT */
|
||||
{ UPROPS_SRC_CHAR, 0 } /* UCHAR_POSIX_XDIGIT */
|
||||
};
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
|
@ -305,6 +310,26 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
|
|||
default:
|
||||
break;
|
||||
}
|
||||
} else if(column==UPROPS_SRC_CHAR) {
|
||||
switch(which) {
|
||||
case UCHAR_POSIX_BLANK:
|
||||
return u_isblank(c);
|
||||
case UCHAR_POSIX_GRAPH:
|
||||
return u_isgraphPOSIX(c);
|
||||
case UCHAR_POSIX_PRINT:
|
||||
return u_isprintPOSIX(c);
|
||||
case UCHAR_POSIX_XDIGIT:
|
||||
return u_isxdigit(c);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} else if(column==UPROPS_SRC_CHAR_AND_PROPSVEC) {
|
||||
switch(which) {
|
||||
case UCHAR_POSIX_ALNUM:
|
||||
return u_isalnumPOSIX(c);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -224,6 +224,31 @@ uprv_getMaxValues(int32_t column);
|
|||
U_CFUNC UHangulSyllableType
|
||||
uchar_getHST(UChar32 c);
|
||||
|
||||
/**
|
||||
* Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
u_isalnumPOSIX(UChar32 c);
|
||||
|
||||
/**
|
||||
* Checks if c is in
|
||||
* [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
|
||||
* with space=\p{Whitespace} and Control=Cc.
|
||||
* Implements UCHAR_POSIX_GRAPH.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
u_isgraphPOSIX(UChar32 c);
|
||||
|
||||
/**
|
||||
* Checks if c is in \p{graph}\p{blank} - \p{cntrl}.
|
||||
* Implements UCHAR_POSIX_PRINT.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
u_isprintPOSIX(UChar32 c);
|
||||
|
||||
/** Turn a bit index into a bit flag. @internal */
|
||||
#define FLAG(n) ((uint32_t)1<<(n))
|
||||
|
||||
|
@ -359,6 +384,8 @@ enum UPropertySource {
|
|||
UPROPS_SRC_CASE,
|
||||
/** From ubidi_props.c/ubidi.icu */
|
||||
UPROPS_SRC_BIDI,
|
||||
/** From uchar.c/uprops.icu main trie as well as properties vectors trie */
|
||||
UPROPS_SRC_CHAR_AND_PROPSVEC,
|
||||
/** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
|
||||
UPROPS_SRC_COUNT
|
||||
};
|
||||
|
|
|
@ -889,6 +889,9 @@ void UnicodeSetTest::TestPropertySet() {
|
|||
"\\u0F73\\u0F75\\u0F81",
|
||||
"abcd\\u0300\\u0301\\u00c0\\u00c5",
|
||||
|
||||
"[:Assigned:]",
|
||||
"A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
|
||||
"\\u0888\\uFDD3\\uFFFE\\U00050005"
|
||||
};
|
||||
|
||||
static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
|
||||
|
@ -946,24 +949,20 @@ void UnicodeSetTest::TestPosixClasses() {
|
|||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(s1==s2);
|
||||
}
|
||||
UVersionInfo ICU_34 = {3, 4, 0, 0}; // Time Bomb for bug 4199
|
||||
{
|
||||
if (isICUVersionAtLeast(ICU_34)) { // Time Bomb Test
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSet s1("[:alnum:]", status);
|
||||
UnicodeSet s2("[\\p{Alphabetic}\\p{DecimalNumber}]", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(s1==s2);
|
||||
}
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSet s1("[:alnum:]", status);
|
||||
UnicodeSet s2("[\\p{Alphabetic}\\p{DecimalNumber}]", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(s1==s2);
|
||||
}
|
||||
{
|
||||
if (isICUVersionAtLeast(ICU_34)) { // Time Bomb Test
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSet s1("[:space:]", status);
|
||||
UnicodeSet s2("\\p{Whitespace}", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(s1==s2);
|
||||
} }
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSet s1("[:space:]", status);
|
||||
UnicodeSet s2("\\p{Whitespace}", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(s1==s2);
|
||||
}
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSet s1("[:blank:]", status);
|
||||
|
@ -974,39 +973,29 @@ void UnicodeSetTest::TestPosixClasses() {
|
|||
TEST_ASSERT(s1==s2);
|
||||
}
|
||||
{
|
||||
if (isICUVersionAtLeast(ICU_34)) { // Time Bomb Test
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSet s1("[:cntrl:]", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
UnicodeSet s2("\\p{Control}", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(s1==s2);
|
||||
}
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSet s1("[:cntrl:]", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
UnicodeSet s2("\\p{Control}", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(s1==s2);
|
||||
}
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSet s1("[:graph:]", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
UnicodeSet s2("[^\\p{Whitespace}\\p{Control}\\p{Format}"
|
||||
"\\p{Surrogate}\\p{Unassigned}]", status);
|
||||
UnicodeSet s2("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(s1==s2);
|
||||
}
|
||||
{
|
||||
if (isICUVersionAtLeast(ICU_34)) { // Time Bomb Test
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSet s1("[:print:]", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
UnicodeSet s2(
|
||||
"[[^\\p{Whitespace}\\p{Control}\\p{Format}\\p{Surrogate}\\p{Unassigned}]"
|
||||
"[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}]]"
|
||||
"-[\\p{Control}]]"
|
||||
, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(s1==s2);
|
||||
}
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeSet s1("[:print:]", status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
UnicodeSet s2("[[:graph:][:blank:]-[\\p{Control}]]" ,status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
TEST_ASSERT(s1==s2);
|
||||
}
|
||||
|
||||
}
|
||||
/**
|
||||
* Test cloning of UnicodeSet. For C++, we test the copy constructor.
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
######################################################################
|
||||
# Copyright (c) 2003-2004, International Business Machines
|
||||
# Copyright (c) 2003-2005, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
######################################################################
|
||||
# Author: Alan Liu
|
||||
|
@ -42,3 +42,11 @@ nfcinert; NFC_Inert
|
|||
nfkcinert; NFKC_Inert
|
||||
|
||||
segstart; Segment_Starter
|
||||
|
||||
# C/POSIX character classes that do not have Unicode property [value] aliases
|
||||
# see uchar.h
|
||||
n/a; alnum
|
||||
n/a; blank
|
||||
n/a; graph
|
||||
n/a; print
|
||||
n/a; xdigit
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -65,6 +65,9 @@ my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";
|
|||
# Get the current year from the system
|
||||
my $YEAR = 1900+@{[localtime]}[5]; # Get the current year
|
||||
|
||||
# Used to make "n/a" property aliases (Unicode or Synthetic) unique
|
||||
my $propNA = 0;
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# Top level property keys for binary, enumerated, string, and double props
|
||||
my @TOP = qw( _bp _ep _sp _dp _mp );
|
||||
|
@ -304,7 +307,7 @@ END
|
|||
$i = $groupToInt{$groupString};
|
||||
} else {
|
||||
my @names = split(/\|/, $groupString);
|
||||
die "Error: Wrong number of names in " . $groupString if (@names < 2);
|
||||
die "Error: Wrong number of names in " . $groupString if (@names < 1);
|
||||
$i = @nameGroups; # index of group we are making
|
||||
$groupToInt{$groupString} = $i; # Cache for reuse
|
||||
push @nameGroups, map { $stringToID{$_} } @names;
|
||||
|
@ -589,7 +592,12 @@ sub merge_PropertyAliases {
|
|||
die "Error: Property $long_name not found (or used more than once)";
|
||||
}
|
||||
|
||||
my $value = $pa->{$long_name} . "|" . $long_name;
|
||||
my $value;
|
||||
if($pa->{$long_name} =~ m|^n/a\d*$|) {
|
||||
$value = $long_name;
|
||||
} else {
|
||||
$value = $pa->{$long_name} . "|" . $long_name;
|
||||
}
|
||||
if (exists $additional_property_aliases{$long_name}) {
|
||||
$value .= "|" . $additional_property_aliases{$long_name};
|
||||
}
|
||||
|
@ -689,8 +697,8 @@ sub merge_PropertyValueAliases {
|
|||
my $l = $n;
|
||||
my $r = $pva->{$n};
|
||||
# convert "n/a" placeholders (bare |n/a| or numbered |n/a\d+|) to blank
|
||||
$l = '' if ($l =~ m|^n/a\d+$|);
|
||||
$r = '' if ($r =~ m|^n/a\d+$|);
|
||||
$l = '' if ($l =~ m|^n/a\d*$|);
|
||||
$r = '' if ($r =~ m|^n/a\d*$|);
|
||||
|
||||
$hh->{$enum} = "$l|$r";
|
||||
# Don't delete the 'gc' properties because we need to share
|
||||
|
@ -766,8 +774,6 @@ sub read_PropertyAliases {
|
|||
my $in = new FileHandle($filename, 'r');
|
||||
die "Error: Cannot open $filename" if (!defined $in);
|
||||
|
||||
my $sym = 0; # Used to make "n/a" strings unique
|
||||
|
||||
while (<$in>) {
|
||||
|
||||
# Read version (embedded in a comment)
|
||||
|
@ -795,9 +801,12 @@ sub read_PropertyAliases {
|
|||
}
|
||||
|
||||
# Make "n/a" strings unique
|
||||
if ($short eq 'n/a') {
|
||||
$short .= sprintf("%03d", $propNA++);
|
||||
}
|
||||
my $long = $fields[0];
|
||||
if ($long eq 'n/a') {
|
||||
$long .= sprintf("%03d", $sym++);
|
||||
$long .= sprintf("%03d", $propNA++);
|
||||
}
|
||||
|
||||
# Add the long name -> short name mapping to the hash table (hash=pa)
|
||||
|
@ -847,7 +856,7 @@ sub read_PropertyValueAliases {
|
|||
my $in = new FileHandle($filename, 'r');
|
||||
die "Error: Cannot open $filename" if (!defined $in);
|
||||
|
||||
my $sym = 0; # Used to make "n/a" strings unique
|
||||
my $valueNA = 0; # Used to make "n/a" strings unique
|
||||
|
||||
while (<$in>) {
|
||||
|
||||
|
@ -868,7 +877,7 @@ sub read_PropertyValueAliases {
|
|||
die "Error: Wrong number of fields in $filename"
|
||||
if (@fields < 2 || @fields > 3);
|
||||
# Make "n/a" strings unique
|
||||
$fields[0] .= sprintf("%03d", $sym++) if ($fields[0] eq 'n/a');
|
||||
$fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
|
||||
# Squash extra fields together
|
||||
while (@fields > 2) {
|
||||
my $f = pop @fields;
|
||||
|
|
Loading…
Add table
Reference in a new issue