mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-2556 implement UCHAR_GENERAL_CATEGORY_MASK in name api; refine min/max behavior
X-SVN-Rev: 10596
This commit is contained in:
parent
206c6b4cfd
commit
53406ecc37
4 changed files with 98 additions and 17 deletions
|
@ -147,23 +147,70 @@ static UBool load() {
|
|||
// on it. If it cannot obtain a pointer, because valid data is not
|
||||
// available, then it returns NULL or UCHAR_INVALID_CODE.
|
||||
|
||||
// NOTE (ICU 2.4) For the 2.4 release it was decided late in the cycle
|
||||
// to add a new enum to UProperty, UCHAR_GENERAL_CATEGORY_MASK. This
|
||||
// enum would specify UCharCategory mask values. Because of time
|
||||
// constraints, the underlying binary data and genprop scripts were
|
||||
// not updated. So the PNAME->... API takes UCHAR_GENERAL_CATEGORY
|
||||
// and associates it with a MASK value. We munge things to make this
|
||||
// associate with a UCharCategory value, and we make
|
||||
// UCHAR_GENERAL_CATEGORY_MASK correspond to the mask value.
|
||||
|
||||
// We add a synthetic (not in PropertyAliases.txt) pair of property
|
||||
// names corresponding to UCHAR_GENERAL_CATEGORY_MASK:
|
||||
// gcm ; General_Category_Mask
|
||||
|
||||
// TODO: Remove the munge code, marked "//TODO:munge" below, after the
|
||||
// script/binary data are updated (probably in ICU 2.6).
|
||||
|
||||
// Synthetic short and long names for UCHAR_GENERAL_CATEGORY_MASK. These
// names are not in PropertyAliases.txt. The pointers are never
// reassigned, so they are declared const as well as the characters they
// point to.
static const char* const SHORT_GCM_NAME = "gcm";
static const char* const LONG_GCM_NAME = "General_Category_Mask";
|
||||
|
||||
U_CAPI const char* U_EXPORT2
|
||||
u_getPropertyName(UProperty property,
|
||||
UPropertyNameChoice nameChoice) {
|
||||
//TODO:munge
|
||||
if (property == UCHAR_GENERAL_CATEGORY_MASK) {
|
||||
switch (nameChoice) {
|
||||
case U_SHORT_PROPERTY_NAME:
|
||||
return SHORT_GCM_NAME;
|
||||
case U_LONG_PROPERTY_NAME:
|
||||
return LONG_GCM_NAME;
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
return load() ? PNAME->getPropertyName(property, nameChoice)
|
||||
: NULL;
|
||||
}
|
||||
|
||||
U_CAPI UProperty U_EXPORT2
|
||||
u_getPropertyEnum(const char* alias) {
|
||||
return load() ? (UProperty) PNAME->getPropertyEnum(alias)
|
||||
: UCHAR_INVALID_CODE;
|
||||
UProperty p = load() ? (UProperty) PNAME->getPropertyEnum(alias)
|
||||
: UCHAR_INVALID_CODE;
|
||||
//TODO:munge
|
||||
if (p == UCHAR_INVALID_CODE) {
|
||||
if (0 == uprv_comparePropertyNames(alias, SHORT_GCM_NAME) ||
|
||||
0 == uprv_comparePropertyNames(alias, LONG_GCM_NAME)) {
|
||||
p = UCHAR_GENERAL_CATEGORY_MASK;
|
||||
}
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
U_CAPI const char* U_EXPORT2
|
||||
u_getPropertyValueName(UProperty property,
|
||||
int32_t value,
|
||||
UPropertyNameChoice nameChoice) {
|
||||
//TODO:munge
|
||||
switch (property) {
|
||||
case UCHAR_GENERAL_CATEGORY:
|
||||
value = (value < 32) ? U_MASK(value) : 0;
|
||||
break;
|
||||
case UCHAR_GENERAL_CATEGORY_MASK:
|
||||
property = UCHAR_GENERAL_CATEGORY;
|
||||
break;
|
||||
}
|
||||
return load() ? PNAME->getPropertyValueName(property, value, nameChoice)
|
||||
: NULL;
|
||||
}
|
||||
|
@ -171,8 +218,28 @@ u_getPropertyValueName(UProperty property,
|
|||
U_CAPI int32_t U_EXPORT2
|
||||
u_getPropertyValueEnum(UProperty property,
|
||||
const char* alias) {
|
||||
return load() ? PNAME->getPropertyValueEnum(property, alias)
|
||||
: UCHAR_INVALID_CODE;
|
||||
//TODO:munge
|
||||
UProperty p = (property == UCHAR_GENERAL_CATEGORY_MASK) ?
|
||||
UCHAR_GENERAL_CATEGORY : property;
|
||||
int32_t v = load() ? PNAME->getPropertyValueEnum(p, alias)
|
||||
: UCHAR_INVALID_CODE;
|
||||
//TODO:munge
|
||||
if (property == UCHAR_GENERAL_CATEGORY) {
|
||||
int32_t gc = 0;
|
||||
for (;;) {
|
||||
if (v == 1) {
|
||||
return gc;
|
||||
}
|
||||
if ((v & 1) != 0) {
|
||||
// More than one bit is set; we can't map this mask to
|
||||
// a UCharCategory.
|
||||
return UCHAR_INVALID_CODE;
|
||||
}
|
||||
v >>= 1;
|
||||
gc += 1;
|
||||
}
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
//eof
|
||||
|
|
|
@ -1462,8 +1462,7 @@ u_getIntPropertyValue(UChar32 c, UProperty which);
|
|||
*
|
||||
* @param which UProperty selector constant, identifies which binary property to check.
|
||||
* Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT
|
||||
* or UCHAR_INT_START<=which<UCHAR_INT_LIMIT
|
||||
* or UCHAR_MASK_START<=which<UCHAR_MASK_LIMIT.
|
||||
* or UCHAR_INT_START<=which<UCHAR_INT_LIMIT.
|
||||
* @return Minimum value returned by u_getIntPropertyValue for a Unicode property.
|
||||
* 0 if the property selector is out of range.
|
||||
*
|
||||
|
@ -1492,8 +1491,7 @@ u_getIntPropertyMinValue(UProperty which);
|
|||
*
|
||||
* @param which UProperty selector constant, identifies which binary property to check.
|
||||
* Must be UCHAR_BINARY_START<=which<UCHAR_BINARY_LIMIT
|
||||
* or UCHAR_INT_START<=which<UCHAR_INT_LIMIT
|
||||
* or UCHAR_MASK_START<=which<UCHAR_MASK_LIMIT.
|
||||
* or UCHAR_INT_START<=which<UCHAR_INT_LIMIT.
|
||||
* @return Maximum value returned by u_getIntPropertyValue for a Unicode property.
|
||||
* <=0 if the property selector is out of range.
|
||||
*
|
||||
|
@ -2066,6 +2064,11 @@ u_enumCharNames(UChar32 start, UChar32 limit,
|
|||
* Return the Unicode name for a given property, as given in the
|
||||
* Unicode database file PropertyAliases.txt.
|
||||
*
|
||||
* In addition, this function maps the property
|
||||
* UCHAR_GENERAL_CATEGORY_MASK to the synthetic names "gcm" /
|
||||
* "General_Category_Mask". These names are not in
|
||||
* PropertyAliases.txt.
|
||||
*
|
||||
* @param property UProperty selector other than UCHAR_INVALID_CODE.
|
||||
* If out of range, NULL is returned.
|
||||
*
|
||||
|
@ -2097,6 +2100,11 @@ u_getPropertyName(UProperty property,
|
|||
* in the Unicode database file PropertyAliases.txt. Short, long, and
|
||||
* any other variants are recognized.
|
||||
*
|
||||
* In addition, this function maps the synthetic names "gcm" /
|
||||
* "General_Category_Mask" to the property
|
||||
* UCHAR_GENERAL_CATEGORY_MASK. These names are not in
|
||||
* PropertyAliases.txt.
|
||||
*
|
||||
* @param alias the property name to be matched. The name is compared
|
||||
* using "loose matching" as described in PropertyAliases.txt.
|
||||
*
|
||||
|
@ -2113,6 +2121,12 @@ u_getPropertyEnum(const char* alias);
|
|||
* Return the Unicode name for a given property value, as given in the
|
||||
* Unicode database file PropertyValueAliases.txt.
|
||||
*
|
||||
* Note: Some of the names in PropertyValueAliases.txt can only be
|
||||
* retrieved using UCHAR_GENERAL_CATEGORY_MASK, not
|
||||
* UCHAR_GENERAL_CATEGORY. These include: "C" / "Other", "L" /
|
||||
* "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
|
||||
* / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
|
||||
*
|
||||
* @param property UProperty selector constant.
|
||||
* Must be UCHAR_BINARY_START<=property<UCHAR_BINARY_LIMIT
|
||||
* or UCHAR_INT_START<=property<UCHAR_INT_LIMIT
|
||||
|
@ -2125,7 +2139,7 @@ u_getPropertyEnum(const char* alias);
|
|||
* (1.) UCHAR_BLOCK values begin at the non-zero value
|
||||
* UBLOCK_BASIC_LATIN. (2.) UCHAR_CANONICAL_COMBINING_CLASS
|
||||
* values are not contiguous and range from 0..240. (3.)
|
||||
* UCHAR_GENERAL_CATEGORY values are not values of
|
||||
* UCHAR_GENERAL_CATEGORY_MASK values are not values of
|
||||
* UCharCategory, but rather mask values produced by
|
||||
* U_GET_GC_MASK(). This allows grouped categories such as
|
||||
* [:L:] to be represented. Mask values range
|
||||
|
@ -2160,11 +2174,16 @@ u_getPropertyValueName(UProperty property,
|
|||
* specified in the Unicode database file PropertyValueAliases.txt.
|
||||
* Short, long, and any other variants are recognized.
|
||||
*
|
||||
* Note: Some of the names in PropertyValueAliases.txt will only be
|
||||
* recognized with UCHAR_GENERAL_CATEGORY_MASK, not
|
||||
* UCHAR_GENERAL_CATEGORY. These include: "C" / "Other", "L" /
|
||||
* "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
|
||||
* / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
|
||||
*
|
||||
* @param property UProperty selector constant.
|
||||
* Must be UCHAR_BINARY_START<=property<UCHAR_BINARY_LIMIT
|
||||
* or UCHAR_INT_START<=property<UCHAR_INT_LIMIT
|
||||
* or UCHAR_MASK_START<=property<UCHAR_MASK_LIMIT.
|
||||
* Only these properties can be enumerated.
|
||||
* If out of range, UCHAR_INVALID_CODE is returned.
|
||||
*
|
||||
* @param alias the value name to be matched. The name is compared
|
||||
|
|
|
@ -400,8 +400,6 @@ u_getIntPropertyMaxValue(UProperty which) {
|
|||
default:
|
||||
return -1; /* undefined */
|
||||
}
|
||||
} else if(which==UCHAR_GENERAL_CATEGORY_MASK) {
|
||||
return U_MASK(U_CHAR_CATEGORY_COUNT)-1;
|
||||
} else {
|
||||
return -1; /* undefined */
|
||||
}
|
||||
|
|
|
@ -2355,7 +2355,7 @@ TestPropertyNames(void) {
|
|||
int32_t max = 0;
|
||||
if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
|
||||
max = 255;
|
||||
} else if (p == UCHAR_GENERAL_CATEGORY) {
|
||||
} else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
|
||||
/* it's far too slow to iterate all the way up to
|
||||
the real max, U_GC_P_MASK */
|
||||
max = U_GC_NL_MASK;
|
||||
|
@ -2396,8 +2396,6 @@ TestPropertyNames(void) {
|
|||
} else if (p>=UCHAR_MASK_LIMIT) {
|
||||
p = UCHAR_DOUBLE_START - 1;
|
||||
} else if (p>=UCHAR_INT_LIMIT) {
|
||||
/* ### TODO remove this next line */
|
||||
return;
|
||||
p = UCHAR_MASK_START - 1;
|
||||
} else if (p>=UCHAR_BINARY_LIMIT) {
|
||||
p = UCHAR_INT_START - 1;
|
||||
|
@ -2433,8 +2431,7 @@ TestPropertyValues(void) {
|
|||
}
|
||||
|
||||
if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
|
||||
u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=(int32_t)(U_MASK(U_CHAR_CATEGORY_COUNT)-1)
|
||||
) {
|
||||
u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
|
||||
log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue