diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index bb63e5bde70..e10d3a27d3a 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -100,7 +100,8 @@ utf_impl.o ustring.o ustrcase.o ucasemap.o ucasemap_titlecase_brkiter.o cstring. unistr_case_locale.o ustrcase_locale.o unistr_titlecase_brkiter.o ustr_titlecase_brkiter.o \ normalizer2impl.o normalizer2.o filterednormalizer2.o normlzr.o unorm.o unormcmp.o loadednormalizer2impl.o \ chariter.o schriter.o uchriter.o uiter.o \ -patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \ +patternprops.o uchar.o uprops.o ucase.o propname.o ubidi_props.o characterproperties.o \ +ubidi.o ubidiwrt.o ubidiln.o ushape.o \ uscript.o uscript_props.o usc_impl.o unames.o \ utrie.o utrie2.o utrie2_builder.o ucptrie.o umutablecptrie.o \ bmpset.o unisetspan.o uset_props.o uniset_props.o uniset_closure.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \ diff --git a/icu4c/source/common/characterproperties.cpp b/icu4c/source/common/characterproperties.cpp new file mode 100644 index 00000000000..53367489de0 --- /dev/null +++ b/icu4c/source/common/characterproperties.cpp @@ -0,0 +1,340 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// characterproperties.cpp +// created: 2018sep03 Markus W. 
Scherer + +#include "unicode/utypes.h" +#include "unicode/localpointer.h" +#include "unicode/uchar.h" +#include "unicode/ucpmap.h" +#include "unicode/ucptrie.h" +#include "unicode/umutablecptrie.h" +#include "unicode/uniset.h" +#include "unicode/uscript.h" +#include "unicode/uset.h" +#include "cmemory.h" +#include "mutex.h" +#include "normalizer2impl.h" +#include "uassert.h" +#include "ubidi_props.h" +#include "ucase.h" +#include "ucln_cmn.h" +#include "umutex.h" +#include "uprops.h" + +using icu::UInitOnce; +using icu::UnicodeSet; + +namespace { + +U_CDECL_BEGIN + +UBool U_CALLCONV characterproperties_cleanup(); + +struct Inclusion { + UnicodeSet *fSet; + UInitOnce fInitOnce; +}; +Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() + +UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {}; + +UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {}; + +UMutex cpMutex = U_MUTEX_INITIALIZER; + +//---------------------------------------------------------------- +// Inclusions list +//---------------------------------------------------------------- + +// USetAdder implementation +// Does not use uset.h to reduce code dependencies +void U_CALLCONV +_set_add(USet *set, UChar32 c) { + ((UnicodeSet *)set)->add(c); +} + +void U_CALLCONV +_set_addRange(USet *set, UChar32 start, UChar32 end) { + ((UnicodeSet *)set)->add(start, end); +} + +void U_CALLCONV +_set_addString(USet *set, const UChar *str, int32_t length) { + ((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length)); +} + +UBool U_CALLCONV characterproperties_cleanup() { + for (Inclusion &in: gInclusions) { + delete in.fSet; + in.fSet = nullptr; + in.fInitOnce.reset(); + } + for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) { + delete sets[i]; + sets[i] = nullptr; + } + for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) { + ucptrie_close(reinterpret_cast(maps[i])); + maps[i] = nullptr; + } + return TRUE; +} + +U_CDECL_END + +} // namespace + +U_NAMESPACE_BEGIN + +/* +Reduce excessive reallocation, 
and make it easier to detect initialization problems. +Usually you don't see smaller sets than this for Unicode 5.0. +*/ +constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072; + +void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) { + // This function is invoked only via umtx_initOnce(). + // This function is a friend of class UnicodeSet. + + U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT); + if (src == UPROPS_SRC_NONE) { + errorCode = U_INTERNAL_PROGRAM_ERROR; + return; + } + UnicodeSet * &incl = gInclusions[src].fSet; + U_ASSERT(incl == nullptr); + + incl = new UnicodeSet(); + if (incl == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + USetAdder sa = { + (USet *)incl, + _set_add, + _set_addRange, + _set_addString, + nullptr, // don't need remove() + nullptr // don't need removeRange() + }; + + incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode); + switch(src) { + case UPROPS_SRC_CHAR: + uchar_addPropertyStarts(&sa, &errorCode); + break; + case UPROPS_SRC_PROPSVEC: + upropsvec_addPropertyStarts(&sa, &errorCode); + break; + case UPROPS_SRC_CHAR_AND_PROPSVEC: + uchar_addPropertyStarts(&sa, &errorCode); + upropsvec_addPropertyStarts(&sa, &errorCode); + break; +#if !UCONFIG_NO_NORMALIZATION + case UPROPS_SRC_CASE_AND_NORM: { + const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); + if(U_SUCCESS(errorCode)) { + impl->addPropertyStarts(&sa, errorCode); + } + ucase_addPropertyStarts(&sa, &errorCode); + break; + } + case UPROPS_SRC_NFC: { + const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); + if(U_SUCCESS(errorCode)) { + impl->addPropertyStarts(&sa, errorCode); + } + break; + } + case UPROPS_SRC_NFKC: { + const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode); + if(U_SUCCESS(errorCode)) { + impl->addPropertyStarts(&sa, errorCode); + } + break; + } + case UPROPS_SRC_NFKC_CF: { + const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode); + 
if(U_SUCCESS(errorCode)) { + impl->addPropertyStarts(&sa, errorCode); + } + break; + } + case UPROPS_SRC_NFC_CANON_ITER: { + const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); + if(U_SUCCESS(errorCode)) { + impl->addCanonIterPropertyStarts(&sa, errorCode); + } + break; + } +#endif + case UPROPS_SRC_CASE: + ucase_addPropertyStarts(&sa, &errorCode); + break; + case UPROPS_SRC_BIDI: + ubidi_addPropertyStarts(&sa, &errorCode); + break; + case UPROPS_SRC_INPC: + case UPROPS_SRC_INSC: + case UPROPS_SRC_VO: + uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode); + break; + default: + errorCode = U_INTERNAL_PROGRAM_ERROR; + break; + } + + if (U_FAILURE(errorCode)) { + delete incl; + incl = nullptr; + return; + } + // Compact for caching + incl->compact(); + ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); +} + +const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return nullptr; } + if (src < 0 || UPROPS_SRC_COUNT <= src) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + Inclusion &i = gInclusions[src]; + umtx_initOnce(i.fInitOnce, &CharacterProperties::initInclusion, src, errorCode); + return i.fSet; +} + +const UnicodeSet *CharacterProperties::getInclusionsForProperty( + UProperty prop, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return nullptr; } + UPropertySource src = uprops_getSource(prop); + return getInclusionsForSource(src, errorCode); +} + +U_NAMESPACE_END + +namespace { + +UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return nullptr; } + icu::LocalPointer set(new UnicodeSet()); + if (set.isNull()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + const UnicodeSet *inclusions = + icu::CharacterProperties::getInclusionsForProperty(property, errorCode); + if (U_FAILURE(errorCode)) { return nullptr; } + int32_t numRanges = 
inclusions->getRangeCount(); + UChar32 startHasProperty = -1; + + for (int32_t i = 0; i < numRanges; ++i) { + UChar32 rangeEnd = inclusions->getRangeEnd(i); + for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) { + // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch. + if (u_hasBinaryProperty(c, property)) { + if (startHasProperty < 0) { + // Transition from false to true. + startHasProperty = c; + } + } else if (startHasProperty >= 0) { + // Transition from true to false. + set->add(startHasProperty, c - 1); + startHasProperty = -1; + } + } + } + if (startHasProperty >= 0) { + set->add(startHasProperty, 0x10FFFF); + } + set->freeze(); + return set.orphan(); +} + +UCPMap *makeMap(UProperty property, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return nullptr; } + uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0; + icu::LocalUMutableCPTriePointer mutableTrie( + umutablecptrie_open(nullValue, nullValue, &errorCode)); + const UnicodeSet *inclusions = + icu::CharacterProperties::getInclusionsForProperty(property, errorCode); + if (U_FAILURE(errorCode)) { return nullptr; } + int32_t numRanges = inclusions->getRangeCount(); + UChar32 start = 0; + uint32_t value = nullValue; + + for (int32_t i = 0; i < numRanges; ++i) { + UChar32 rangeEnd = inclusions->getRangeEnd(i); + for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) { + // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch. 
+ uint32_t nextValue = u_getIntPropertyValue(c, property); + if (value != nextValue) { + if (value != nullValue) { + umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode); + } + start = c; + value = nextValue; + } + } + } + if (value != 0) { + umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode); + } + + UCPTrieType type; + if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) { + type = UCPTRIE_TYPE_FAST; + } else { + type = UCPTRIE_TYPE_SMALL; + } + UCPTrieValueWidth valueWidth; + // TODO: UCharacterProperty.IntProperty + int32_t max = u_getIntPropertyMaxValue(property); + if (max <= 0xff) { + valueWidth = UCPTRIE_VALUE_BITS_8; + } else if (max <= 0xffff) { + valueWidth = UCPTRIE_VALUE_BITS_16; + } else { + valueWidth = UCPTRIE_VALUE_BITS_32; + } + return reinterpret_cast( + umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode)); +} + +} // namespace + +U_NAMESPACE_USE + +U_CAPI const USet * U_EXPORT2 +u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) { + if (U_FAILURE(*pErrorCode)) { return nullptr; } + if (property < 0 || UCHAR_BINARY_LIMIT <= property) { + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + Mutex m(&cpMutex); + UnicodeSet *set = sets[property]; + if (set == nullptr) { + sets[property] = set = makeSet(property, *pErrorCode); + } + if (U_FAILURE(*pErrorCode)) { return nullptr; } + return set->toUSet(); +} + +U_CAPI const UCPMap * U_EXPORT2 +u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) { + if (U_FAILURE(*pErrorCode)) { return nullptr; } + if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) { + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + Mutex m(&cpMutex); + UCPMap *map = maps[property - UCHAR_INT_START]; + if (map == nullptr) { + maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode); + } + return map; +} diff --git 
a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj index 9d4f1e94988..2495a895639 100644 --- a/icu4c/source/common/common.vcxproj +++ b/icu4c/source/common/common.vcxproj @@ -268,6 +268,7 @@ + diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index 52034549f1f..99ed94fbd70 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -388,6 +388,9 @@ properties & sets + + properties & sets + properties & sets diff --git a/icu4c/source/common/common_uwp.vcxproj b/icu4c/source/common/common_uwp.vcxproj index b9207e1ebdf..a3801840669 100644 --- a/icu4c/source/common/common_uwp.vcxproj +++ b/icu4c/source/common/common_uwp.vcxproj @@ -393,6 +393,7 @@ + diff --git a/icu4c/source/common/mutex.h b/icu4c/source/common/mutex.h index bb45e7df83c..47f5e080f82 100644 --- a/icu4c/source/common/mutex.h +++ b/icu4c/source/common/mutex.h @@ -34,9 +34,9 @@ U_NAMESPACE_BEGIN // private mutex where possible. 
// For example: -// -// UMutex myMutex; -// +// +// UMutex myMutex = U_MUTEX_INITIALIZER; +// // void Function(int arg1, int arg2) // { // static Object* foo; // Shared read-write object diff --git a/icu4c/source/common/normalizer2impl.cpp b/icu4c/source/common/normalizer2impl.cpp index 6816ddc853a..e7ae646c41a 100644 --- a/icu4c/source/common/normalizer2impl.cpp +++ b/icu4c/source/common/normalizer2impl.cpp @@ -466,7 +466,7 @@ void Normalizer2Impl::addLcccChars(UnicodeSet &set) const { UChar32 start = 0, end; uint32_t norm16; - while ((end = ucptrie_getRange(normTrie, start, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, INERT, + while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT, nullptr, nullptr, &norm16)) >= 0) { if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES && norm16 != Normalizer2Impl::JAMO_VT) { @@ -484,7 +484,7 @@ Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode // Add the start code point of each same-value range of the trie. UChar32 start = 0, end; uint32_t value; - while ((end = ucptrie_getRange(normTrie, start, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, INERT, + while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT, nullptr, nullptr, &value)) >= 0) { sa->add(sa->set, start); if (start != end && isAlgorithmicNoNo((uint16_t)value) && @@ -518,7 +518,7 @@ Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &err // Currently only used for the SEGMENT_STARTER property. 
UChar32 start = 0, end; uint32_t value; - while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPTRIE_RANGE_NORMAL, 0, + while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0, segmentStarterMapper, nullptr, &value)) >= 0) { sa->add(sa->set, start); start = end + 1; @@ -2398,7 +2398,7 @@ void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) { UChar32 start = 0, end; uint32_t value; while ((end = ucptrie_getRange(impl->normTrie, start, - UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT, + UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT, nullptr, nullptr, &value)) >= 0) { // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters. if (value != Normalizer2Impl::INERT) { diff --git a/icu4c/source/common/normalizer2impl.h b/icu4c/source/common/normalizer2impl.h index 2231110bbc5..2e6aff30881 100644 --- a/icu4c/source/common/normalizer2impl.h +++ b/icu4c/source/common/normalizer2impl.h @@ -30,6 +30,7 @@ #include "unicode/utf.h" #include "unicode/utf16.h" #include "mutex.h" +#include "udataswp.h" #include "uset_imp.h" // When the nfc.nrm data is *not* hardcoded into the common library diff --git a/icu4c/source/common/ucln_cmn.h b/icu4c/source/common/ucln_cmn.h index d1971b998d9..0ca911b47d9 100644 --- a/icu4c/source/common/ucln_cmn.h +++ b/icu4c/source/common/ucln_cmn.h @@ -45,6 +45,7 @@ typedef enum ECleanupCommonType { UCLN_COMMON_CURRENCY, UCLN_COMMON_LOADED_NORMALIZER2, UCLN_COMMON_NORMALIZER2, + UCLN_COMMON_CHARACTERPROPERTIES, UCLN_COMMON_USET, UCLN_COMMON_UNAMES, UCLN_COMMON_UPROPS, diff --git a/icu4c/source/common/ucptrie.cpp b/icu4c/source/common/ucptrie.cpp index 09ac38a705b..13496ad56c5 100644 --- a/icu4c/source/common/ucptrie.cpp +++ b/icu4c/source/common/ucptrie.cpp @@ -247,7 +247,7 @@ namespace { constexpr int32_t MAX_UNICODE = 0x10ffff; inline uint32_t maybeFilterValue(uint32_t value, uint32_t trieNullValue, uint32_t nullValue, - 
UCPTrieValueFilter *filter, const void *context) { + UCPMapValueFilter *filter, const void *context) { if (value == trieNullValue) { value = nullValue; } else if (filter != nullptr) { @@ -257,7 +257,7 @@ inline uint32_t maybeFilterValue(uint32_t value, uint32_t trieNullValue, uint32_ } UChar32 getRange(const void *t, UChar32 start, - UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) { + UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { if ((uint32_t)start > MAX_UNICODE) { return U_SENTINEL; } @@ -403,9 +403,9 @@ UChar32 getRange(const void *t, UChar32 start, U_CFUNC UChar32 ucptrie_internalGetRange(UCPTrieGetRange *getRange, const void *trie, UChar32 start, - UCPTrieRangeOption option, uint32_t surrogateValue, - UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) { - if (option == UCPTRIE_RANGE_NORMAL) { + UCPMapRangeOption option, uint32_t surrogateValue, + UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { + if (option == UCPMAP_RANGE_NORMAL) { return getRange(trie, start, filter, context, pValue); } uint32_t value; @@ -413,7 +413,7 @@ ucptrie_internalGetRange(UCPTrieGetRange *getRange, // We need to examine the range value even if the caller does not want it. pValue = &value; } - UChar32 surrEnd = option == UCPTRIE_RANGE_FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff; + UChar32 surrEnd = option == UCPMAP_RANGE_FIXED_ALL_SURROGATES ? 
0xdfff : 0xdbff; UChar32 end = getRange(trie, start, filter, context, pValue); if (end < 0xd7ff || start > surrEnd) { return end; @@ -448,8 +448,8 @@ ucptrie_internalGetRange(UCPTrieGetRange *getRange, U_CAPI UChar32 U_EXPORT2 ucptrie_getRange(const UCPTrie *trie, UChar32 start, - UCPTrieRangeOption option, uint32_t surrogateValue, - UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) { + UCPMapRangeOption option, uint32_t surrogateValue, + UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { return ucptrie_internalGetRange(getRange, trie, start, option, surrogateValue, filter, context, pValue); @@ -571,3 +571,20 @@ ucptrie_printLengths(const UCPTrie *trie, const char *which) { #endif } // namespace + +// UCPMap ---- +// Initially, this is the same as UCPTrie. This may well change. + +U_CAPI uint32_t U_EXPORT2 +ucpmap_get(const UCPMap *map, UChar32 c) { + return ucptrie_get(reinterpret_cast(map), c); +} + +U_CAPI UChar32 U_EXPORT2 +ucpmap_getRange(const UCPMap *map, UChar32 start, + UCPMapRangeOption option, uint32_t surrogateValue, + UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { + return ucptrie_getRange(reinterpret_cast(map), start, + option, surrogateValue, + filter, context, pValue); +} diff --git a/icu4c/source/common/ucptrie_impl.h b/icu4c/source/common/ucptrie_impl.h index 8202628afaf..1fe6a18ac53 100644 --- a/icu4c/source/common/ucptrie_impl.h +++ b/icu4c/source/common/ucptrie_impl.h @@ -131,13 +131,13 @@ enum { typedef UChar32 UCPTrieGetRange(const void *trie, UChar32 start, - UCPTrieValueFilter *filter, const void *context, uint32_t *pValue); + UCPMapValueFilter *filter, const void *context, uint32_t *pValue); U_CFUNC UChar32 ucptrie_internalGetRange(UCPTrieGetRange *getRange, const void *trie, UChar32 start, - UCPTrieRangeOption option, uint32_t surrogateValue, - UCPTrieValueFilter *filter, const void *context, uint32_t *pValue); + UCPMapRangeOption option, uint32_t surrogateValue, + UCPMapValueFilter 
*filter, const void *context, uint32_t *pValue); #ifdef UCPTRIE_DEBUG U_CFUNC void diff --git a/icu4c/source/common/umutablecptrie.cpp b/icu4c/source/common/umutablecptrie.cpp index f23b5e19261..40af4b6c16a 100644 --- a/icu4c/source/common/umutablecptrie.cpp +++ b/icu4c/source/common/umutablecptrie.cpp @@ -70,10 +70,11 @@ public: MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete; + static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode); static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode); uint32_t get(UChar32 c) const; - int32_t getRange(UChar32 start, UCPTrieValueFilter *filter, const void *context, + int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context, uint32_t *pValue) const; void set(UChar32 c, uint32_t value, UErrorCode &errorCode); @@ -171,6 +172,36 @@ MutableCodePointTrie::~MutableCodePointTrie() { uprv_free(index16); } +MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) { + // Use the highValue as the initialValue to reduce the highStart. + uint32_t errorValue = ucpmap_get(map, -1); + uint32_t initialValue = ucpmap_get(map, 0x10ffff); + LocalPointer mutableTrie( + new MutableCodePointTrie(initialValue, errorValue, errorCode), + errorCode); + if (U_FAILURE(errorCode)) { + return nullptr; + } + UChar32 start = 0, end; + uint32_t value; + while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, + nullptr, nullptr, &value)) >= 0) { + if (value != initialValue) { + if (start == end) { + mutableTrie->set(start, value, errorCode); + } else { + mutableTrie->setRange(start, end, value, errorCode); + } + } + start = end + 1; + } + if (U_SUCCESS(errorCode)) { + return mutableTrie.orphan(); + } else { + return nullptr; + } +} + MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) { // Use the highValue as the initialValue to reduce the highStart. 
uint32_t errorValue; @@ -201,7 +232,7 @@ MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UEr } UChar32 start = 0, end; uint32_t value; - while ((end = ucptrie_getRange(trie, start, UCPTRIE_RANGE_NORMAL, 0, + while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value)) >= 0) { if (value != initialValue) { if (start == end) { @@ -244,7 +275,7 @@ uint32_t MutableCodePointTrie::get(UChar32 c) const { } inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue, - UCPTrieValueFilter *filter, const void *context) { + UCPMapValueFilter *filter, const void *context) { if (value == initialValue) { value = nullValue; } else if (filter != nullptr) { @@ -254,7 +285,7 @@ inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t } UChar32 MutableCodePointTrie::getRange( - UChar32 start, UCPTrieValueFilter *filter, const void *context, + UChar32 start, UCPMapValueFilter *filter, const void *context, uint32_t *pValue) const { if ((uint32_t)start > MAX_UNICODE) { return U_SENTINEL; @@ -1565,6 +1596,18 @@ umutablecptrie_close(UMutableCPTrie *trie) { delete reinterpret_cast(trie); } +U_CAPI UMutableCPTrie * U_EXPORT2 +umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode) { + if (U_FAILURE(*pErrorCode)) { + return nullptr; + } + if (map == nullptr) { + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + return reinterpret_cast(MutableCodePointTrie::fromUCPMap(map, *pErrorCode)); +} + U_CAPI UMutableCPTrie * U_EXPORT2 umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode) { if (U_FAILURE(*pErrorCode)) { @@ -1585,7 +1628,7 @@ umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c) { namespace { UChar32 getRange(const void *trie, UChar32 start, - UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) { + UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { return reinterpret_cast(trie)-> 
getRange(start, filter, context, pValue); } @@ -1594,8 +1637,8 @@ UChar32 getRange(const void *trie, UChar32 start, U_CAPI UChar32 U_EXPORT2 umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start, - UCPTrieRangeOption option, uint32_t surrogateValue, - UCPTrieValueFilter *filter, const void *context, uint32_t *pValue) { + UCPMapRangeOption option, uint32_t surrogateValue, + UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { return ucptrie_internalGetRange(getRange, trie, start, option, surrogateValue, filter, context, pValue); diff --git a/icu4c/source/common/unicode/uchar.h b/icu4c/source/common/unicode/uchar.h index 29ec68fe7e1..d9342626bc5 100644 --- a/icu4c/source/common/unicode/uchar.h +++ b/icu4c/source/common/unicode/uchar.h @@ -27,6 +27,24 @@ #include "unicode/utypes.h" #include "unicode/stringoptions.h" +#include "unicode/ucpmap.h" + +#if !defined(USET_DEFINED) && !defined(U_IN_DOXYGEN) + +#define USET_DEFINED + +/** + * USet is the C API type corresponding to C++ class UnicodeSet. + * It is forward-declared here to avoid including unicode/uset.h file if related + * APIs are not used. + * + * @see ucnv_getUnicodeSet + * @stable ICU 2.4 + */ +typedef struct USet USet; + +#endif + U_CDECL_BEGIN @@ -61,6 +79,18 @@ U_CDECL_BEGIN * "About the Unicode Character Database" (http://www.unicode.org/ucd/) * and the ICU User Guide chapter on Properties (http://icu-project.org/userguide/properties.html). * + * Many properties are accessible via generic functions that take a UProperty selector. + * - u_hasBinaryProperty() returns a binary value (TRUE/FALSE) per property and code point. + * - u_getIntPropertyValue() returns an integer value per property and code point. + * For each supported enumerated or catalog property, there is + * an enum type for all of the property's values, and + * u_getIntPropertyValue() returns the numeric values of those constants. 
+ * - u_getBinaryPropertySet() returns a set for each ICU-supported binary property with + * all code points for which the property is true. + * - u_getIntPropertyMap() returns a map for each + * ICU-supported enumerated/catalog/int-valued property which + * maps all Unicode code points to their values for that property. + * * Many functions are designed to match java.lang.Character functions. * See the individual function documentation, * and see the JDK 1.4 java.lang.Character documentation @@ -2519,6 +2549,7 @@ typedef enum UVerticalOrientation { * does not have data for the property at all, or not for this code point. * * @see UProperty + * @see u_getBinaryPropertySet * @see u_getIntPropertyValue * @see u_getUnicodeVersion * @stable ICU 2.1 @@ -2526,6 +2557,27 @@ typedef enum UVerticalOrientation { U_STABLE UBool U_EXPORT2 u_hasBinaryProperty(UChar32 c, UProperty which); +#ifndef U_HIDE_DRAFT_API + +/** + * Returns a frozen USet for a binary property. + * The library retains ownership over the returned object. + * Sets an error code if the property number is not one for a binary property. + * + * The returned set contains all code points for which the property is true. + * + * @param property UCHAR_BINARY_START..UCHAR_BINARY_LIMIT-1 + * @param pErrorCode an in/out ICU UErrorCode + * @return the property as a set + * @see UProperty + * @see u_hasBinaryProperty + * @see UnicodeSet::fromUSet + */ +U_CAPI const USet * U_EXPORT2 +u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode); + +#endif // U_HIDE_DRAFT_API + /** * Check if a code point has the Alphabetic Unicode property. * Same as u_hasBinaryProperty(c, UCHAR_ALPHABETIC). 
@@ -2626,6 +2678,7 @@ u_isUWhiteSpace(UChar32 c); * @see u_hasBinaryProperty * @see u_getIntPropertyMinValue * @see u_getIntPropertyMaxValue + * @see u_getIntPropertyMap * @see u_getUnicodeVersion * @stable ICU 2.2 */ @@ -2682,6 +2735,27 @@ u_getIntPropertyMinValue(UProperty which); U_STABLE int32_t U_EXPORT2 u_getIntPropertyMaxValue(UProperty which); +#ifndef U_HIDE_DRAFT_API + +/** + * Returns an immutable UCPMap for an enumerated/catalog/int-valued property. + * The library retains ownership over the returned object. + * Sets an error code if the property number is not one for an "int property". + * + * The returned object maps all Unicode code points to their values for that property. + * For documentation of the integer values see u_getIntPropertyValue(). + * + * @param property UCHAR_INT_START..UCHAR_INT_LIMIT-1 + * @param pErrorCode an in/out ICU UErrorCode + * @return the property as a map + * @see UProperty + * @see u_getIntPropertyValue + */ +U_CAPI const UCPMap * U_EXPORT2 +u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode); + +#endif // U_HIDE_DRAFT_API + /** * Get the numeric value for a Unicode code point as defined in the * Unicode Character Database. diff --git a/icu4c/source/common/unicode/ucnv.h b/icu4c/source/common/unicode/ucnv.h index 53b4c6f0733..ec7c5f350b4 100644 --- a/icu4c/source/common/unicode/ucnv.h +++ b/icu4c/source/common/unicode/ucnv.h @@ -53,19 +53,18 @@ #include "unicode/uenum.h" #include "unicode/localpointer.h" -#ifndef __USET_H__ +#if !defined(USET_DEFINED) && !defined(U_IN_DOXYGEN) + +#define USET_DEFINED /** - * USet is the C API type for Unicode sets. - * It is forward-declared here to avoid including the header file if related + * USet is the C API type corresponding to C++ class UnicodeSet. + * It is forward-declared here to avoid including unicode/uset.h file if related * conversion APIs are not used. 
- * See unicode/uset.h * * @see ucnv_getUnicodeSet - * @stable ICU 2.6 + * @stable ICU 2.4 */ -struct USet; -/** @stable ICU 2.6 */ typedef struct USet USet; #endif diff --git a/icu4c/source/common/unicode/ucpmap.h b/icu4c/source/common/unicode/ucpmap.h new file mode 100644 index 00000000000..58fed20894f --- /dev/null +++ b/icu4c/source/common/unicode/ucpmap.h @@ -0,0 +1,159 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// ucpmap.h +// created: 2018sep03 Markus W. Scherer + +#ifndef __UCPMAP_H__ +#define __UCPMAP_H__ + +#include "unicode/utypes.h" + +#ifndef U_HIDE_DRAFT_API + +U_CDECL_BEGIN + +/** + * \file + * + * This file defines an abstract map from Unicode code points to integer values. + * + * @see UCPMap + * @see UCPTrie + * @see UMutableCPTrie + */ + +/** + * Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values. + * + * @see UCPTrie + * @see UMutableCPTrie + * @draft ICU 63 + */ +typedef struct UCPMap UCPMap; + +/** + * Selectors for how ucpmap_getRange() etc. should report value ranges overlapping with surrogates. + * Most users should use UCPMAP_RANGE_NORMAL. + * + * @see ucpmap_getRange + * @see ucptrie_getRange + * @see umutablecptrie_getRange + * @draft ICU 63 + */ +enum UCPMapRangeOption { + /** + * ucpmap_getRange() enumerates all same-value ranges as stored in the map. + * Most users should use this option. + */ + UCPMAP_RANGE_NORMAL, + /** + * ucpmap_getRange() enumerates all same-value ranges as stored in the map, + * except that lead surrogates (U+D800..U+DBFF) are treated as having the + * surrogateValue, which is passed to getRange() as a separate parameter. + * The surrogateValue is not transformed via filter(). + * See U_IS_LEAD(c). + * + * Most users should use UCPMAP_RANGE_NORMAL instead. 
+ * + * This option is useful for maps that map surrogate code *units* to + * special values optimized for UTF-16 string processing + * or for special error behavior for unpaired surrogates, + * but those values are not to be associated with the lead surrogate code *points*. + */ + UCPMAP_RANGE_FIXED_LEAD_SURROGATES, + /** + * ucpmap_getRange() enumerates all same-value ranges as stored in the map, + * except that all surrogates (U+D800..U+DFFF) are treated as having the + * surrogateValue, which is passed to getRange() as a separate parameter. + * The surrogateValue is not transformed via filter(). + * See U_IS_SURROGATE(c). + * + * Most users should use UCPMAP_RANGE_NORMAL instead. + * + * This option is useful for maps that map surrogate code *units* to + * special values optimized for UTF-16 string processing + * or for special error behavior for unpaired surrogates, + * but those values are not to be associated with the surrogate code *points*. + */ + UCPMAP_RANGE_FIXED_ALL_SURROGATES +}; +#ifndef U_IN_DOXYGEN +typedef enum UCPMapRangeOption UCPMapRangeOption; +#endif + +/** + * Returns the value for a code point as stored in the map, with range checking. + * Returns an implementation-defined error value if c is not in the range 0..U+10FFFF. + * + * @param map the map + * @param c the code point + * @return the map value, + * or an implementation-defined error value if the code point is not in the range 0..U+10FFFF + * @draft ICU 63 + */ +U_CAPI uint32_t U_EXPORT2 +ucpmap_get(const UCPMap *map, UChar32 c); + +/** + * Callback function type: Modifies a map value. + * Optionally called by ucpmap_getRange()/ucptrie_getRange()/umutablecptrie_getRange(). + * The modified value will be returned by the getRange function. + * + * Can be used to ignore some of the value bits, + * make a filter for one of several values, + * return a value index computed from the map value, etc.
+ * + * @param context an opaque pointer, as passed into the getRange function + * @param value a value from the map + * @return the modified value + * @draft ICU 63 + */ +typedef uint32_t U_CALLCONV +UCPMapValueFilter(const void *context, uint32_t value); + +/** + * Returns the last code point such that all those from start to there have the same value. + * Can be used to efficiently iterate over all same-value ranges in a map. + * (This is normally faster than iterating over code points and get()ting each value, + * but much slower than a data structure that stores ranges directly.) + * + * If the UCPMapValueFilter function pointer is not NULL, then + * the value to be delivered is passed through that function, and the return value is the end + * of the range where all values are modified to the same actual value. + * The value is unchanged if that function pointer is NULL. + * + * Example: + * \code + * UChar32 start = 0, end; + * uint32_t value; + * while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, + * NULL, NULL, &value)) >= 0) { + * // Work with the range start..end and its value. 
+ * start = end + 1; + * } + * \endcode + * + * @param map the map + * @param start range start + * @param option defines whether surrogates are treated normally, + * or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL + * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL + * @param filter a pointer to a function that may modify the map data value, + * or NULL if the values from the map are to be used unmodified + * @param context an opaque pointer that is passed on to the filter function + * @param pValue if not NULL, receives the value that every code point start..end has; + * may have been modified by filter(context, map value) + * if that function pointer is not NULL + * @return the range end code point, or -1 if start is not a valid code point + * @draft ICU 63 + */ +U_CAPI UChar32 U_EXPORT2 +ucpmap_getRange(const UCPMap *map, UChar32 start, + UCPMapRangeOption option, uint32_t surrogateValue, + UCPMapValueFilter *filter, const void *context, uint32_t *pValue); + +U_CDECL_END + +#endif // U_HIDE_DRAFT_API +#endif diff --git a/icu4c/source/common/unicode/ucptrie.h b/icu4c/source/common/unicode/ucptrie.h index 505995b3a73..461c47a4f2e 100644 --- a/icu4c/source/common/unicode/ucptrie.h +++ b/icu4c/source/common/unicode/ucptrie.h @@ -8,10 +8,12 @@ #define __UCPTRIE_H__ #include "unicode/utypes.h" + +#ifndef U_HIDE_DRAFT_API + #include "unicode/localpointer.h" +#include "unicode/ucpmap.h" #include "unicode/utf8.h" -#include "putilimp.h" -#include "udataswp.h" U_CDECL_BEGIN @@ -174,54 +176,6 @@ enum UCPTrieValueWidth { typedef enum UCPTrieValueWidth UCPTrieValueWidth; #endif -/** - * Selectors for how ucptrie_getRange() should report value ranges overlapping with surrogates. - * Most users should use UCPTRIE_RANGE_NORMAL. - * - * @see ucptrie_getRange - * @draft ICU 63 - */ -enum UCPTrieRangeOption { - /** - * ucptrie_getRange() enumerates all same-value ranges as stored in the trie. - * Most users should use this option. 
- */ - UCPTRIE_RANGE_NORMAL, - /** - * ucptrie_getRange() enumerates all same-value ranges as stored in the trie, - * except that lead surrogates (U+D800..U+DBFF) are treated as having the - * surrogateValue, which is passed to getRange() as a separate parameter. - * The surrogateValue is not transformed via filter(). - * See U_IS_LEAD(c). - * - * Most users should use UCPTRIE_RANGE_NORMAL instead. - * - * This option is useful for tries that map surrogate code *units* to - * special values optimized for UTF-16 string processing - * or for special error behavior for unpaired surrogates, - * but those values are not to be associated with the lead surrogate code *points*. - */ - UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, - /** - * ucptrie_getRange() enumerates all same-value ranges as stored in the trie, - * except that all surrogates (U+D800..U+DFFF) are treated as having the - * surrogateValue, which is passed to getRange() as a separate parameter. - * The surrogateValue is not transformed via filter(). - * See U_IS_SURROGATE(c). - * - * Most users should use UCPTRIE_RANGE_NORMAL instead. - * - * This option is useful for tries that map surrogate code *units* to - * special values optimized for UTF-16 string processing - * or for special error behavior for unpaired surrogates, - * but those values are not to be associated with the lead surrogate code *points*. - */ - UCPTRIE_RANGE_FIXED_ALL_SURROGATES -}; -#ifndef U_IN_DOXYGEN -typedef enum UCPTrieRangeOption UCPTrieRangeOption; -#endif - /** * Opens a trie from its binary form, stored in 32-bit-aligned memory. * Inverse of ucptrie_toBinary(). @@ -322,30 +276,13 @@ ucptrie_getValueWidth(const UCPTrie *trie); U_CAPI uint32_t U_EXPORT2 ucptrie_get(const UCPTrie *trie, UChar32 c); -/** - * Callback function type: Modifies a trie value. - * Optionally called by ucptrie_getRange() or umutablecptrie_getRange(). - * The modified value will be returned by the getRange function. 
- * - * Can be used to ignore some of the value bits, - * make a filter for one of several values, - * return a value index computed from the trie value, etc. - * - * @param context an opaque pointer, as passed into the getRange function - * @param value a value from the trie - * @return the modified value - * @draft ICU 63 - */ -typedef uint32_t U_CALLCONV -UCPTrieValueFilter(const void *context, uint32_t value); - /** * Returns the last code point such that all those from start to there have the same value. * Can be used to efficiently iterate over all same-value ranges in a trie. * (This is normally faster than iterating over code points and get()ting each value, * but much slower than a data structure that stores ranges directly.) * - * If the UCPTrieValueFilter function pointer is not NULL, then + * If the UCPMapValueFilter function pointer is not NULL, then * the value to be delivered is passed through that function, and the return value is the end * of the range where all values are modified to the same actual value. * The value is unchanged if that function pointer is NULL. @@ -354,7 +291,7 @@ UCPTrieValueFilter(const void *context, uint32_t value); * \code * UChar32 start = 0, end; * uint32_t value; - * while ((end = ucptrie_getRange(trie, start, UCPTRIE_RANGE_NORMAL, 0, + * while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0, * NULL, NULL, &value)) >= 0) { * // Work with the range start..end and its value. 
* start = end + 1; @@ -364,8 +301,8 @@ UCPTrieValueFilter(const void *context, uint32_t value); * @param trie the trie * @param start range start * @param option defines whether surrogates are treated normally, - * or as having the surrogateValue; usually UCPTRIE_RANGE_NORMAL - * @param surrogateValue value for surrogates; ignored if option==UCPTRIE_RANGE_NORMAL + * or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL + * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL * @param filter a pointer to a function that may modify the trie data value, * or NULL if the values from the trie are to be used unmodified * @param context an opaque pointer that is passed on to the filter function @@ -377,8 +314,8 @@ UCPTrieValueFilter(const void *context, uint32_t value); */ U_CAPI UChar32 U_EXPORT2 ucptrie_getRange(const UCPTrie *trie, UChar32 start, - UCPTrieRangeOption option, uint32_t surrogateValue, - UCPTrieValueFilter *filter, const void *context, uint32_t *pValue); + UCPMapRangeOption option, uint32_t surrogateValue, + UCPMapValueFilter *filter, const void *context, uint32_t *pValue); /** * Writes a memory-mappable form of the trie into 32-bit aligned memory. 
@@ -704,4 +641,5 @@ ucptrie_internalU8PrevIndex(const UCPTrie *trie, UChar32 c, U_CDECL_END #endif // U_IN_DOXYGEN +#endif // U_HIDE_DRAFT_API #endif diff --git a/icu4c/source/common/unicode/umutablecptrie.h b/icu4c/source/common/unicode/umutablecptrie.h index 31d10f4d74f..e75191a4495 100644 --- a/icu4c/source/common/unicode/umutablecptrie.h +++ b/icu4c/source/common/unicode/umutablecptrie.h @@ -8,11 +8,13 @@ #define __UMUTABLECPTRIE_H__ #include "unicode/utypes.h" + +#ifndef U_HIDE_DRAFT_API + #include "unicode/localpointer.h" +#include "unicode/ucpmap.h" #include "unicode/ucptrie.h" #include "unicode/utf8.h" -#include "putilimp.h" -#include "udataswp.h" U_CDECL_BEGIN @@ -102,6 +104,18 @@ U_NAMESPACE_END #endif +/** + * Creates a mutable trie with the same contents as the UCPMap. + * You must umutablecptrie_close() the mutable trie once you are done using it. + * + * @param map the source map + * @param pErrorCode an in/out ICU UErrorCode + * @return the mutable trie + * @draft ICU 63 + */ +U_CAPI UMutableCPTrie * U_EXPORT2 +umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode); + /** * Creates a mutable trie with the same contents as the immutable one. * You must umutablecptrie_close() the mutable trie once you are done using it. @@ -133,7 +147,7 @@ umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c); * * The trie can be modified between calls to this function. * - * If the UCPTrieValueFilter function pointer is not NULL, then + * If the UCPMapValueFilter function pointer is not NULL, then * the value to be delivered is passed through that function, and the return value is the end * of the range where all values are modified to the same actual value. * The value is unchanged if that function pointer is NULL. 
@@ -143,8 +157,8 @@ umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c); * @param trie the trie * @param start range start * @param option defines whether surrogates are treated normally, - * or as having the surrogateValue; usually UCPTRIE_RANGE_NORMAL - * @param surrogateValue value for surrogates; ignored if option==UCPTRIE_RANGE_NORMAL + * or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL + * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL * @param filter a pointer to a function that may modify the trie data value, * or NULL if the values from the trie are to be used unmodified * @param context an opaque pointer that is passed on to the filter function @@ -156,8 +170,8 @@ umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c); */ U_CAPI UChar32 U_EXPORT2 umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start, - UCPTrieRangeOption option, uint32_t surrogateValue, - UCPTrieValueFilter *filter, const void *context, uint32_t *pValue); + UCPMapRangeOption option, uint32_t surrogateValue, + UCPMapValueFilter *filter, const void *context, uint32_t *pValue); /** * Sets a value for a code point. @@ -223,4 +237,5 @@ umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieVal U_CDECL_END +#endif // U_HIDE_DRAFT_API #endif diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index ed9a3eb72ff..bd9aa5600df 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -13,6 +13,7 @@ #ifndef UNICODESET_H #define UNICODESET_H +#include "unicode/ucpmap.h" #include "unicode/unifilt.h" #include "unicode/unistr.h" #include "unicode/uset.h" @@ -25,9 +26,8 @@ U_NAMESPACE_BEGIN // Forward Declarations. 
-void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status); /**< @internal */ - class BMPSet; +class CharacterProperties; class ParsePosition; class RBBIRuleScanner; class SymbolTable; @@ -584,9 +584,8 @@ public: //---------------------------------------------------------------- /** - * Make this object represent the range start - end. - * If end > start then this object is set to an - * an empty range. + * Make this object represent the range `start - end`. + * If `start > end` then this object is set to an empty range. * A frozen set will not be modified. * * @param start first character in the set, inclusive @@ -1506,6 +1505,7 @@ private: //---------------------------------------------------------------- UnicodeSet(const UnicodeSet& o, UBool /* asThawed */); + UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed); //---------------------------------------------------------------- // Implementation: Pattern parsing @@ -1614,7 +1614,7 @@ private: UnicodeString& rebuiltPat, UErrorCode& ec); - friend void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status); + friend class CharacterProperties; static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status); /** @@ -1634,9 +1634,13 @@ private: */ void applyFilter(Filter filter, void* context, - int32_t src, + const UnicodeSet* inclusions, UErrorCode &status); + void applyIntPropertyValue(const UCPMap *map, + UCPMapValueFilter *filter, const void *context, + UErrorCode &errorCode); + /** * Set the new pattern to cache. */ diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h index 59f46507d54..18482c10e73 100644 --- a/icu4c/source/common/unicode/uset.h +++ b/icu4c/source/common/unicode/uset.h @@ -33,10 +33,14 @@ #include "unicode/uchar.h" #include "unicode/localpointer.h" -#ifndef UCNV_H -struct USet; +#ifndef USET_DEFINED + +#ifndef U_IN_DOXYGEN +#define USET_DEFINED +#endif /** - * A UnicodeSet. Use the uset_* API to manipulate.
Create with + * USet is the C API type corresponding to C++ class UnicodeSet. + * Use the uset_* API to manipulate. Create with * uset_open*, and destroy with uset_close. * @stable ICU 2.4 */ diff --git a/icu4c/source/common/uniset.cpp b/icu4c/source/common/uniset.cpp index 7206e63e887..e8378e0a223 100644 --- a/icu4c/source/common/uniset.cpp +++ b/icu4c/source/common/uniset.cpp @@ -276,6 +276,10 @@ UnicodeSet::~UnicodeSet() { * Assigns this object to be a copy of another. */ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { + return copyFrom(o, FALSE); +} + +UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { if (this == &o) { return *this; } @@ -294,7 +298,7 @@ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { } len = o.len; uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); - if (o.bmpSet == NULL) { + if (o.bmpSet == NULL || asThawed) { bmpSet = NULL; } else { bmpSet = new BMPSet(*o.bmpSet, list, len); @@ -309,7 +313,7 @@ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { setToBogus(); return *this; } - if (o.stringSpan == NULL) { + if (o.stringSpan == NULL || asThawed) { stringSpan = NULL; } else { stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 7e5b132b69b..6cfd80a705b 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -36,8 +36,6 @@ #include "uprops.h" #include "propname.h" #include "normalizer2impl.h" -#include "ucase.h" -#include "ubidi_props.h" #include "uinvchar.h" #include "uprops.h" #include "charstr.h" @@ -98,47 +96,13 @@ static const char ASSIGNED[] = "Assigned"; // [:^Cn:] U_CDECL_BEGIN static UBool U_CALLCONV uset_cleanup(); -struct Inclusion { - UnicodeSet *fSet; - UInitOnce fInitOnce; -}; -static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() - static UnicodeSet *uni32Singleton; static icu::UInitOnce uni32InitOnce = 
U_INITONCE_INITIALIZER; -//---------------------------------------------------------------- -// Inclusions list -//---------------------------------------------------------------- - -// USetAdder implementation -// Does not use uset.h to reduce code dependencies -static void U_CALLCONV -_set_add(USet *set, UChar32 c) { - ((UnicodeSet *)set)->add(c); -} - -static void U_CALLCONV -_set_addRange(USet *set, UChar32 start, UChar32 end) { - ((UnicodeSet *)set)->add(start, end); -} - -static void U_CALLCONV -_set_addString(USet *set, const UChar *str, int32_t length) { - ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); -} - /** * Cleanup function for UnicodeSet */ static UBool U_CALLCONV uset_cleanup(void) { - for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { - Inclusion &in = gInclusions[i]; - delete in.fSet; - in.fSet = NULL; - in.fInitOnce.reset(); - } - delete uni32Singleton; uni32Singleton = NULL; uni32InitOnce.reset(); @@ -149,119 +113,6 @@ U_CDECL_END U_NAMESPACE_BEGIN -/* -Reduce excessive reallocation, and make it easier to detect initialization problems. -Usually you don't see smaller sets than this for Unicode 5.0. -*/ -#define DEFAULT_INCLUSION_CAPACITY 3072 - -void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) { - // This function is invoked only via umtx_initOnce(). - // This function is a friend of class UnicodeSet. 
- - U_ASSERT(src >=0 && srcensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); - switch(src) { - case UPROPS_SRC_CHAR: - uchar_addPropertyStarts(&sa, &status); - break; - case UPROPS_SRC_PROPSVEC: - upropsvec_addPropertyStarts(&sa, &status); - break; - case UPROPS_SRC_CHAR_AND_PROPSVEC: - uchar_addPropertyStarts(&sa, &status); - upropsvec_addPropertyStarts(&sa, &status); - break; -#if !UCONFIG_NO_NORMALIZATION - case UPROPS_SRC_CASE_AND_NORM: { - const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); - if(U_SUCCESS(status)) { - impl->addPropertyStarts(&sa, status); - } - ucase_addPropertyStarts(&sa, &status); - break; - } - case UPROPS_SRC_NFC: { - const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); - if(U_SUCCESS(status)) { - impl->addPropertyStarts(&sa, status); - } - break; - } - case UPROPS_SRC_NFKC: { - const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); - if(U_SUCCESS(status)) { - impl->addPropertyStarts(&sa, status); - } - break; - } - case UPROPS_SRC_NFKC_CF: { - const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); - if(U_SUCCESS(status)) { - impl->addPropertyStarts(&sa, status); - } - break; - } - case UPROPS_SRC_NFC_CANON_ITER: { - const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); - if(U_SUCCESS(status)) { - impl->addCanonIterPropertyStarts(&sa, status); - } - break; - } -#endif - case UPROPS_SRC_CASE: - ucase_addPropertyStarts(&sa, &status); - break; - case UPROPS_SRC_BIDI: - ubidi_addPropertyStarts(&sa, &status); - break; - case UPROPS_SRC_INPC: - case UPROPS_SRC_INSC: - case UPROPS_SRC_VO: - uprops_addPropertyStarts((UPropertySource)src, &sa, &status); - break; - default: - status = U_INTERNAL_PROGRAM_ERROR; - break; - } - - if (U_FAILURE(status)) { - delete incl; - incl = NULL; - return; - } - // Compact for caching - incl->compact(); - ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); -} - - - -const UnicodeSet* UnicodeSet::getInclusions(int32_t src, 
UErrorCode &status) { - U_ASSERT(src >=0 && src 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; } -typedef struct { - UProperty prop; - int32_t value; -} IntPropertyContext; - -static UBool intPropertyFilter(UChar32 ch, void* context) { - IntPropertyContext* c = (IntPropertyContext*)context; - return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; -} - static UBool scriptExtensionsFilter(UChar32 ch, void* context) { return uscript_hasScript(ch, *(UScriptCode*)context); } @@ -896,7 +732,7 @@ static UBool scriptExtensionsFilter(UChar32 ch, void* context) { */ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, void* context, - int32_t src, + const UnicodeSet* inclusions, UErrorCode &status) { if (U_FAILURE(status)) return; @@ -907,12 +743,8 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, // To improve performance, use an inclusions set which // encodes information about character ranges that are known // to have identical properties. - // getInclusions(src) contains exactly the first characters of - // same-value ranges for the given properties "source". - const UnicodeSet* inclusions = getInclusions(src, status); - if (U_FAILURE(status)) { - return; - } + // inclusions contains the first characters of + // same-value ranges for the given property. clear(); @@ -949,6 +781,43 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, namespace { +/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */ +uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) { + uint32_t mask = *(const uint32_t *)context; + value = U_MASK(value) & mask; + if (value != 0) { value = 1; } + return value; +} + +/** Maps one map value to 1, all others to 0. */ +uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) { + uint32_t v = *(const uint32_t *)context; + return value == v ? 
1 : 0; +} + +} // namespace + +void UnicodeSet::applyIntPropertyValue(const UCPMap *map, + UCPMapValueFilter *filter, const void *context, + UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return; } + clear(); + UChar32 start = 0, end; + uint32_t value; + while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, + filter, context, &value)) >= 0) { + if (value != 0) { + add(start, end); + } + start = end + 1; + } + if (isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } +} + +namespace { + static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { /* Note: we use ' ' in compiler code page */ int32_t j = 0; @@ -976,16 +845,35 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { UnicodeSet& UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { - if (U_FAILURE(ec) || isFrozen()) return *this; - + if (U_FAILURE(ec)) { return *this; } + // All of the following check isFrozen() before modifying this set. 
if (prop == UCHAR_GENERAL_CATEGORY_MASK) { - applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec); + const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec); + applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec); } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { + const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); UScriptCode script = (UScriptCode)value; - applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec); + applyFilter(scriptExtensionsFilter, &script, inclusions, ec); + } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) { + if (value == 0 || value == 1) { + const USet *set = u_getBinaryPropertySet(prop, &ec); + if (U_FAILURE(ec)) { return *this; } + copyFrom(*UnicodeSet::fromUSet(set), TRUE); + if (value == 0) { + complement(); + } + } else { + clear(); + } + } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { + const UCPMap *map = u_getIntPropertyMap(prop, &ec); + applyIntPropertyValue(map, intValueFilter, &value, ec); } else { - IntPropertyContext c = {prop, value}; - applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec); + // This code used to always call getInclusions(property source) + // which sets an error for an unsupported property. + ec = U_ILLEGAL_ARGUMENT_ERROR; + // Otherwise we would just clear() this set because + // getIntPropertyValue(c, prop) returns 0 for all code points. 
} return *this; } @@ -1061,7 +949,8 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, if (*end != 0) { FAIL(ec); } - applyFilter(numericValueFilter, &val, UPROPS_SRC_CHAR, ec); + applyFilter(numericValueFilter, &val, + CharacterProperties::getInclusionsForProperty(p, ec), ec); return *this; } case UCHAR_NAME: @@ -1090,7 +979,8 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop, if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); UVersionInfo version; u_versionFromString(version, buf); - applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec); + applyFilter(versionFilter, &version, + CharacterProperties::getInclusionsForProperty(p, ec), ec); return *this; } case UCHAR_SCRIPT_EXTENSIONS: diff --git a/icu4c/source/common/uprops.cpp b/icu4c/source/common/uprops.cpp index 9738881b1dc..2421c15d2bd 100644 --- a/icu4c/source/common/uprops.cpp +++ b/icu4c/source/common/uprops.cpp @@ -605,7 +605,7 @@ uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *p // Add the start code point of each same-value range of the trie. 
UChar32 start = 0, end; - while ((end = ucptrie_getRange(trie, start, UCPTRIE_RANGE_NORMAL, 0, + while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, nullptr)) >= 0) { sa->add(sa->set, start); start = end + 1; diff --git a/icu4c/source/common/uprops.h b/icu4c/source/common/uprops.h index 0896973da32..1a8e4e84f74 100644 --- a/icu4c/source/common/uprops.h +++ b/icu4c/source/common/uprops.h @@ -459,6 +459,13 @@ U_NAMESPACE_BEGIN class UnicodeSet; +class CharacterProperties { +public: + CharacterProperties() = delete; + static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode); + static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode); +}; + // implemented in uniset_props.cpp U_CFUNC UnicodeSet * uniset_getUnicode32Instance(UErrorCode &errorCode); diff --git a/icu4c/source/test/cintltst/cucdtst.c b/icu4c/source/test/cintltst/cucdtst.c index 4e18cfa0025..059bd72adad 100644 --- a/icu4c/source/test/cintltst/cucdtst.c +++ b/icu4c/source/test/cintltst/cucdtst.c @@ -61,6 +61,8 @@ static void TestPropertyNames(void); static void TestPropertyValues(void); static void TestConsistency(void); static void TestCaseFolding(void); +static void TestBinaryCharacterPropertiesAPI(void); +static void TestIntCharacterPropertiesAPI(void); /* internal methods used */ static int32_t MakeProp(char* str); @@ -196,6 +198,10 @@ void addUnicodeTest(TestNode** root) addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues"); addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency"); addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding"); + addTest(root, &TestBinaryCharacterPropertiesAPI, + "tsutil/cucdtst/TestBinaryCharacterPropertiesAPI"); + addTest(root, &TestIntCharacterPropertiesAPI, + "tsutil/cucdtst/TestIntCharacterPropertiesAPI"); } /*==================================================== */ @@ -3522,3 +3528,41 @@ TestCaseFolding() { uset_close(data.notSeen); } + +static 
void TestBinaryCharacterPropertiesAPI() { + // API test only. See intltest/ucdtest.cpp for functional test. + UErrorCode errorCode = U_ZERO_ERROR; + const USet *set = u_getBinaryPropertySet(-1, &errorCode); + if (U_SUCCESS(errorCode)) { + log_err("u_getBinaryPropertySet(-1) did not fail\n"); + } + errorCode = U_ZERO_ERROR; + set = u_getBinaryPropertySet(UCHAR_BINARY_LIMIT, &errorCode); + if (U_SUCCESS(errorCode)) { + log_err("u_getBinaryPropertySet(UCHAR_BINARY_LIMIT) did not fail\n"); + } + errorCode = U_ZERO_ERROR; + set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &errorCode); + if (!uset_contains(set, 0x20) || uset_contains(set, 0x61)) { + log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) wrong contents\n"); + } +} + +static void TestIntCharacterPropertiesAPI() { + // API test only. See intltest/ucdtest.cpp for functional test. + UErrorCode errorCode = U_ZERO_ERROR; + const UCPMap *map = u_getIntPropertyMap(UCHAR_INT_START - 1, &errorCode); + if (U_SUCCESS(errorCode)) { + log_err("u_getIntPropertyMap(UCHAR_INT_START - 1) did not fail\n"); + } + errorCode = U_ZERO_ERROR; + map = u_getIntPropertyMap(UCHAR_INT_LIMIT, &errorCode); + if (U_SUCCESS(errorCode)) { + log_err("u_getIntPropertyMap(UCHAR_INT_LIMIT) did not fail\n"); + } + errorCode = U_ZERO_ERROR; + map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &errorCode); + if (ucpmap_get(map, 0x20) != U_SPACE_SEPARATOR || ucpmap_get(map, 0x23456) != U_OTHER_LETTER) { + log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) wrong contents\n"); + } +} diff --git a/icu4c/source/test/cintltst/ucptrietest.c b/icu4c/source/test/cintltst/ucptrietest.c index 9969a62937a..299ef900b83 100644 --- a/icu4c/source/test/cintltst/ucptrietest.c +++ b/icu4c/source/test/cintltst/ucptrietest.c @@ -107,11 +107,11 @@ static UChar32 iterStarts[] = { static void testTrieGetRanges(const char *testName, const UCPTrie *trie, const UMutableCPTrie *mutableTrie, - UCPTrieRangeOption option, uint32_t surrValue, + UCPMapRangeOption option, uint32_t 
surrValue, const CheckRange checkRanges[], int32_t countCheckRanges) { const char *const typeName = trie == NULL ? "mutableTrie" : "trie"; - const char *const optionName = option == UCPTRIE_RANGE_NORMAL ? "normal" : - option == UCPTRIE_RANGE_FIXED_LEAD_SURROGATES ? "fixedLeadSurr" : "fixedAllSurr"; + const char *const optionName = option == UCPMAP_RANGE_NORMAL ? "normal" : + option == UCPMAP_RANGE_FIXED_LEAD_SURROGATES ? "fixedLeadSurr" : "fixedAllSurr"; char name[80]; int32_t s; for (s = 0; s < UPRV_LENGTHOF(iterStarts); ++s) { @@ -690,7 +690,7 @@ testTrie(const char *testName, const UCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth, const CheckRange checkRanges[], int32_t countCheckRanges) { testTrieGetters(testName, trie, type, valueWidth, checkRanges, countCheckRanges); - testTrieGetRanges(testName, trie, NULL, UCPTRIE_RANGE_NORMAL, 0, checkRanges, countCheckRanges); + testTrieGetRanges(testName, trie, NULL, UCPMAP_RANGE_NORMAL, 0, checkRanges, countCheckRanges); if (type == UCPTRIE_TYPE_FAST) { testTrieUTF16(testName, trie, valueWidth, checkRanges, countCheckRanges); testTrieUTF8(testName, trie, valueWidth, checkRanges, countCheckRanges); @@ -701,7 +701,7 @@ static void testBuilder(const char *testName, const UMutableCPTrie *mutableTrie, const CheckRange checkRanges[], int32_t countCheckRanges) { testBuilderGetters(testName, mutableTrie, checkRanges, countCheckRanges); - testTrieGetRanges(testName, NULL, mutableTrie, UCPTRIE_RANGE_NORMAL, 0, checkRanges, countCheckRanges); + testTrieGetRanges(testName, NULL, mutableTrie, UCPMAP_RANGE_NORMAL, 0, checkRanges, countCheckRanges); } static uint32_t storage[120000]; @@ -1366,7 +1366,7 @@ MuchDataTest(void) { } static void testGetRangesFixedSurr(const char *testName, const UMutableCPTrie *mutableTrie, - UCPTrieRangeOption option, + UCPMapRangeOption option, const CheckRange checkRanges[], int32_t countCheckRanges) { testTrieGetRanges(testName, NULL, mutableTrie, option, 5, checkRanges, countCheckRanges); 
UErrorCode errorCode = U_ZERO_ERROR; @@ -1454,9 +1454,9 @@ TrieTestGetRangesFixedSurr(void) { if (mutableTrie == NULL) { return; } - testGetRangesFixedSurr("fixedLeadSurr1", mutableTrie, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, + testGetRangesFixedSurr("fixedLeadSurr1", mutableTrie, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, checkRangesFixedLeadSurr1, UPRV_LENGTHOF(checkRangesFixedLeadSurr1)); - testGetRangesFixedSurr("fixedAllSurr1", mutableTrie, UCPTRIE_RANGE_FIXED_ALL_SURROGATES, + testGetRangesFixedSurr("fixedAllSurr1", mutableTrie, UCPMAP_RANGE_FIXED_ALL_SURROGATES, checkRangesFixedAllSurr1, UPRV_LENGTHOF(checkRangesFixedAllSurr1)); // Setting a range in the middle of lead surrogates makes no difference. umutablecptrie_setRange(mutableTrie, 0xd844, 0xd899, 5, &errorCode); @@ -1465,7 +1465,7 @@ TrieTestGetRangesFixedSurr(void) { umutablecptrie_close(mutableTrie); return; } - testGetRangesFixedSurr("fixedLeadSurr2", mutableTrie, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, + testGetRangesFixedSurr("fixedLeadSurr2", mutableTrie, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, checkRangesFixedLeadSurr1, UPRV_LENGTHOF(checkRangesFixedLeadSurr1)); // Bridge the gap before the lead surrogates. umutablecptrie_set(mutableTrie, 0xd7ff, 5, &errorCode); @@ -1474,9 +1474,9 @@ TrieTestGetRangesFixedSurr(void) { umutablecptrie_close(mutableTrie); return; } - testGetRangesFixedSurr("fixedLeadSurr3", mutableTrie, UCPTRIE_RANGE_FIXED_LEAD_SURROGATES, + testGetRangesFixedSurr("fixedLeadSurr3", mutableTrie, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, checkRangesFixedLeadSurr3, UPRV_LENGTHOF(checkRangesFixedLeadSurr3)); - testGetRangesFixedSurr("fixedAllSurr3", mutableTrie, UCPTRIE_RANGE_FIXED_ALL_SURROGATES, + testGetRangesFixedSurr("fixedAllSurr3", mutableTrie, UCPMAP_RANGE_FIXED_ALL_SURROGATES, checkRangesFixedAllSurr3, UPRV_LENGTHOF(checkRangesFixedAllSurr3)); // Bridge the gap after the trail surrogates. 
umutablecptrie_set(mutableTrie, 0xe000, 5, &errorCode); @@ -1485,7 +1485,7 @@ TrieTestGetRangesFixedSurr(void) { umutablecptrie_close(mutableTrie); return; } - testGetRangesFixedSurr("fixedSurr4", mutableTrie, UCPTRIE_RANGE_FIXED_ALL_SURROGATES, + testGetRangesFixedSurr("fixedSurr4", mutableTrie, UCPMAP_RANGE_FIXED_ALL_SURROGATES, checkRangesFixedSurr4, UPRV_LENGTHOF(checkRangesFixedSurr4)); umutablecptrie_close(mutableTrie); } diff --git a/icu4c/source/test/intltest/ucdtest.cpp b/icu4c/source/test/intltest/ucdtest.cpp index 26a1d23ab67..cdad0ae7c98 100644 --- a/icu4c/source/test/intltest/ucdtest.cpp +++ b/icu4c/source/test/intltest/ucdtest.cpp @@ -7,13 +7,16 @@ #include "unicode/ustring.h" #include "unicode/uchar.h" +#include "unicode/ucpmap.h" #include "unicode/uniset.h" #include "unicode/putil.h" #include "unicode/uscript.h" +#include "unicode/uset.h" #include "cstring.h" #include "hash.h" #include "patternprops.h" #include "normalizer2impl.h" +#include "testutil.h" #include "uparse.h" #include "ucdtest.h" @@ -67,6 +70,8 @@ void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, TESTCASE_AUTO(TestVerticalOrientation); TESTCASE_AUTO(TestDefaultScriptExtensions); TESTCASE_AUTO(TestInvalidCodePointFolding); + TESTCASE_AUTO(TestBinaryCharacterProperties); + TESTCASE_AUTO(TestIntCharacterProperties); TESTCASE_AUTO_END; } @@ -615,3 +620,73 @@ void UnicodeTest::TestInvalidCodePointFolding(void) { cp, u_foldCase(cp, U_FOLD_CASE_EXCLUDE_SPECIAL_I)); } } + +void UnicodeTest::TestBinaryCharacterProperties() { + IcuTestErrorCode errorCode(*this, "TestBinaryCharacterProperties()"); + // Spot-check getBinaryPropertySet() vs. hasBinaryProperty(). 
+ for (int32_t prop = 0; prop < UCHAR_BINARY_LIMIT; ++prop) { + const USet *uset = u_getBinaryPropertySet((UProperty)prop, errorCode); + if (errorCode.errIfFailureAndReset("u_getBinaryPropertySet(%d)", (int)prop)) { + continue; + } + const UnicodeSet &set = *UnicodeSet::fromUSet(uset); + int32_t size = set.size(); + if (size == 0) { + assertFalse(UnicodeString("!hasBinaryProperty(U+0020, ") + prop + u")", + u_hasBinaryProperty(0x20, (UProperty)prop)); + assertFalse(UnicodeString("!hasBinaryProperty(U+0061, ") + prop + u")", + u_hasBinaryProperty(0x61, (UProperty)prop)); + assertFalse(UnicodeString("!hasBinaryProperty(U+4E00, ") + prop + u")", + u_hasBinaryProperty(0x4e00, (UProperty)prop)); + } else { + UChar32 c = set.charAt(0); + if (c > 0) { + assertFalse( + UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c - 1) + + u", " + prop + u")", + u_hasBinaryProperty(c - 1, (UProperty)prop)); + } + assertTrue( + UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) + + u", " + prop + u")", + u_hasBinaryProperty(c, (UProperty)prop)); + c = set.charAt(size - 1); + assertTrue( + UnicodeString("hasBinaryProperty(") + TestUtility::hex(c) + + u", " + prop + u")", + u_hasBinaryProperty(c, (UProperty)prop)); + if (c < 0x10ffff) { + assertFalse( + UnicodeString("!hasBinaryProperty(") + TestUtility::hex(c + 1) + + u", " + prop + u")", + u_hasBinaryProperty(c + 1, (UProperty)prop)); + } + } + } +} + +void UnicodeTest::TestIntCharacterProperties() { + IcuTestErrorCode errorCode(*this, "TestIntCharacterProperties()"); + // Spot-check getIntPropertyMap() vs. getIntPropertyValue(). 
+ for (int32_t prop = UCHAR_INT_START; prop < UCHAR_INT_LIMIT; ++prop) { + const UCPMap *map = u_getIntPropertyMap((UProperty)prop, errorCode); + if (errorCode.errIfFailureAndReset("u_getIntPropertyMap(%d)", (int)prop)) { + continue; + } + uint32_t value; + UChar32 end = ucpmap_getRange(map, 0, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value); + assertTrue("int property first range", end >= 0); + UChar32 c = end / 2; + assertEquals(UnicodeString("int property first range value at ") + TestUtility::hex(c), + u_getIntPropertyValue(c, (UProperty)prop), value); + end = ucpmap_getRange(map, 0x5000, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value); + assertTrue("int property later range", end >= 0); + assertEquals(UnicodeString("int property later range value at ") + TestUtility::hex(end), + u_getIntPropertyValue(end, (UProperty)prop), value); + // ucpmap_get() API coverage + // TODO: move to cucdtst.c + assertEquals( + "int property ucpmap_get(U+0061)", + u_getIntPropertyValue(0x61, (UProperty)prop), ucpmap_get(map, 0x61)); + } +} diff --git a/icu4c/source/test/intltest/ucdtest.h b/icu4c/source/test/intltest/ucdtest.h index 1fe75e7eb93..2ed1395b50f 100644 --- a/icu4c/source/test/intltest/ucdtest.h +++ b/icu4c/source/test/intltest/ucdtest.h @@ -46,6 +46,8 @@ public: void TestVerticalOrientation(); void TestDefaultScriptExtensions(); void TestInvalidCodePointFolding(); + void TestBinaryCharacterProperties(); + void TestIntCharacterProperties(); private: diff --git a/icu4c/source/tools/gennorm2/n2builder.cpp b/icu4c/source/tools/gennorm2/n2builder.cpp index f3b7cafc39e..1d388852312 100644 --- a/icu4c/source/tools/gennorm2/n2builder.cpp +++ b/icu4c/source/tools/gennorm2/n2builder.cpp @@ -650,7 +650,7 @@ LocalUCPTriePointer Normalizer2DataBuilder::processData() { // First check that surrogate code *points* are inert. // The parser should have rejected values/mappings for them.
uint32_t value; - UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPTRIE_RANGE_NORMAL, 0, + UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value); if (value != Normalizer2Impl::INERT || end < 0xdfff) { fprintf(stderr, @@ -665,7 +665,7 @@ LocalUCPTriePointer Normalizer2DataBuilder::processData() { end = 0; for (UChar32 start = 0x10000;;) { if (start > end) { - end = umutablecptrie_getRange(norm16Trie, start, UCPTRIE_RANGE_NORMAL, 0, + end = umutablecptrie_getRange(norm16Trie, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value); if (end < 0) { break; } } diff --git a/icu4c/source/tools/gennorm2/norms.cpp b/icu4c/source/tools/gennorm2/norms.cpp index da7e2a80917..96692f233cc 100644 --- a/icu4c/source/tools/gennorm2/norms.cpp +++ b/icu4c/source/tools/gennorm2/norms.cpp @@ -156,7 +156,7 @@ UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t high void Norms::enumRanges(Enumerator &e) { UChar32 start = 0, end; uint32_t i; - while ((end = umutablecptrie_getRange(normTrie, start, UCPTRIE_RANGE_NORMAL, 0, + while ((end = umutablecptrie_getRange(normTrie, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &i)) >= 0) { if (i > 0) { e.rangeHandler(start, end, norms[i]); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterPropertiesImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterPropertiesImpl.java new file mode 100644 index 00000000000..41b005b7c6b --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CharacterPropertiesImpl.java @@ -0,0 +1,86 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package com.ibm.icu.impl; + +import com.ibm.icu.text.UnicodeSet; + +/** + * Properties functionality above class UCharacterProperty + * but below class CharacterProperties and class UnicodeSet. 
+ */ +public final class CharacterPropertiesImpl { + /** + * A set of all characters _except_ the second through last characters of + * certain ranges. These ranges are ranges of characters whose + * properties are all exactly alike, e.g. CJK Ideographs from + * U+4E00 to U+9FA5. + */ + private static final UnicodeSet inclusions[] = new UnicodeSet[UCharacterProperty.SRC_COUNT]; + + /** For {@link UnicodeSet#setDefaultXSymbolTable}. */ + public static synchronized void clear() { + for (int i = 0; i < inclusions.length; ++i) { + inclusions[i] = null; + } + } + + private static synchronized UnicodeSet getInclusionsForSource(int src) { + if (inclusions[src] == null) { + UnicodeSet incl = new UnicodeSet(); + switch(src) { + case UCharacterProperty.SRC_CHAR: + UCharacterProperty.INSTANCE.addPropertyStarts(incl); + break; + case UCharacterProperty.SRC_PROPSVEC: + UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl); + break; + case UCharacterProperty.SRC_CHAR_AND_PROPSVEC: + UCharacterProperty.INSTANCE.addPropertyStarts(incl); + UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl); + break; + case UCharacterProperty.SRC_CASE_AND_NORM: + Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl); + UCaseProps.INSTANCE.addPropertyStarts(incl); + break; + case UCharacterProperty.SRC_NFC: + Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl); + break; + case UCharacterProperty.SRC_NFKC: + Norm2AllModes.getNFKCInstance().impl.addPropertyStarts(incl); + break; + case UCharacterProperty.SRC_NFKC_CF: + Norm2AllModes.getNFKC_CFInstance().impl.addPropertyStarts(incl); + break; + case UCharacterProperty.SRC_NFC_CANON_ITER: + Norm2AllModes.getNFCInstance().impl.addCanonIterPropertyStarts(incl); + break; + case UCharacterProperty.SRC_CASE: + UCaseProps.INSTANCE.addPropertyStarts(incl); + break; + case UCharacterProperty.SRC_BIDI: + UBiDiProps.INSTANCE.addPropertyStarts(incl); + break; + case UCharacterProperty.SRC_INPC: + case UCharacterProperty.SRC_INSC: + 
case UCharacterProperty.SRC_VO: + UCharacterProperty.INSTANCE.ulayout_addPropertyStarts(src, incl); + break; + default: + throw new IllegalStateException("getInclusions(unknown src " + src + ")"); + } + // We do not freeze() the set because we only iterate over it, + // rather than testing contains(), + // so the extra time and memory to optimize that are not necessary. + inclusions[src] = incl; + } + return inclusions[src]; + } + + /** + * Returns a mutable UnicodeSet -- do not modify! + */ + public static UnicodeSet getInclusionsForProperty(int prop) { + int src = UCharacterProperty.INSTANCE.getSource(prop); + return getInclusionsForSource(src); + } +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java index dad93aaed1a..12c53d6e003 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCharacterProperty.java @@ -1535,7 +1535,7 @@ public final class UCharacterProperty return -1; // undefined } - public final int getSource(int which) { + final int getSource(int which) { if(whichFor details see the method descriptions. + * For lookup of property values by code point see class {@link UCharacter}. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. 
+ */ +public final class CharacterProperties { + private CharacterProperties() {} // all-static + + private static final UnicodeSet sets[] = new UnicodeSet[UProperty.BINARY_LIMIT]; + private static final CodePointMap maps[] = new CodePointMap[UProperty.INT_LIMIT - UProperty.INT_START]; + + private static UnicodeSet makeSet(int property) { + UnicodeSet set = new UnicodeSet(); + UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(property); + int numRanges = inclusions.getRangeCount(); + int startHasProperty = -1; + + for (int i = 0; i < numRanges; ++i) { + int rangeEnd = inclusions.getRangeEnd(i); + for (int c = inclusions.getRangeStart(i); c <= rangeEnd; ++c) { + // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch. + if (UCharacter.hasBinaryProperty(c, property)) { + if (startHasProperty < 0) { + // Transition from false to true. + startHasProperty = c; + } + } else if (startHasProperty >= 0) { + // Transition from true to false. + set.add(startHasProperty, c - 1); + startHasProperty = -1; + } + } + } + if (startHasProperty >= 0) { + set.add(startHasProperty, 0x10FFFF); + } + + return set.freeze(); + } + + private static CodePointMap makeMap(int property) { + int nullValue = property == UProperty.SCRIPT ? UScript.UNKNOWN : 0; + MutableCodePointTrie mutableTrie = new MutableCodePointTrie(nullValue, nullValue); + UnicodeSet inclusions = CharacterPropertiesImpl.getInclusionsForProperty(property); + int numRanges = inclusions.getRangeCount(); + int start = 0; + int value = nullValue; + + for (int i = 0; i < numRanges; ++i) { + int rangeEnd = inclusions.getRangeEnd(i); + for (int c = inclusions.getRangeStart(i); c <= rangeEnd; ++c) { + // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch. 
+ int nextValue = UCharacter.getIntPropertyValue(c, property); + if (value != nextValue) { + if (value != nullValue) { + mutableTrie.setRange(start, c - 1, value); + } + start = c; + value = nextValue; + } + } + } + if (value != 0) { + mutableTrie.setRange(start, 0x10FFFF, value); + } + + CodePointTrie.Type type; + if (property == UProperty.BIDI_CLASS || property == UProperty.GENERAL_CATEGORY) { + type = CodePointTrie.Type.FAST; + } else { + type = CodePointTrie.Type.SMALL; + } + CodePointTrie.ValueWidth valueWidth; + // TODO: UCharacterProperty.IntProperty + int max = UCharacter.getIntPropertyMaxValue(property); + if (max <= 0xff) { + valueWidth = CodePointTrie.ValueWidth.BITS_8; + } else if (max <= 0xffff) { + valueWidth = CodePointTrie.ValueWidth.BITS_16; + } else { + valueWidth = CodePointTrie.ValueWidth.BITS_32; + } + return mutableTrie.buildImmutable(type, valueWidth); + } + + /** + * Returns a frozen UnicodeSet for a binary property. + * Throws an exception if the property number is not one for a binary property. + * + *

The returned set contains all code points for which the property is true. + * + * @param property {@link UProperty#BINARY_START}..{@link UProperty#BINARY_LIMIT}-1 + * @return the property as a set + * @see UProperty + * @see UCharacter#hasBinaryProperty + */ + public static final UnicodeSet getBinaryPropertySet(int property) { + if (property < 0 || UProperty.BINARY_LIMIT <= property) { + throw new IllegalArgumentException("" + property + + " is not a constant for a UProperty binary property"); + } + synchronized(sets) { + UnicodeSet set = sets[property]; + if (set == null) { + sets[property] = set = makeSet(property); + } + return set; + } + } + + /** + * Returns an immutable CodePointMap for an enumerated/catalog/int-valued property. + * Throws an exception if the property number is not one for an "int property". + * + *

The returned object maps all Unicode code points to their values for that property. + * For documentation of the integer values see {@link UCharacter#getIntPropertyValue(int, int)}. + * + *

The actual type of the returned object differs between properties + * and may change over time. + * + * @param property {@link UProperty#INT_START}..{@link UProperty#INT_LIMIT}-1 + * @return the property as a map + * @see UProperty + * @see UCharacter#getIntPropertyValue + */ + public static final CodePointMap getIntPropertyMap(int property) { + if (property < UProperty.INT_START || UProperty.INT_LIMIT <= property) { + throw new IllegalArgumentException("" + property + + " is not a constant for a UProperty int property"); + } + synchronized(maps) { + CodePointMap map = maps[property - UProperty.INT_START]; + if (map == null) { + maps[property - UProperty.INT_START] = map = makeMap(property); + } + return map; + } + } +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java index 184528b151c..a738359fb92 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/lang/UCharacter.java @@ -5698,7 +5698,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection } /** - * {@icu}

Check a binary Unicode property for a code point. + * {@icu} Check a binary Unicode property for a code point. *

Unicode, especially in version 3.2, defines many more properties * than the original set in UnicodeData.txt. *

This API is intended to reflect Unicode properties as defined in @@ -5720,6 +5720,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection * Unicode version does not have data for the property at all, or * not for this code point. * @see com.ibm.icu.lang.UProperty + * @see CharacterProperties#getBinaryPropertySet(int) * @stable ICU 2.6 */ public static boolean hasBinaryProperty(int ch, int property) @@ -5777,7 +5778,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection } /** - * {@icu}

Returns the property value for an Unicode property type of a code point. + * {@icu} Returns the property value for a Unicode property type of a code point. * Also returns binary and mask property values. *

Unicode, especially in version 3.2, defines many more properties than * the original set in UnicodeData.txt. @@ -5801,8 +5802,9 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection * UProperty.MASK_START <= type < UProperty.MASK_LIMIT. * @return numeric value that is directly the property value or, * for enumerated properties, corresponds to the numeric value of - * the enumerated constant of the respective property value - * enumeration type (cast to enum type if necessary). + * the enumerated constant of the respective property value type + * ({@link ECharacterCategory}, {@link ECharacterDirection}, + * {@link DecompositionType}, etc.). * Returns 0 or 1 (for false / true) for binary Unicode properties. * Returns a bit-mask for mask properties. * Returns 0 if 'type' is out of bounds or if the Unicode version @@ -5812,6 +5814,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection * @see #hasBinaryProperty * @see #getIntPropertyMinValue * @see #getIntPropertyMaxValue + * @see CharacterProperties#getIntPropertyMap(int) * @see #getUnicodeVersion * @stable ICU 2.4 */ diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java index a5afcbfb3f7..91143499660 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java @@ -18,21 +18,21 @@ import java.util.NoSuchElementException; import java.util.TreeSet; import com.ibm.icu.impl.BMPSet; -import com.ibm.icu.impl.Norm2AllModes; +import com.ibm.icu.impl.CharacterPropertiesImpl; import com.ibm.icu.impl.PatternProps; import com.ibm.icu.impl.RuleCharacterIterator; import com.ibm.icu.impl.SortedSetRelation; import com.ibm.icu.impl.StringRange; -import com.ibm.icu.impl.UBiDiProps; import com.ibm.icu.impl.UCaseProps; -import com.ibm.icu.impl.UCharacterProperty; import com.ibm.icu.impl.UPropertyAliases; import 
com.ibm.icu.impl.UnicodeSetStringSpan; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.lang.CharacterProperties; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.lang.UScript; +import com.ibm.icu.util.CodePointMap; import com.ibm.icu.util.Freezable; import com.ibm.icu.util.ICUUncheckedIOException; import com.ibm.icu.util.OutputInt; @@ -346,14 +346,6 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F] private static final String ASSIGNED = "Assigned"; // [:^Cn:] - /** - * A set of all characters _except_ the second through last characters of - * certain ranges. These ranges are ranges of characters whose - * properties are all exactly alike, e.g. CJK Ideographs from - * U+4E00 to U+9FA5. - */ - private static UnicodeSet INCLUSIONS[] = null; - private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null. private volatile UnicodeSetStringSpan stringSpan; //---------------------------------------------------------------- @@ -520,8 +512,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa /** * Make this object represent the range start - end. - * If end > start then this object is set to an - * an empty range. + * If start > end then this object is set to an empty range.
* * @param start first character in the set, inclusive * @param end last character in the set, inclusive @@ -3186,7 +3177,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa boolean contains(int codePoint); } - private static class NumericValueFilter implements Filter { + private static final class NumericValueFilter implements Filter { double value; NumericValueFilter(double value) { this.value = value; } @Override @@ -3195,29 +3186,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } } - private static class GeneralCategoryMaskFilter implements Filter { - int mask; - GeneralCategoryMaskFilter(int mask) { this.mask = mask; } - @Override - public boolean contains(int ch) { - return ((1 << UCharacter.getType(ch)) & mask) != 0; - } - } - - private static class IntPropertyFilter implements Filter { - int prop; - int value; - IntPropertyFilter(int prop, int value) { - this.prop = prop; - this.value = value; - } - @Override - public boolean contains(int ch) { - return UCharacter.getIntPropertyValue(ch, prop) == value; - } - } - - private static class ScriptExtensionsFilter implements Filter { + private static final class ScriptExtensionsFilter implements Filter { int script; ScriptExtensionsFilter(int script) { this.script = script; } @Override @@ -3229,7 +3198,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa // VersionInfo for unassigned characters private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); - private static class VersionFilter implements Filter { + private static final class VersionFilter implements Filter { VersionInfo version; VersionFilter(VersionInfo version) { this.version = version; } @Override @@ -3242,62 +3211,10 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa } } - private static synchronized UnicodeSet getInclusions(int src) { - if (INCLUSIONS == null) { - INCLUSIONS = new 
UnicodeSet[UCharacterProperty.SRC_COUNT]; - } - if(INCLUSIONS[src] == null) { - UnicodeSet incl = new UnicodeSet(); - switch(src) { - case UCharacterProperty.SRC_CHAR: - UCharacterProperty.INSTANCE.addPropertyStarts(incl); - break; - case UCharacterProperty.SRC_PROPSVEC: - UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl); - break; - case UCharacterProperty.SRC_CHAR_AND_PROPSVEC: - UCharacterProperty.INSTANCE.addPropertyStarts(incl); - UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl); - break; - case UCharacterProperty.SRC_CASE_AND_NORM: - Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl); - UCaseProps.INSTANCE.addPropertyStarts(incl); - break; - case UCharacterProperty.SRC_NFC: - Norm2AllModes.getNFCInstance().impl.addPropertyStarts(incl); - break; - case UCharacterProperty.SRC_NFKC: - Norm2AllModes.getNFKCInstance().impl.addPropertyStarts(incl); - break; - case UCharacterProperty.SRC_NFKC_CF: - Norm2AllModes.getNFKC_CFInstance().impl.addPropertyStarts(incl); - break; - case UCharacterProperty.SRC_NFC_CANON_ITER: - Norm2AllModes.getNFCInstance().impl.addCanonIterPropertyStarts(incl); - break; - case UCharacterProperty.SRC_CASE: - UCaseProps.INSTANCE.addPropertyStarts(incl); - break; - case UCharacterProperty.SRC_BIDI: - UBiDiProps.INSTANCE.addPropertyStarts(incl); - break; - case UCharacterProperty.SRC_INPC: - case UCharacterProperty.SRC_INSC: - case UCharacterProperty.SRC_VO: - UCharacterProperty.INSTANCE.ulayout_addPropertyStarts(src, incl); - break; - default: - throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")"); - } - INCLUSIONS[src] = incl; - } - return INCLUSIONS[src]; - } - /** * Generic filter-based scanning code for UCD property UnicodeSets. 
*/
-    private UnicodeSet applyFilter(Filter filter, int src) {
+    private void applyFilter(Filter filter, UnicodeSet inclusions) {
         // Logically, walk through all Unicode characters, noting the start
         // and end of each range for which filter.contain(c) is
         // true. Add each range to a set.
@@ -3305,13 +3222,12 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa
         // To improve performance, use an inclusions set which
         // encodes information about character ranges that are known
         // to have identical properties.
-        // getInclusions(src) contains exactly the first characters of
-        // same-value ranges for the given properties "source".
+        // inclusions contains the first characters of
+        // same-value ranges for the given property.
 
         clear();
 
         int startHasProperty = -1;
-        UnicodeSet inclusions = getInclusions(src);
         int limitRange = inclusions.getRangeCount();
 
         for (int j=0; j, Compa
         if (startHasProperty >= 0) {
             add_unchecked(startHasProperty, 0x10FFFF);
         }
-
-        return this;
     }
 
+    /** Value filter: returns 1 if the mask has the bit whose index is the map value, otherwise 0. */
+    private static final class GeneralCategoryMaskFilter implements CodePointMap.ValueFilter {
+        int mask;
+        GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
+        @Override
+        public int apply(int value) {
+            // Isolate the value'th bit of the mask, then normalize to 0 or 1.
+            int bit = (1 << value) & mask;
+            return bit == 0 ? 0 : 1;
+        }
+    }
+
+    /** Value filter: returns 1 for the one map value given at construction, otherwise 0. */
+    private static final class IntValueFilter implements CodePointMap.ValueFilter {
+        int v;
+        IntValueFilter(int value) { v = value; }
+        @Override
+        public int apply(int value) { return (value != v) ? 0 : 1; }
+    }
+
+    private void applyIntPropertyValue(CodePointMap map, CodePointMap.ValueFilter filter) {
+        clear();
+        CodePointMap.Range r = new CodePointMap.Range();
+        int next = 0;
+        // Walk the filtered ranges; add those whose filtered value is not 0.
+        while (map.getRange(next, filter, r)) {
+            int last = r.getEnd();
+            if (r.getValue() != 0) { add_unchecked(next, last); }
+            next = last + 1;
+        }
+    }
 
     /**
      * Remove leading and trailing Pattern_White_Space and compress
@@ -3393,13 +3338,31 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa
      * @stable ICU 2.4
      */
     public UnicodeSet applyIntPropertyValue(int prop, int value) {
-        checkFrozen();
+        // No checkFrozen() here: each of the following includes it before modifying this set.
         if (prop == UProperty.GENERAL_CATEGORY_MASK) {
-            applyFilter(new GeneralCategoryMaskFilter(value), UCharacterProperty.SRC_CHAR);
+            CodePointMap gcMap = CharacterProperties.getIntPropertyMap(UProperty.GENERAL_CATEGORY);
+            applyIntPropertyValue(gcMap, new GeneralCategoryMaskFilter(value));
         } else if (prop == UProperty.SCRIPT_EXTENSIONS) {
-            applyFilter(new ScriptExtensionsFilter(value), UCharacterProperty.SRC_PROPSVEC);
+            UnicodeSet scxInclusions = CharacterPropertiesImpl.getInclusionsForProperty(prop);
+            applyFilter(new ScriptExtensionsFilter(value), scxInclusions);
+        } else if (0 <= prop && prop < UProperty.BINARY_LIMIT) {
+            if (value != 0 && value != 1) {
+                // Only 0 and 1 are handled for a binary property; any other value yields the empty set.
+                clear();
+            } else {
+                set(CharacterProperties.getBinaryPropertySet(prop));
+                // value 0 selects the complement of the property's set.
+                if (value == 0) { complement(); }
+            }
+        } else if (UProperty.INT_START <= prop && prop < UProperty.INT_LIMIT) {
+            CodePointMap intMap = CharacterProperties.getIntPropertyMap(prop);
+            applyIntPropertyValue(intMap, new IntValueFilter(value));
         } else {
-            applyFilter(new IntPropertyFilter(prop, value), UCharacterProperty.INSTANCE.getSource(prop));
+            // Historically this path always called getInclusions(property source),
+            // which throws an exception for an unsupported property; keep throwing.
+            throw new IllegalArgumentException("unsupported property " + prop);
+            // The alternative would be to just clear() this set, because
+            // getIntPropertyValue(c, prop) returns 0 for all code points.
         }
         return this;
     }
@@ -3499,7 +3462,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa
                 case UProperty.NUMERIC_VALUE:
                 {
                     double value = Double.parseDouble(PatternProps.trimWhiteSpace(valueAlias));
-                    applyFilter(new NumericValueFilter(value), UCharacterProperty.SRC_CHAR);
+                    applyFilter(new NumericValueFilter(value),
+                            CharacterPropertiesImpl.getInclusionsForProperty(p));
                     return this;
                 }
                 case UProperty.NAME:
@@ -3525,7 +3489,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa
                     // VersionInfo.getInstance() does not do
                     // 'loose' matching.
                     VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
-                    applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
+                    applyFilter(new VersionFilter(version),
+                            CharacterPropertiesImpl.getInclusionsForProperty(p));
                     return this;
                 }
                 case UProperty.SCRIPT_EXTENSIONS:
@@ -4881,7 +4846,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa
      * of UnicodeSets.
      *

* WARNING: If this function is used with a UnicodeProperty, and the
-     * Unassigned characters (gc=Cn) are different than in ICU other than in ICU, you MUST call
+     * Unassigned characters (gc=Cn) are different than in ICU, you MUST call
      * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable}
      * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}.
      *
@@ -4891,7 +4856,9 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa
      */
     @Deprecated
     public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
-        INCLUSIONS = null; // If the properties override inclusions, these have to be regenerated.
+        // If the properties override inclusions, these have to be regenerated.
+        // TODO: Check if the Unicode Tools or Unicode Utilities really need this.
+        CharacterPropertiesImpl.clear();
         XSYMBOL_TABLE = xSymbolTable;
     }
 }
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java
index d7e4d913606..fc8e89dd33d 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterTest.java
@@ -25,6 +25,7 @@ import com.ibm.icu.impl.Normalizer2Impl;
 import com.ibm.icu.impl.PatternProps;
 import com.ibm.icu.impl.UCharacterName;
 import com.ibm.icu.impl.Utility;
+import com.ibm.icu.lang.CharacterProperties;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UCharacterCategory;
 import com.ibm.icu.lang.UCharacterDirection;
@@ -35,6 +36,7 @@ import com.ibm.icu.text.Normalizer2;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.CodePointMap;
 import com.ibm.icu.util.RangeValueIterator;
 import com.ibm.icu.util.ULocale;
 import com.ibm.icu.util.ValueIterator;
@@ -3641,4 +3643,93 @@ public final class UCharacterTest extends TestFmwk
         int output = UCharacter.getCharFromNameAlias(alias);
         assertEquals("alias for '" + input + "'", input, output);
     }
+
+    /**
+     * Checks that getBinaryPropertySet() rejects property codes just outside
+     * 0..BINARY_LIMIT-1, then spot-checks every binary property set against
+     * UCharacter.hasBinaryProperty() at the edges of the set.
+     */
+    @Test
+    public void TestBinaryCharacterProperties() {
+        // One try block per invalid argument: in a shared block, the exception
+        // expected from the first call would skip the second call entirely.
+        // NOTE(review): relies on fail() not throwing a subclass of Exception
+        // (as with JUnit's AssertionError) -- confirm against TestFmwk.
+        try {
+            CharacterProperties.getBinaryPropertySet(-1);
+            fail("getBinaryPropertySet(-1) did not throw an exception");
+        } catch(Exception expected) {
+        }
+        try {
+            CharacterProperties.getBinaryPropertySet(UProperty.BINARY_LIMIT);
+            fail("getBinaryPropertySet(BINARY_LIMIT) did not throw an exception");
+        } catch(Exception expected) {
+        }
+        // Spot-check getBinaryPropertySet() vs. hasBinaryProperty().
+        for (int prop = 0; prop < UProperty.BINARY_LIMIT; ++prop) {
+            UnicodeSet set = CharacterProperties.getBinaryPropertySet(prop);
+            int size = set.size();
+            if (size == 0) {
+                // Empty set: a few sample code points must not have the property.
+                assertFalse("!hasBinaryProperty(U+0020, " + prop + ')',
+                        UCharacter.hasBinaryProperty(0x20, prop));
+                assertFalse("!hasBinaryProperty(U+0061, " + prop + ')',
+                        UCharacter.hasBinaryProperty(0x61, prop));
+                assertFalse("!hasBinaryProperty(U+4E00, " + prop + ')',
+                        UCharacter.hasBinaryProperty(0x4e00, prop));
+            } else {
+                // Element 0 must have the property; the code point just below it must not.
+                int c = set.charAt(0);
+                if (c > 0) {
+                    assertFalse("!hasBinaryProperty(" + Utility.hex(c - 1) + ", " + prop + ')',
+                            UCharacter.hasBinaryProperty(c - 1, prop));
+                }
+                assertTrue("hasBinaryProperty(" + Utility.hex(c) + ", " + prop + ')',
+                        UCharacter.hasBinaryProperty(c, prop));
+                // Likewise for element size-1 and the code point just above it.
+                c = set.charAt(size - 1);
+                assertTrue("hasBinaryProperty(" + Utility.hex(c) + ", " + prop + ')',
+                        UCharacter.hasBinaryProperty(c, prop));
+                if (c < 0x10ffff) {
+                    assertFalse("!hasBinaryProperty(" + Utility.hex(c + 1) + ", " + prop + ')',
+                            UCharacter.hasBinaryProperty(c + 1, prop));
+                }
+            }
+        }
+    }
+
+    /**
+     * Checks that getIntPropertyMap() rejects property codes just outside
+     * INT_START..INT_LIMIT-1, then spot-checks two ranges of every int property
+     * map against UCharacter.getIntPropertyValue().
+     */
+    @Test
+    public void TestIntCharacterProperties() {
+        // One try block per invalid argument, so that both calls are really made.
+        try {
+            CharacterProperties.getIntPropertyMap(UProperty.INT_START - 1);
+            fail("getIntPropertyMap(INT_START-1) did not throw an exception");
+        } catch(Exception expected) {
+        }
+        try {
+            CharacterProperties.getIntPropertyMap(UProperty.INT_LIMIT);
+            fail("getIntPropertyMap(INT_LIMIT) did not throw an exception");
+        } catch(Exception expected) {
+        }
+        // Spot-check getIntPropertyMap() vs. getIntPropertyValue().
+        CodePointMap.Range range = new CodePointMap.Range();
+        for (int prop = UProperty.INT_START; prop < UProperty.INT_LIMIT; ++prop) {
+            CodePointMap map = CharacterProperties.getIntPropertyMap(prop);
+            // Range requested from U+0000: compare the value in the middle of the range.
+            assertTrue("int property first range", map.getRange(0, null, range));
+            int c = (range.getStart() + range.getEnd()) / 2;
+            assertEquals("int property first range value at " + Utility.hex(c),
+                    UCharacter.getIntPropertyValue(c, prop), range.getValue());
+            // Range requested from U+5000: compare the value at the end of the range.
+            assertTrue("int property later range", map.getRange(0x5000, null, range));
+            int end = range.getEnd();
+            assertEquals("int property later range value at " + Utility.hex(end),
+                    UCharacter.getIntPropertyValue(end, prop), range.getValue());
+        }
+    }
 }