diff --git a/icu4c/source/common/uprops.h b/icu4c/source/common/uprops.h
index efb8d40eea7..7ba52cade7c 100644
--- a/icu4c/source/common/uprops.h
+++ b/icu4c/source/common/uprops.h
@@ -136,6 +136,19 @@ namespace {
 // 0: Script=bits 9..0
 // 9.. 0 UScriptCode, or index to Script_Extensions
 
+// *Note*: If we need more than the available bits for new properties,
+// then we could move the Age property out of the properties vectors.
+// For example, we could store the Age property in its own trie.
+// In a small, 8-bit-value-width CodePointTrie, it would be larger than
+// the amount of data that we would save in the properties vectors and their trie,
+// but the size increase would be a small percentage of the total uprops.icu size.
+// It would certainly be a much smaller increase than widening the properties vectors.
+// The savings in the properties vectors+trie from pulling out the Age property
+// are partly from mediocre correlation between Age and other property values.
+// (Adding new characters to existing scripts tends to split property vectors where
+// new characters are similar to old ones.)
+// See https://github.com/unicode-org/icu/pull/3025 for details.
+
 inline constexpr uint32_t UPROPS_AGE_MASK = 0xff000000;
 inline constexpr int32_t UPROPS_AGE_SHIFT = 24;
 
diff --git a/icu4c/source/tools/toolutil/toolutil.cpp b/icu4c/source/tools/toolutil/toolutil.cpp
index 7e7bdc78a12..934a114a8d9 100644
--- a/icu4c/source/tools/toolutil/toolutil.cpp
+++ b/icu4c/source/tools/toolutil/toolutil.cpp
@@ -66,9 +66,13 @@
 
 #include "unicode/errorcode.h"
 #include "unicode/putil.h"
+#include "unicode/uchar.h"
+#include "unicode/umutablecptrie.h"
+#include "unicode/ucptrie.h"
 #include "cmemory.h"
 #include "cstring.h"
 #include "toolutil.h"
+#include "uassert.h"
 
 U_NAMESPACE_BEGIN
 
@@ -82,6 +86,80 @@ void IcuToolErrorCode::handleFailure() const {
     exit(errorCode);
 }
 
+namespace toolutil {
+
+void setCPTrieBit(UMutableCPTrie *mutableCPTrie,
+                  UChar32 start, UChar32 end, int32_t shift, bool on, UErrorCode &errorCode) {
+    uint32_t mask = U_MASK(shift);
+    uint32_t value = on ? mask : 0;
+    setCPTrieBits(mutableCPTrie, start, end, mask, value, errorCode);
+}
+
+void setCPTrieBits(UMutableCPTrie *mutableCPTrie,
+                   UChar32 start, UChar32 end, uint32_t mask, uint32_t value,
+                   UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) { return; }
+    // The value must not have any bits set outside of the mask.
+    if ((value & ~mask) != 0) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    if (start == end) {
+        uint32_t oldValue = umutablecptrie_get(mutableCPTrie, start);
+        uint32_t newValue = (oldValue & ~mask) | value;
+        if (newValue != oldValue) {
+            umutablecptrie_set(mutableCPTrie, start, newValue, &errorCode);
+        }
+        return;
+    }
+    while (start <= end && U_SUCCESS(errorCode)) {
+        // Walk the range one run of same-valued code points at a time.
+        uint32_t oldValue;
+        UChar32 rangeEnd = umutablecptrie_getRange(
+            mutableCPTrie, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &oldValue);
+        if (rangeEnd < 0) {
+            // umutablecptrie_getRange() is documented to return -1 if start is not
+            // a valid code point; oldValue is not usable then, so fail explicitly.
+            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+            return;
+        }
+        if (rangeEnd > end) {
+            rangeEnd = end;
+        }
+        uint32_t newValue = (oldValue & ~mask) | value;
+        if (newValue != oldValue) {
+            umutablecptrie_setRange(mutableCPTrie, start, rangeEnd, newValue, &errorCode);
+        }
+        start = rangeEnd + 1;
+    }
+}
+
+int32_t getCPTrieSize(UMutableCPTrie *mt, UCPTrieType type, UCPTrieValueWidth valueWidth) {
+    UErrorCode errorCode = U_ZERO_ERROR;
+    UCPTrie *cpTrie = umutablecptrie_buildImmutable(mt, type, valueWidth, &errorCode);
+    if (U_FAILURE(errorCode)) {
+        fprintf(stderr,
+                "toolutil/getCPTrieSize error: umutablecptrie_buildImmutable() failed: %s\n",
+                u_errorName(errorCode));
+        return -1;
+    }
+    uint8_t block[100000];
+    int32_t size = ucptrie_toBinary(cpTrie, block, sizeof(block), &errorCode);
+    ucptrie_close(cpTrie);
+    // Only the length is wanted, so a buffer overflow is not treated as a failure here.
+    if (U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) {
+        fprintf(stderr,
+                "toolutil/getCPTrieSize error: ucptrie_toBinary() failed: %s (length %ld)\n",
+                u_errorName(errorCode), (long)size);
+        return -1;
+    }
+    U_ASSERT((size & 3) == 0);  // multiple of 4 bytes
+    return size;
+}
+
+}  // toolutil
+
 U_NAMESPACE_END
 
 static int32_t currentYear = -1;
diff --git a/icu4c/source/tools/toolutil/toolutil.h b/icu4c/source/tools/toolutil/toolutil.h
index 98b2155551e..98988f144eb 100644
--- a/icu4c/source/tools/toolutil/toolutil.h
+++ b/icu4c/source/tools/toolutil/toolutil.h
@@ -26,6 +26,7 @@
 #ifdef __cplusplus
 
 #include "unicode/errorcode.h"
+#include "unicode/umutablecptrie.h"
 
 U_NAMESPACE_BEGIN
 
@@ -46,6 +47,37 @@ private:
     const char *location;
 };
 
+namespace toolutil {
+
+/**
+ * Sets one bit in the trie values of the start..end range,
+ * without changing the other bits in the trie values of that range.
+ */
+U_TOOLUTIL_API void
+setCPTrieBit(UMutableCPTrie *mutableCPTrie,
+             UChar32 start, UChar32 end, int32_t shift, bool on, UErrorCode &errorCode);
+
+/**
+ * Sets a bit set (defined by the mask) in the trie values of the start..end range,
+ * without changing the other bits in the trie values of that range.
+ * The given value must not have any bits set outside of the mask.
+ */
+U_TOOLUTIL_API void
+setCPTrieBits(UMutableCPTrie *mutableCPTrie,
+              UChar32 start, UChar32 end, uint32_t mask, uint32_t value, UErrorCode &errorCode);
+
+/**
+ * Builds an immutable UCPTrie of the given type and value width from the mutable trie
+ * and returns the length in bytes that ucptrie_toBinary() reports for it.
+ * Prints a message to stderr and returns -1 if building or serializing fails.
+ * NOTE(review): umutablecptrie_buildImmutable() is documented to leave the mutable trie
+ * empty, so callers that still need mt should pass a clone; confirm against callers.
+ */
+U_TOOLUTIL_API int32_t
+getCPTrieSize(UMutableCPTrie *mt, UCPTrieType type, UCPTrieValueWidth valueWidth);
+
+} // toolutil
+
 U_NAMESPACE_END
 
 #endif
diff --git a/tools/unicode/c/genprops/emojipropsbuilder.cpp b/tools/unicode/c/genprops/emojipropsbuilder.cpp
index 44cb8ff9d3b..a8d0c1a8174 100644
--- a/tools/unicode/c/genprops/emojipropsbuilder.cpp
+++ b/tools/unicode/c/genprops/emojipropsbuilder.cpp
@@ -20,6 +20,7 @@
 #include "cmemory.h"
 #include "emojiprops.h"
 #include "genprops.h"
+#include "toolutil.h"
 #include "uassert.h"
 #include "unewdata.h"
 #include "uparse.h"
@@ -108,8 +109,6 @@ public:
     void parsePropsOfStringsLine(char *fields[][2], UErrorCode &errorCode);
 
 private:
-    void setBit(UChar32 start, UChar32 end, int32_t shift, bool on, UErrorCode &errorCode);
-    void setBits(UChar32 start, UChar32 end, uint32_t value, uint32_t mask, UErrorCode &errorCode);
     void parsePropsOfStringsFile(const char *path, UErrorCode &errorCode);
 
     static int32_t getTrieIndex(int32_t index) {
@@ -231,48 +230,14 @@ EmojiPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
         for (const auto &p2b : propToBinaries) {
             U_ASSERT(p2b.shift < 8);
             if (newValues.contains(p2b.prop)) {
-                setBit(props.start, props.end, p2b.shift, props.binProps[p2b.prop], errorCode);
+                toolutil::setCPTrieBit(mutableCPTrie,
+                                       props.start, props.end, p2b.shift, props.binProps[p2b.prop],
+                                       errorCode);
             }
         }
     }
 }
 
-void
-EmojiPropsBuilder::setBit(UChar32 start, UChar32 end, int32_t shift, bool on,
-                          UErrorCode &errorCode) {
-    uint32_t mask = U_MASK(shift);
-    uint32_t value = on ? mask : 0;
-    setBits(start, end, value, mask, errorCode);
-}
-
-void
-EmojiPropsBuilder::setBits(UChar32 start, UChar32 end, uint32_t value, uint32_t mask,
-                           UErrorCode &errorCode) {
-    if (U_FAILURE(errorCode)) { return; }
-
-    if (start == end) {
-        uint32_t oldValue = umutablecptrie_get(mutableCPTrie, start);
-        uint32_t newValue = (oldValue & ~mask) | value;
-        if (newValue != oldValue) {
-            umutablecptrie_set(mutableCPTrie, start, newValue, &errorCode);
-        }
-        return;
-    }
-    while (start <= end && U_SUCCESS(errorCode)) {
-        uint32_t oldValue;
-        UChar32 rangeEnd = umutablecptrie_getRange(
-            mutableCPTrie, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &oldValue);
-        if (rangeEnd > end) {
-            rangeEnd = end;
-        }
-        uint32_t newValue = (oldValue & ~mask) | value;
-        if (newValue != oldValue) {
-            umutablecptrie_setRange(mutableCPTrie, start, rangeEnd, newValue, &errorCode);
-        }
-        start = rangeEnd + 1;
-    }
-}
-
 namespace {
 
 void U_CALLCONV
@@ -347,7 +312,8 @@ void EmojiPropsBuilder::parsePropsOfStringsLine(char *fields[][2], UErrorCode &e
         }
         uint32_t start, end;
         u_parseCodePointRange(rangeOrString, &start, &end, &errorCode);
-        setBit(start, end, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
+        toolutil::setCPTrieBit(mutableCPTrie,
+                               start, end, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
     } else {
         // Code point or string:
         // 23F0 ; Basic_Emoji ; alarm clock
@@ -371,7 +337,8 @@ void EmojiPropsBuilder::parsePropsOfStringsLine(char *fields[][2], UErrorCode &e
                 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                 return;
             }
-            setBit(first, first, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
+            toolutil::setCPTrieBit(mutableCPTrie,
+                                   first, first, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
         } else {
             // more than one code point
             UnicodeString us(false, s, length);