mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 05:25:34 +00:00
ICU-22785 move cptrie bit setter to toolutil; add getCPTrieSize()
This commit is contained in:
parent
c439dcdf27
commit
47e9389b8e
4 changed files with 116 additions and 41 deletions
|
@ -136,6 +136,19 @@ namespace {
|
|||
// 0: Script=bits 9..0
|
||||
// 9.. 0 UScriptCode, or index to Script_Extensions
|
||||
|
||||
// *Note*: If we need more than the available bits for new properties,
|
||||
// then we could move the Age property out of the properties vectors.
|
||||
// For example, we could store the Age property in its own trie.
|
||||
// In a small, 8-bit-value-width CodePointTrie, it would be larger than
|
||||
// the amount of data that we would save in the properties vectors and their trie,
|
||||
// but the size increase would be a small percentage of the total uprops.icu size.
|
||||
// It would certainly be a much smaller increase than widening the properties vectors.
|
||||
// The savings in the properties vectors+trie from pulling out the Age property
|
||||
// are partly from mediocre correlation between Age and other property values.
|
||||
// (Adding new characters to existing scripts tends to split property vectors where
|
||||
// new characters are similar to old ones.)
|
||||
// See https://github.com/unicode-org/icu/pull/3025 for details.
|
||||
|
||||
inline constexpr uint32_t UPROPS_AGE_MASK = 0xff000000;
|
||||
inline constexpr int32_t UPROPS_AGE_SHIFT = 24;
|
||||
|
||||
|
|
|
@ -66,9 +66,13 @@
|
|||
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/umutablecptrie.h"
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "toolutil.h"
|
||||
#include "uassert.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -82,6 +86,72 @@ void IcuToolErrorCode::handleFailure() const {
|
|||
exit(errorCode);
|
||||
}
|
||||
|
||||
namespace toolutil {
|
||||
|
||||
/**
 * Sets or clears one bit in the trie values of the start..end range,
 * without changing the other bits in the trie values of that range.
 *
 * @param mutableCPTrie the trie to modify
 * @param start first code point of the range
 * @param end last code point of the range (inclusive)
 * @param shift index of the bit to modify; must be in 0..31
 * @param on true to set the bit, false to clear it
 * @param errorCode in/out error code. Nothing is done if it already indicates a failure.
 *        Set to U_ILLEGAL_ARGUMENT_ERROR if shift is outside 0..31.
 */
void setCPTrieBit(UMutableCPTrie *mutableCPTrie,
                  UChar32 start, UChar32 end, int32_t shift, bool on, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return; }
    // The mask is a 32-bit value; shifting by a count outside 0..31 would be
    // undefined behavior, so reject such a shift instead of computing a mask from it.
    if (shift < 0 || shift > 31) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    const uint32_t mask = U_MASK(shift);
    const uint32_t value = on ? mask : 0;
    setCPTrieBits(mutableCPTrie, start, end, mask, value, errorCode);
}
|
||||
|
||||
/**
 * Replaces the bits selected by the mask in the trie values of the start..end range,
 * without changing the other bits in the trie values of that range.
 *
 * @param mutableCPTrie the trie to modify
 * @param start first code point of the range
 * @param end last code point of the range (inclusive); nothing is modified if end < start
 * @param mask the bits of each trie value that are replaced
 * @param value the new contents of the masked bits; must not have bits set outside of the mask
 * @param errorCode in/out error code. Nothing is done if it already indicates a failure.
 *        Set to U_ILLEGAL_ARGUMENT_ERROR if value has bits outside of the mask,
 *        or if the range walk reaches a code point for which the trie reports no range.
 */
void setCPTrieBits(UMutableCPTrie *mutableCPTrie,
                   UChar32 start, UChar32 end, uint32_t mask, uint32_t value,
                   UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return; }
    // The value must not have any bits set outside of the mask.
    if ((value & ~mask) != 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if (start == end) {
        // Single code point: read-modify-write without a range lookup.
        uint32_t oldValue = umutablecptrie_get(mutableCPTrie, start);
        uint32_t newValue = (oldValue & ~mask) | value;
        if (newValue != oldValue) {
            umutablecptrie_set(mutableCPTrie, start, newValue, &errorCode);
        }
        return;
    }
    // Walk the existing same-value sub-ranges so that each one keeps its own
    // unmasked bits while receiving the new masked bits.
    while (start <= end && U_SUCCESS(errorCode)) {
        uint32_t oldValue = 0;
        UChar32 rangeEnd = umutablecptrie_getRange(
            mutableCPTrie, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &oldValue);
        if (rangeEnd < 0) {
            // No range was returned for start (negative sentinel), so oldValue was not
            // filled in. Continuing would compute a new value from garbage and
            // restart the walk at rangeEnd + 1 == 0.
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        if (rangeEnd > end) {
            rangeEnd = end;
        }
        uint32_t newValue = (oldValue & ~mask) | value;
        if (newValue != oldValue) {
            umutablecptrie_setRange(mutableCPTrie, start, rangeEnd, newValue, &errorCode);
        }
        start = rangeEnd + 1;
    }
}
|
||||
|
||||
/**
 * Builds an immutable trie of the given type and value width from the mutable trie
 * and returns the number of bytes that its serialized form needs.
 *
 * NOTE(review): umutablecptrie_buildImmutable() is documented as leaving the mutable
 * trie empty; callers that still need mt presumably must pass a clone -- confirm.
 *
 * @param mt the mutable trie to build from
 * @param type the trie type passed to umutablecptrie_buildImmutable()
 * @param valueWidth the value width passed to umutablecptrie_buildImmutable()
 * @return the serialized size in bytes (asserted to be a multiple of 4),
 *         or -1 if building or measuring failed (a message is printed to stderr)
 */
int32_t getCPTrieSize(UMutableCPTrie *mt, UCPTrieType type, UCPTrieValueWidth valueWidth) {
    UErrorCode errorCode = U_ZERO_ERROR;
    UCPTrie *cpTrie = umutablecptrie_buildImmutable(mt, type, valueWidth, &errorCode);
    if (U_FAILURE(errorCode)) {
        fprintf(stderr,
                "toolutil/getCPTrieSize error: umutablecptrie_buildImmutable() failed: %s\n",
                u_errorName(errorCode));
        return -1;
    }
    // Only the size is needed, so preflight with an empty buffer rather than serializing
    // into a large stack array (which is also not guaranteed to have the 32-bit alignment
    // that ucptrie_toBinary() requires of a non-empty buffer).
    // The expected outcome is U_BUFFER_OVERFLOW_ERROR together with the required length.
    int32_t size = ucptrie_toBinary(cpTrie, nullptr, 0, &errorCode);
    ucptrie_close(cpTrie);
    if (U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) {
        fprintf(stderr,
                "toolutil/getCPTrieSize error: ucptrie_toBinary() failed: %s (length %ld)\n",
                u_errorName(errorCode), (long)size);
        return -1;
    }
    U_ASSERT((size & 3) == 0);  // multiple of 4 bytes
    return size;
}
|
||||
|
||||
} // toolutil
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
static int32_t currentYear = -1;
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#ifdef __cplusplus
|
||||
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/umutablecptrie.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -46,6 +47,30 @@ private:
|
|||
const char *location;
|
||||
};
|
||||
|
||||
namespace toolutil {
|
||||
|
||||
/**
 * Sets or clears one bit in the trie values of the start..end range,
 * without changing the other bits in the trie values of that range.
 *
 * @param mutableCPTrie the trie to modify
 * @param start first code point of the range
 * @param end last code point of the range (inclusive)
 * @param shift index of the bit to modify; the bit mask is U_MASK(shift)
 * @param on true to set the bit, false to clear it
 * @param errorCode in/out error code; nothing is done if it already indicates a failure
 */
|
||||
U_TOOLUTIL_API void
|
||||
setCPTrieBit(UMutableCPTrie *mutableCPTrie,
|
||||
UChar32 start, UChar32 end, int32_t shift, bool on, UErrorCode &errorCode);
|
||||
|
||||
/**
 * Replaces a set of bits (defined by the mask) in the trie values of the start..end range,
 * without changing the other bits in the trie values of that range.
 * The given value must not have any bits set outside of the mask.
 *
 * @param mutableCPTrie the trie to modify
 * @param start first code point of the range
 * @param end last code point of the range (inclusive)
 * @param mask the bits of each trie value that are replaced
 * @param value the new contents of the masked bits
 * @param errorCode in/out error code; nothing is done if it already indicates a failure.
 *        Set to U_ILLEGAL_ARGUMENT_ERROR if value has bits set outside of the mask.
 */
|
||||
U_TOOLUTIL_API void
|
||||
setCPTrieBits(UMutableCPTrie *mutableCPTrie,
|
||||
UChar32 start, UChar32 end, uint32_t mask, uint32_t value, UErrorCode &errorCode);
|
||||
|
||||
/**
 * Builds an immutable trie of the given type and value width from the mutable trie,
 * and returns the length in bytes that ucptrie_toBinary() reports for it.
 * On failure, prints an error message to stderr and returns -1.
 *
 * NOTE(review): building an immutable trie presumably leaves mt empty
 * (see the umutablecptrie_buildImmutable() API docs) -- confirm before reusing mt.
 *
 * @param mt the mutable trie to build from
 * @param type the type of the immutable trie to build
 * @param valueWidth the value width of the immutable trie to build
 * @return the serialized size in bytes, or -1 on failure
 */
U_TOOLUTIL_API int32_t
|
||||
getCPTrieSize(UMutableCPTrie *mt, UCPTrieType type, UCPTrieValueWidth valueWidth);
|
||||
|
||||
} // toolutil
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include "cmemory.h"
|
||||
#include "emojiprops.h"
|
||||
#include "genprops.h"
|
||||
#include "toolutil.h"
|
||||
#include "uassert.h"
|
||||
#include "unewdata.h"
|
||||
#include "uparse.h"
|
||||
|
@ -108,8 +109,6 @@ public:
|
|||
void parsePropsOfStringsLine(char *fields[][2], UErrorCode &errorCode);
|
||||
|
||||
private:
|
||||
void setBit(UChar32 start, UChar32 end, int32_t shift, bool on, UErrorCode &errorCode);
|
||||
void setBits(UChar32 start, UChar32 end, uint32_t value, uint32_t mask, UErrorCode &errorCode);
|
||||
void parsePropsOfStringsFile(const char *path, UErrorCode &errorCode);
|
||||
|
||||
static int32_t getTrieIndex(int32_t index) {
|
||||
|
@ -231,48 +230,14 @@ EmojiPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|||
for (const auto &p2b : propToBinaries) {
|
||||
U_ASSERT(p2b.shift < 8);
|
||||
if (newValues.contains(p2b.prop)) {
|
||||
setBit(props.start, props.end, p2b.shift, props.binProps[p2b.prop], errorCode);
|
||||
toolutil::setCPTrieBit(mutableCPTrie,
|
||||
props.start, props.end, p2b.shift, props.binProps[p2b.prop],
|
||||
errorCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
EmojiPropsBuilder::setBit(UChar32 start, UChar32 end, int32_t shift, bool on,
|
||||
UErrorCode &errorCode) {
|
||||
uint32_t mask = U_MASK(shift);
|
||||
uint32_t value = on ? mask : 0;
|
||||
setBits(start, end, value, mask, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
EmojiPropsBuilder::setBits(UChar32 start, UChar32 end, uint32_t value, uint32_t mask,
|
||||
UErrorCode &errorCode) {
|
||||
if (U_FAILURE(errorCode)) { return; }
|
||||
|
||||
if (start == end) {
|
||||
uint32_t oldValue = umutablecptrie_get(mutableCPTrie, start);
|
||||
uint32_t newValue = (oldValue & ~mask) | value;
|
||||
if (newValue != oldValue) {
|
||||
umutablecptrie_set(mutableCPTrie, start, newValue, &errorCode);
|
||||
}
|
||||
return;
|
||||
}
|
||||
while (start <= end && U_SUCCESS(errorCode)) {
|
||||
uint32_t oldValue;
|
||||
UChar32 rangeEnd = umutablecptrie_getRange(
|
||||
mutableCPTrie, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &oldValue);
|
||||
if (rangeEnd > end) {
|
||||
rangeEnd = end;
|
||||
}
|
||||
uint32_t newValue = (oldValue & ~mask) | value;
|
||||
if (newValue != oldValue) {
|
||||
umutablecptrie_setRange(mutableCPTrie, start, rangeEnd, newValue, &errorCode);
|
||||
}
|
||||
start = rangeEnd + 1;
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
void U_CALLCONV
|
||||
|
@ -347,7 +312,8 @@ void EmojiPropsBuilder::parsePropsOfStringsLine(char *fields[][2], UErrorCode &e
|
|||
}
|
||||
uint32_t start, end;
|
||||
u_parseCodePointRange(rangeOrString, &start, &end, &errorCode);
|
||||
setBit(start, end, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
|
||||
toolutil::setCPTrieBit(mutableCPTrie,
|
||||
start, end, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
|
||||
} else {
|
||||
// Code point or string:
|
||||
// 23F0 ; Basic_Emoji ; alarm clock
|
||||
|
@ -371,7 +337,8 @@ void EmojiPropsBuilder::parsePropsOfStringsLine(char *fields[][2], UErrorCode &e
|
|||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
setBit(first, first, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
|
||||
toolutil::setCPTrieBit(mutableCPTrie,
|
||||
first, first, EmojiProps::BIT_BASIC_EMOJI, true, errorCode);
|
||||
} else {
|
||||
// more than one code point
|
||||
UnicodeString us(false, s, length);
|
||||
|
|
Loading…
Add table
Reference in a new issue