ICU-22785 move Block bits from propsvec0 to new trie

This commit is contained in:
Markus Scherer 2024-05-31 16:12:16 -07:00
parent 81492ae9a2
commit 0d8a3ccd11
14 changed files with 2876 additions and 2626 deletions

View file

@ -186,6 +186,9 @@ void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
case UPROPS_SRC_ID_COMPAT_MATH:
uprops_addPropertyStarts(src, &sa, &errorCode);
break;
case UPROPS_SRC_BLOCK:
ublock_addPropertyStarts(&sa, errorCode);
break;
default:
errorCode = U_INTERNAL_PROGRAM_ERROR;
break;

View file

@ -23,6 +23,7 @@
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ucptrie.h"
#include "unicode/uscript.h"
#include "unicode/udata.h"
#include "uassert.h"
@ -515,6 +516,8 @@ uprv_getMaxValues(int32_t column) {
return indexes[UPROPS_MAX_VALUES_INDEX];
case 2:
return indexes[UPROPS_MAX_VALUES_2_INDEX];
case UPROPS_MAX_VALUES_OTHER_INDEX:
return indexes[column];
default:
return 0;
}
@ -618,7 +621,15 @@ uscript_getScriptExtensions(UChar32 c,
/**
 * Returns the Unicode Block value for a code point.
 *
 * @param c the code point to look up
 * @return the UBlockCode read for c
 */
U_CAPI UBlockCode U_EXPORT2
ublock_getCode(UChar32 c) {
// NOTE(review): this statement is the earlier lookup that extracted the Block bits
// from properties vector word 0. It returns before the trie lookup below can run;
// confirm that it is meant to be dropped in favor of the trie-based code.
return (UBlockCode)((u_getUnicodeProperties(c, 0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT);
// We store Block values indexed by the code point shifted right 4 bits
// and use a "small" UCPTrie=CodePointTrie for minimal data size.
// This works because blocks have xxx0..xxxF ranges.
uint32_t c4 = c; // unsigned so that shifting right does not worry the compiler
// Shift unless out of range, in which case we fetch the trie's error value.
// (A negative c becomes a large unsigned value and is therefore left unshifted.)
if (c4 <= 0x10ffff) {
c4 >>= 4;
}
return (UBlockCode)ucptrie_get(&block_trie, c4);
}
/* property starts for UnicodeSet ------------------------------------------- */
@ -706,3 +717,18 @@ upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
/* add the start code point of each same-value range of the properties vectors trie */
utrie2_enum(&propsVectorsTrie, nullptr, _enumPropertyStartsRange, sa);
}
/**
 * Adds the first code point of every same-value range of the Block trie to the set.
 *
 * The trie is indexed by (code point >> 4), see ublock_getCode(), so each
 * range start found in the trie is shifted left by 4 bits before it is added.
 *
 * @param sa adder whose add() callback receives each range start
 */
U_CFUNC void U_EXPORT2
ublock_addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) {
    // One past the last trie index: (max code point + 1) >> 4
    constexpr UChar32 kTrieIndexLimit = 0x11000;
    uint32_t rangeValue;
    for (UChar32 rangeStart = 0; rangeStart < kTrieIndexLimit;) {
        const UChar32 rangeEnd =
            ucptrie_getRange(&block_trie, rangeStart, UCPMAP_RANGE_NORMAL, 0,
                             nullptr, nullptr, &rangeValue);
        if (rangeEnd < 0) {
            // No further range in the trie.
            break;
        }
        sa->add(sa->set, rangeStart << 4);
        rangeStart = rangeEnd + 1;
    }
}

File diff suppressed because it is too large Load diff

View file

@ -544,6 +544,14 @@ static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) {
return ubidi_getMaxValue(which);
}
// IntProperty getter for the Block property: forwards to ublock_getCode()
// and widens the enum result to the int32_t that the property table expects.
static int32_t getBlock(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    const UBlockCode block = ublock_getCode(c);
    return static_cast<int32_t>(block);
}
// IntProperty max-value getter for the Block property: reads the
// "other" max-values word and keeps only the bits masked by UPROPS_MAX_BLOCK.
static int32_t blockGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) {
    const auto otherMaxValues = uprv_getMaxValues(UPROPS_MAX_VALUES_OTHER_INDEX);
    return otherMaxValues & UPROPS_MAX_BLOCK;
}
#if UCONFIG_NO_NORMALIZATION
static int32_t getCombiningClass(const IntProperty &, UChar32, UProperty) {
return 0;
@ -683,7 +691,7 @@ static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
* For them, column is the UPropertySource value.
*/
{ UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue },
{ 0, UPROPS_BLOCK_MASK, UPROPS_BLOCK_SHIFT, defaultGetValue, defaultGetMaxValue },
{ UPROPS_SRC_BLOCK, 0, 0, getBlock, blockGetMaxValue },
{ UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift },
{ 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue },
{ 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue },

View file

@ -39,16 +39,18 @@ enum {
UPROPS_SCRIPT_EXTENSIONS_INDEX,
UPROPS_RESERVED_INDEX_7,
UPROPS_BLOCK_TRIE_INDEX,
UPROPS_RESERVED_INDEX_8,
/* size of the data file (number of 32-bit units after the header) */
/** size of the data file (number of 32-bit units after the header) */
UPROPS_DATA_TOP_INDEX,
/* maximum values for code values in vector word 0 */
/** maximum values for code values in vector word 0 */
UPROPS_MAX_VALUES_INDEX=10,
/* maximum values for code values in vector word 2 */
/** maximum values for code values in vector word 2 */
UPROPS_MAX_VALUES_2_INDEX,
/** maximum values for other code values */
UPROPS_MAX_VALUES_OTHER_INDEX,
UPROPS_INDEX_COUNT=16
};
@ -117,6 +119,7 @@ enum {
/* number of properties vector words */
#define UPROPS_VECTOR_WORDS 3
// TODO: merge scx+Script bit sets together
/*
* Properties in vector word 0
* Bits
@ -129,7 +132,7 @@ enum {
* 0: Script=bits 21..20 & 7..0
* 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions
* 19..17 East Asian Width
* 16.. 8 UBlockCode
* 16.. 8 reserved since format version 9; was UBlockCode
* 7.. 0 UScriptCode, or index to Script_Extensions
*/
@ -150,8 +153,8 @@ enum {
#define UPROPS_EA_MASK 0x000e0000
#define UPROPS_EA_SHIFT 17
#define UPROPS_BLOCK_MASK 0x0001ff00
#define UPROPS_BLOCK_SHIFT 8
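// Retired in format version 9 (Block moved out of vector word 0): UPROPS_BLOCK_MASK 0x0001ff00
// Retired in format version 9 (Block moved out of vector word 0): UPROPS_BLOCK_SHIFT 8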
#define UPROPS_SCRIPT_LOW_MASK 0x000000ff
@ -319,6 +322,17 @@ inline constexpr uint8_t uprops_idTypeToEncoded[] = {
#define UPROPS_DT_MASK 0x0000001f
// C++ only: the constant below uses C++ syntax, so it is hidden from C includers.
#ifdef __cplusplus
namespace {
// Mask for the maximum Block value:
// Bits 9..0 in UPROPS_MAX_VALUES_OTHER_INDEX
inline constexpr uint32_t UPROPS_MAX_BLOCK = 0x3ff;
} // namespace
#endif // __cplusplus
/**
* Gets the main properties value for a code point.
* Implemented in uchar.c for uprops.cpp.
@ -392,6 +406,8 @@ enum {
ZWNBSP =0xfeff
};
// TODO: Move these two functions into a different header file (new unames.h?) so that uprops.h
// need not be C-compatible any more.
/**
* Get the maximum length of a (regular/1.0/extended) character name.
* @return 0 if no character names available.
@ -445,6 +461,7 @@ enum UPropertySource {
UPROPS_SRC_EMOJI,
UPROPS_SRC_IDSU,
UPROPS_SRC_ID_COMPAT_MATH,
UPROPS_SRC_BLOCK,
/** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
UPROPS_SRC_COUNT
};
@ -476,6 +493,13 @@ upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
U_CFUNC void U_EXPORT2
uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode);
#ifdef __cplusplus
U_CFUNC void U_EXPORT2
ublock_addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode);
#endif // __cplusplus
/**
* Return a set of characters for property enumeration.
* For each two consecutive characters (start, limit) in the set,
@ -488,6 +512,8 @@ uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *p
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
*/
// TODO: Move this into a different header file (udataswp.h? new unames.h?) so that uprops.h
// need not be C-compatible any more.
/**
* Swap the ICU Unicode character names file. See uchar.c.
* @internal

Binary file not shown.

View file

@ -72,7 +72,7 @@ export CLDR_SRC=~/cldr/uni/src
export ICU_ROOT=~/icu/uni
export ICU_SRC=$ICU_ROOT/src
export ICU_OUT=$ICU_ROOT/dbg
export ICUDT=icudt75b
export ICUDT=icudt76b
export ICU4C_DATA_IN=$ICU_SRC/icu4c/source/data/in
export ICU4C_UNIDATA=$ICU_SRC/icu4c/source/data/unidata
export LD_LIBRARY_PATH=$ICU_OUT/icu4c/lib
@ -86,7 +86,7 @@ export CLDR_SRC=~/oss/cldr/mine/src
export ICU_ROOT=~/oss/icu
export ICU_SRC=$ICU_ROOT
export ICU_OUT=$ICU_ROOT
export ICUDT=icudt75b
export ICUDT=icudt76b
export ICU4C_DATA_IN=$ICU_SRC/icu4c/source/data/in
export ICU4C_UNIDATA=$ICU_SRC/icu4c/source/data/unidata
export LD_LIBRARY_PATH=$ICU_OUT/icu4c/lib
@ -326,14 +326,14 @@ TODO
output:
...
make[1]: Entering directory '/usr/local/google/home/mscherer/icu/uni/dbg/icu4c/data'
mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt75b
mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt75b
LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt75l.dat ./out/icu4j/icudt75b.dat -s ./out/build/icudt75l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt75b
mv ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt75b"
jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt75b/
mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt76b
mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt76b
LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt76l.dat ./out/icu4j/icudt76b.dat -s ./out/build/icudt76l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt76b
mv ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt76b"
jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt76b/
mkdir -p /tmp/icu4j/main/shared/data
cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data
jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt75b/
jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt76b/
mkdir -p /tmp/icu4j/main/shared/data
cp ./out/icu4j/icutzdata.jar /tmp/icu4j/main/shared/data
make[1]: Leaving directory '/usr/local/google/home/mscherer/icu/uni/dbg/icu4c/data'
@ -343,8 +343,8 @@ TODO
cp -v com/ibm/icu/impl/data/$ICUDT/brkitr/* $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT/brkitr
cp -v com/ibm/icu/impl/data/$ICUDT/confusables.cfu $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT
cp -v com/ibm/icu/impl/data/$ICUDT/*.nrm $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT
cd com/ibm/icu/impl/data/$ICUDT/
ls *.icu | egrep -v "cnvalias.icu" | awk '{print "cp " $0 " $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT";}' | sh
cd com/ibm/icu/impl/data/icudata/
ls *.icu | egrep -v "cnvalias.icu" | awk '{print "cp " $0 " $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata";}' | sh
- The procedure above is very conservative:
It refreshes only the parts of the ICU4J data that we think are affected by a Unicode data update.
It avoids dealing with any other discrepancies

View file

@ -46,6 +46,7 @@
#include "ucol_imp.h"
#include "ucol_swp.h"
#include "ucnv_bld.h"
#include "udataswp.h"
#include "sprpimpl.h"
#include "rbbidata.h"

View file

@ -440,7 +440,7 @@ group: script_runs
group: uchar
uchar.o
deps
utrie2
ucptrie utrie2
group: messagepattern # for MessageFormat and tools
messagepattern.o

View file

@ -296,13 +296,21 @@ uprops_swap(const UDataSwapper *ds,
// SCX const uint16_t scriptExtensions[2*(i7-i6)];
ds->swapArray16(ds,
inData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX],
4*(dataIndexes[UPROPS_RESERVED_INDEX_7]-dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]),
4*(dataIndexes[UPROPS_BLOCK_TRIE_INDEX]-dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]),
outData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX],
pErrorCode);
// Swap the Block UCPTrie=CodePointTrie.
int32_t partOffset = dataIndexes[UPROPS_BLOCK_TRIE_INDEX];
int32_t nextOffset = dataIndexes[UPROPS_RESERVED_INDEX_8];
int32_t partLength = 4 * (nextOffset - partOffset);
if (partLength >= 0) {
utrie_swapAnyVersion(ds, inData32 + partOffset, partLength,
outData32 + partOffset, pErrorCode);
}
}
/* i8 reservedIndex8; -- 32-bit unit index to the top of the Block trie data */
return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX_7];
return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX_8];
}
/* Unicode case mapping data swapping --------------------------------------- */

View file

@ -82,6 +82,9 @@ public final class CharacterPropertiesImpl {
case UCharacterProperty.SRC_ID_COMPAT_MATH:
UCharacterProperty.mathCompat_addPropertyStarts(incl);
break;
case UCharacterProperty.SRC_BLOCK:
UCharacterProperty.INSTANCE.ublock_addPropertyStarts(incl);
break;
default:
throw new IllegalStateException("getInclusions(unknown src " + src + ")");
}

View file

@ -114,8 +114,9 @@ public final class UCharacterProperty
public static final int SRC_EMOJI=15;
public static final int SRC_IDSU=16;
public static final int SRC_ID_COMPAT_MATH=17;
public static final int SRC_BLOCK=18;
/** One more than the highest UPropertySource (SRC_) constant. */
public static final int SRC_COUNT=18;
public static final int SRC_COUNT=19;
private static final class LayoutProps {
private static final class IsAcceptable implements ICUBinary.Authenticate {
@ -736,7 +737,24 @@ public final class UCharacterProperty
return UBiDiProps.INSTANCE.getClass(c);
}
},
new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_),
new IntProperty(SRC_BLOCK) { // BLOCK
    @Override
    int getValue(int c) {
        // Block values live in a "small" CodePointTrie that is indexed by
        // (code point >> 4); this is possible because blocks have
        // xxx0..xxxF ranges, and it keeps the data small.
        // A value above 0x10ffff is passed through unshifted so that
        // the trie returns its error value.
        final int trieIndex = (c <= 0x10ffff) ? (c >> 4) : c;
        return m_blockTrie_.get(trieIndex);
    }
    @Override
    int getMaxValue(int which) {
        return MAX_BLOCK & m_maxValuesOther_;
    }
},
new CombiningClassIntProperty(SRC_NFC) { // CANONICAL_COMBINING_CLASS
@Override
int getValue(int c) {
@ -1273,13 +1291,17 @@ public final class UCharacterProperty
* Maximum values for script, bits used as in vector word
* 0
*/
int m_maxJTGValue_;
int m_maxJTGValue_;
/** maximum values for other code values */
int m_maxValuesOther_;
/**
* Script_Extensions data
*/
public char[] m_scriptExtensions_;
CodePointTrie m_blockTrie_;
// private variables -------------------------------------------------
/**
@ -1346,7 +1368,8 @@ public final class UCharacterProperty
/*
* Properties in vector word 0
* Bits
* 31..24 DerivedAge version major/minor one nibble each
* 31..26 Age major version (0..63)
* 25..24 Age minor version (0..3)
* 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index
* 3: Script value from Script_Extensions
* 2: Script=Inherited
@ -1354,7 +1377,7 @@ public final class UCharacterProperty
* 0: Script=bits 21..20 & 7..0
* 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions
* 19..17 East Asian Width
* 16.. 8 UBlockCode
* 16.. 8 reserved since format version 9; was UBlockCode
* 7.. 0 UScriptCode, or index to Script_Extensions
*/
@ -1381,16 +1404,6 @@ public final class UCharacterProperty
* Equivalent to icu4c UPROPS_EA_SHIFT
*/
private static final int EAST_ASIAN_SHIFT_ = 17;
/**
* Integer properties mask and shift values for blocks.
* Equivalent to icu4c UPROPS_BLOCK_MASK
*/
private static final int BLOCK_MASK_ = 0x0001ff00;
/**
* Integer properties mask and shift values for blocks.
* Equivalent to icu4c UPROPS_BLOCK_SHIFT
*/
private static final int BLOCK_SHIFT_ = 8;
/**
* Integer properties mask and shift values for scripts.
* Equivalent to icu4c UPROPS_SHIFT_LOW_MASK.
@ -1549,6 +1562,8 @@ public final class UCharacterProperty
*/
private static final int AGE_SHIFT_ = 24;
// Bits 9..0 in UPROPS_MAX_VALUES_OTHER_INDEX
private static final int MAX_BLOCK = 0x3ff;
// private constructors --------------------------------------------------
@ -1577,12 +1592,13 @@ public final class UCharacterProperty
int additionalVectorsOffset = bytes.getInt();
m_additionalColumnsCount_ = bytes.getInt();
int scriptExtensionsOffset = bytes.getInt();
int reservedOffset7 = bytes.getInt();
/* reservedOffset8 = */ bytes.getInt();
int blockTrieOffset = bytes.getInt();
int reservedOffset8 = bytes.getInt();
/* dataTopOffset = */ bytes.getInt();
m_maxBlockScriptValue_ = bytes.getInt();
m_maxJTGValue_ = bytes.getInt();
ICUBinary.skipBytes(bytes, (16 - 12) << 2);
m_maxValuesOther_ = bytes.getInt();
ICUBinary.skipBytes(bytes, (16 - 13) << 2);
// read the main properties trie
m_trie_ = Trie2_16.createFromSerialized(bytes);
@ -1614,10 +1630,20 @@ public final class UCharacterProperty
}
// Script_Extensions
int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
int numChars = (blockTrieOffset - scriptExtensionsOffset) * 2;
if(numChars > 0) {
m_scriptExtensions_ = ICUBinary.getChars(bytes, numChars, 0);
}
// Read the blockTrie.
int partLength = (reservedOffset8 - blockTrieOffset) * 4;
int triePosition = bytes.position();
m_blockTrie_ = CodePointTrie.fromBinary(null, CodePointTrie.ValueWidth.BITS_16, bytes);
trieLength = bytes.position() - triePosition;
if (trieLength > partLength) {
throw new ICUUncheckedIOException("uprops.icu: not enough bytes for blockTrie");
}
ICUBinary.skipBytes(bytes, partLength - trieLength); // skip padding after trie bytes
}
private static final class IsAcceptable implements ICUBinary.Authenticate {
@ -1794,6 +1820,19 @@ public final class UCharacterProperty
}
}
/**
 * Adds the first code point of every same-value range of the Block trie to the set.
 * The trie is indexed by (code point >> 4), see ublock_getCode(),
 * so each trie range start is shifted left by 4 bits before it is added.
 *
 * @param set the set that receives the range start code points
 */
public void ublock_addPropertyStarts(UnicodeSet set) {
    // One past the last trie index: (max code point + 1) >> 4
    final int trieIndexLimit = 0x11000;
    CodePointMap.Range blockRange = new CodePointMap.Range();
    for (int rangeStart = 0; rangeStart < trieIndexLimit; rangeStart = blockRange.getEnd() + 1) {
        if (!m_blockTrie_.getRange(rangeStart, null, blockRange)) {
            // No further range in the trie.
            break;
        }
        set.add(rangeStart << 4);
    }
}
public boolean hasIDType(int c, int typeIndex) {
if (typeIndex < 0 || typeIndex >= idTypeToEncoded.length) {
return false;

View file

@ -22,7 +22,9 @@
#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ucptrie.h"
#include "unicode/udata.h"
#include "unicode/umutablecptrie.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/usetiter.h"
@ -81,13 +83,14 @@ Formally, the file contains the following structures:
i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
i6 scriptExtensionsIndex; -- 32-bit unit index to the Script_Extensions data
i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data
i8 reservedIndex8; -- for now: i7, i8 and i9 have the same values
i7 blockTrieIndex; -- 32-bit unit index to the Block property trie (format version 9+)
i8 reservedIndex8; -- top of the previous part of the data; i8 and i9 have the same values
i9 dataTopIndex; -- size of the data file (number of 32-bit units after the header)
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
i12..i15 reservedIndexes; -- reserved values; 0 for now
i12 maxValuesOther; -- additional maximum values, see uprops.h (format version 9+)
i13..i15 reservedIndexes; -- reserved values; 0 for now
PT serialized properties trie, see utrie2.h (byte size: 4*(i0-16))
@ -114,6 +117,10 @@ Formally, the file contains the following structures:
vs. another value (and the index is to a pair).
(See UPROPS_SCRIPT_X_WITH_COMMON etc. in uprops.h.)
blockTrie serialized CodePointTrie/UCPTrie for the Block property (format version 9+)
Indexed by (code point >> 4). Takes advantage of each Block having xxx0..xxxF boundaries.
Trie lookup and properties:
In order to condense the data for the 21-bit code space, several properties of
@ -299,6 +306,10 @@ ICU 75 uses the vector word 2 bits 31..26 for encoded Identifier_Type bit sets.
Age major:minor version bit fields changed from 4:4 to 6:2 so that age=16.0 fits.
Block data moved from props vector 0 into its own new CodePointTrie.
Reserve 10 bits in the new indexes[UPROPS_MAX_VALUES_OTHER_INDEX] for the max Block value,
although the trie can hold 16-bit values.
----------------------------------------------------------------------------- */
U_NAMESPACE_USE
@ -343,7 +354,11 @@ private:
UTrie2 *pTrie;
UTrie2 *props2Trie;
UPropsVectors *pv;
UMutableCPTrie *mutableBlockTrie = nullptr;
UCPTrie *blockTrie = nullptr;
UnicodeString scriptExtensions;
uint8_t blockTrieBytes[100000];
int32_t blockTrieSize = 0;
};
CorePropsBuilder::CorePropsBuilder(UErrorCode &errorCode)
@ -358,12 +373,19 @@ CorePropsBuilder::CorePropsBuilder(UErrorCode &errorCode)
fprintf(stderr, "genprops error: corepropsbuilder upvec_open() failed - %s\n",
u_errorName(errorCode));
}
mutableBlockTrie = umutablecptrie_open(0, 0, &errorCode);
if (U_FAILURE(errorCode)) {
fprintf(stderr, "genprops/Block error: umutablecptrie_open() failed: %s\n",
u_errorName(errorCode));
}
}
// Releases the tries and the properties vectors held by this builder.
CorePropsBuilder::~CorePropsBuilder() {
utrie2_close(pTrie);
utrie2_close(props2Trie);
upvec_close(pv);
// Mutable Block trie opened in the constructor.
umutablecptrie_close(mutableBlockTrie);
// Immutable Block trie; the member is initialized to nullptr and is only set by build().
ucptrie_close(blockTrie);
}
void
@ -693,7 +715,6 @@ struct PropToEnum {
const PropToEnum
propToEnums[]={
{ UCHAR_BLOCK, 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK },
{ UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
{ UCHAR_DECOMPOSITION_TYPE, 2, 0, UPROPS_DT_MASK },
{ UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
@ -755,6 +776,16 @@ CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK,
&errorCode);
}
if (newValues.contains(UCHAR_BLOCK)) {
uint32_t value = props.getIntProp(UCHAR_BLOCK);
if ((start & 0xf) != 0 || (end & 0xf) != 0xf || value > UPROPS_MAX_BLOCK) {
fprintf(stderr, "genprops error: %04lX..%04lX Block 0x%x cannot be encoded\n",
(long)start, (long)end, (int)value);
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
umutablecptrie_setRange(mutableBlockTrie, start >> 4, end >> 4, value, &errorCode);
}
// Set the script value if the Script_Extensions revert to {Script}.
// Otherwise we would have to duplicate the code for doing so.
@ -895,6 +926,24 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
scriptExtensions.append((char16_t)0);
}
// Freeze the collected Block values into an immutable "small" trie with 16-bit values.
blockTrie = umutablecptrie_buildImmutable(
    mutableBlockTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_16, &errorCode);
if (U_FAILURE(errorCode)) {
    fprintf(stderr,
            "genprops/Block error: umutablecptrie_buildImmutable() failed: %s\n",
            u_errorName(errorCode));
    return;
}
// Serialize the trie into the fixed-size member buffer for writeBinaryData().
blockTrieSize = ucptrie_toBinary(blockTrie,
                                 blockTrieBytes, sizeof(blockTrieBytes), &errorCode);
if (U_FAILURE(errorCode)) {
    // Report the length returned for the Block trie itself;
    // trieSize belongs to the main properties trie and would be misleading here.
    fprintf(stderr,
            "genprops/Block error: ucptrie_toBinary() failed: %s (length %ld)\n",
            u_errorName(errorCode), (long)blockTrieSize);
    return;
}
// The data file is laid out in 32-bit units, so the trie must fill whole units.
U_ASSERT((blockTrieSize & 3) == 0); // multiple of 4 bytes
/* set indexes */
int32_t offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
offset+=trieSize>>2;
@ -909,14 +958,14 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
offset+=pvCount;
indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=offset;
offset+=scriptExtensions.length()/2;
indexes[UPROPS_RESERVED_INDEX_7]=offset;
indexes[UPROPS_BLOCK_TRIE_INDEX]=offset;
offset+=blockTrieSize/4;
indexes[UPROPS_RESERVED_INDEX_8]=offset;
indexes[UPROPS_DATA_TOP_INDEX]=offset;
totalSize=4*offset;
indexes[UPROPS_MAX_VALUES_INDEX]=
(((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
(int32_t)splitScriptCodeOrIndex(USCRIPT_CODE_LIMIT-1);
indexes[UPROPS_MAX_VALUES_2_INDEX]=
(((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
@ -924,6 +973,8 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
(((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
(((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
((int32_t)U_DT_COUNT-1);
indexes[UPROPS_MAX_VALUES_OTHER_INDEX]=
(int32_t)UBLOCK_COUNT-1;
if(!beQuiet) {
puts("* uprops.icu stats *");
@ -932,6 +983,7 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
printf("number of additional props vectors: %5u\n", (int)pvRows);
printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
printf("number of 16-bit scriptExtensions: %5u\n", (int)scriptExtensions.length());
printf("size in bytes of Block trie: %5u\n", (int)blockTrieSize);
printf("data size: %6ld\n", (long)totalSize);
}
}
@ -988,6 +1040,8 @@ CorePropsBuilder::writeCSourceFile(const char *path, UErrorCode &errorCode) {
"",
"};\n\n");
usrc_writeUCPTrie(f, "block", blockTrie, UPRV_TARGET_SYNTAX_CCODE);
usrc_writeArray(f,
"static const int32_t indexes[UPROPS_INDEX_COUNT]={",
indexes, 32, UPROPS_INDEX_COUNT,
@ -1018,6 +1072,7 @@ CorePropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorC
udata_writeBlock(pData, props2TrieBlock, props2TrieSize);
udata_writeBlock(pData, pvArray, pvCount*4);
udata_writeBlock(pData, scriptExtensions.getBuffer(), scriptExtensions.length()*2);
udata_writeBlock(pData, blockTrieBytes, blockTrieSize);
long dataLength=udata_finish(pData, &errorCode);
if(U_FAILURE(errorCode)) {