mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 05:25:34 +00:00
ICU-22785 move Block bits from propsvec0 to new trie
This commit is contained in:
parent
81492ae9a2
commit
0d8a3ccd11
14 changed files with 2876 additions and 2626 deletions
|
@ -186,6 +186,9 @@ void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
|
|||
case UPROPS_SRC_ID_COMPAT_MATH:
|
||||
uprops_addPropertyStarts(src, &sa, &errorCode);
|
||||
break;
|
||||
case UPROPS_SRC_BLOCK:
|
||||
ublock_addPropertyStarts(&sa, errorCode);
|
||||
break;
|
||||
default:
|
||||
errorCode = U_INTERNAL_PROGRAM_ERROR;
|
||||
break;
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "uassert.h"
|
||||
|
@ -515,6 +516,8 @@ uprv_getMaxValues(int32_t column) {
|
|||
return indexes[UPROPS_MAX_VALUES_INDEX];
|
||||
case 2:
|
||||
return indexes[UPROPS_MAX_VALUES_2_INDEX];
|
||||
case UPROPS_MAX_VALUES_OTHER_INDEX:
|
||||
return indexes[column];
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
|
@ -618,7 +621,15 @@ uscript_getScriptExtensions(UChar32 c,
|
|||
|
||||
/**
 * Returns the Block property value of a code point.
 *
 * The value is read from the dedicated Block trie (block_trie), not from
 * the properties vectors: the former bit field in vector word 0 is reserved
 * since data format version 9, so the old mask-and-shift lookup must not
 * run ahead of (and thereby disable) the trie lookup.
 *
 * @param c the code point to look up
 * @return the value stored in the trie for c, converted to UBlockCode;
 *         for out-of-range input this is whatever the trie returns as its
 *         error value
 */
U_CAPI UBlockCode U_EXPORT2
ublock_getCode(UChar32 c) {
    // We store Block values indexed by the code point shifted right 4 bits
    // and use a "small" UCPTrie=CodePointTrie for minimal data size.
    // This works because blocks have xxx0..xxxF ranges.
    uint32_t c4 = c;  // unsigned so that shifting right does not worry the compiler
    // Shift unless out of range, in which case we fetch the trie's error value.
    // (Assuming UChar32 is a signed 32-bit type, a negative c turns into a
    // large unsigned value here and is therefore passed through unshifted.)
    if (c4 <= 0x10ffff) {
        c4 >>= 4;
    }
    return (UBlockCode)ucptrie_get(&block_trie, c4);
}
|
||||
|
||||
/* property starts for UnicodeSet ------------------------------------------- */
|
||||
|
@ -706,3 +717,18 @@ upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
|
|||
/* add the start code point of each same-value range of the properties vectors trie */
|
||||
utrie2_enum(&propsVectorsTrie, nullptr, _enumPropertyStartsRange, sa);
|
||||
}
|
||||
|
||||
/**
 * Adds to the set behind sa the first code point of every same-value range
 * of the Block trie.
 *
 * The trie is indexed by (code point >> 4), see ublock_getCode(), so each
 * range start found in the trie is shifted left by 4 bits before it is
 * handed to sa->add().
 *
 * @param sa set adder; sa->add() is called once per trie range
 * The UErrorCode parameter is currently unused.
 */
U_CFUNC void U_EXPORT2
ublock_addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) {
    // Limit for trie indexes: (max code point + 1) >> 4
    constexpr UChar32 kIndexLimit = 0x11000;
    uint32_t rangeValue;  // filled in by ucptrie_getRange(); not otherwise used
    for (UChar32 rangeStart = 0; rangeStart < kIndexLimit;) {
        const UChar32 rangeEnd =
            ucptrie_getRange(&block_trie, rangeStart, UCPMAP_RANGE_NORMAL, 0,
                             nullptr, nullptr, &rangeValue);
        if (rangeEnd < 0) {
            // A negative result ends the enumeration.
            break;
        }
        sa->add(sa->set, rangeStart << 4);
        rangeStart = rangeEnd + 1;
    }
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -544,6 +544,14 @@ static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) {
|
|||
return ubidi_getMaxValue(which);
|
||||
}
|
||||
|
||||
static int32_t getBlock(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
|
||||
return (int32_t)ublock_getCode(c);
|
||||
}
|
||||
|
||||
static int32_t blockGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) {
|
||||
return uprv_getMaxValues(UPROPS_MAX_VALUES_OTHER_INDEX) & UPROPS_MAX_BLOCK;
|
||||
}
|
||||
|
||||
#if UCONFIG_NO_NORMALIZATION
|
||||
static int32_t getCombiningClass(const IntProperty &, UChar32, UProperty) {
|
||||
return 0;
|
||||
|
@ -683,7 +691,7 @@ static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
|
|||
* For them, column is the UPropertySource value.
|
||||
*/
|
||||
{ UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue },
|
||||
{ 0, UPROPS_BLOCK_MASK, UPROPS_BLOCK_SHIFT, defaultGetValue, defaultGetMaxValue },
|
||||
{ UPROPS_SRC_BLOCK, 0, 0, getBlock, blockGetMaxValue },
|
||||
{ UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift },
|
||||
{ 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue },
|
||||
{ 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue },
|
||||
|
|
|
@ -39,16 +39,18 @@ enum {
|
|||
|
||||
UPROPS_SCRIPT_EXTENSIONS_INDEX,
|
||||
|
||||
UPROPS_RESERVED_INDEX_7,
|
||||
UPROPS_BLOCK_TRIE_INDEX,
|
||||
UPROPS_RESERVED_INDEX_8,
|
||||
|
||||
/* size of the data file (number of 32-bit units after the header) */
|
||||
/** size of the data file (number of 32-bit units after the header) */
|
||||
UPROPS_DATA_TOP_INDEX,
|
||||
|
||||
/* maximum values for code values in vector word 0 */
|
||||
/** maximum values for code values in vector word 0 */
|
||||
UPROPS_MAX_VALUES_INDEX=10,
|
||||
/* maximum values for code values in vector word 2 */
|
||||
/** maximum values for code values in vector word 2 */
|
||||
UPROPS_MAX_VALUES_2_INDEX,
|
||||
/** maximum values for other code values */
|
||||
UPROPS_MAX_VALUES_OTHER_INDEX,
|
||||
|
||||
UPROPS_INDEX_COUNT=16
|
||||
};
|
||||
|
@ -117,6 +119,7 @@ enum {
|
|||
/* number of properties vector words */
|
||||
#define UPROPS_VECTOR_WORDS 3
|
||||
|
||||
// TODO: merge scx+Script bit sets together
|
||||
/*
|
||||
* Properties in vector word 0
|
||||
* Bits
|
||||
|
@ -129,7 +132,7 @@ enum {
|
|||
* 0: Script=bits 21..20 & 7..0
|
||||
* 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions
|
||||
* 19..17 East Asian Width
|
||||
* 16.. 8 UBlockCode
|
||||
* 16.. 8 reserved since format version 9; was UBlockCode
|
||||
* 7.. 0 UScriptCode, or index to Script_Extensions
|
||||
*/
|
||||
|
||||
|
@ -150,8 +153,8 @@ enum {
|
|||
#define UPROPS_EA_MASK 0x000e0000
|
||||
#define UPROPS_EA_SHIFT 17
|
||||
|
||||
#define UPROPS_BLOCK_MASK 0x0001ff00
|
||||
#define UPROPS_BLOCK_SHIFT 8
|
||||
// #define UPROPS_BLOCK_MASK 0x0001ff00
|
||||
// #define UPROPS_BLOCK_SHIFT 8
|
||||
|
||||
#define UPROPS_SCRIPT_LOW_MASK 0x000000ff
|
||||
|
||||
|
@ -319,6 +322,17 @@ inline constexpr uint8_t uprops_idTypeToEncoded[] = {
|
|||
|
||||
#define UPROPS_DT_MASK 0x0000001f
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
namespace {
|
||||
|
||||
// Bits 9..0 in UPROPS_MAX_VALUES_OTHER_INDEX
|
||||
inline constexpr uint32_t UPROPS_MAX_BLOCK = 0x3ff;
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // __cplusplus
|
||||
|
||||
/**
|
||||
* Gets the main properties value for a code point.
|
||||
* Implemented in uchar.c for uprops.cpp.
|
||||
|
@ -392,6 +406,8 @@ enum {
|
|||
ZWNBSP =0xfeff
|
||||
};
|
||||
|
||||
// TODO: Move these two functions into a different header file (new unames.h?) so that uprops.h
|
||||
// need not be C-compatible any more.
|
||||
/**
|
||||
* Get the maximum length of a (regular/1.0/extended) character name.
|
||||
* @return 0 if no character names available.
|
||||
|
@ -445,6 +461,7 @@ enum UPropertySource {
|
|||
UPROPS_SRC_EMOJI,
|
||||
UPROPS_SRC_IDSU,
|
||||
UPROPS_SRC_ID_COMPAT_MATH,
|
||||
UPROPS_SRC_BLOCK,
|
||||
/** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
|
||||
UPROPS_SRC_COUNT
|
||||
};
|
||||
|
@ -476,6 +493,13 @@ upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
|
|||
U_CFUNC void U_EXPORT2
|
||||
uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
U_CFUNC void U_EXPORT2
|
||||
ublock_addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode);
|
||||
|
||||
#endif // __cplusplus
|
||||
|
||||
/**
|
||||
* Return a set of characters for property enumeration.
|
||||
* For each two consecutive characters (start, limit) in the set,
|
||||
|
@ -488,6 +512,8 @@ uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *p
|
|||
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
*/
|
||||
|
||||
// TODO: Move this into a different header file (udataswp.h? new unames.h?) so that uprops.h
|
||||
// need not be C-compatible any more.
|
||||
/**
|
||||
* Swap the ICU Unicode character names file. See uchar.c.
|
||||
* @internal
|
||||
|
|
Binary file not shown.
|
@ -72,7 +72,7 @@ export CLDR_SRC=~/cldr/uni/src
|
|||
export ICU_ROOT=~/icu/uni
|
||||
export ICU_SRC=$ICU_ROOT/src
|
||||
export ICU_OUT=$ICU_ROOT/dbg
|
||||
export ICUDT=icudt75b
|
||||
export ICUDT=icudt76b
|
||||
export ICU4C_DATA_IN=$ICU_SRC/icu4c/source/data/in
|
||||
export ICU4C_UNIDATA=$ICU_SRC/icu4c/source/data/unidata
|
||||
export LD_LIBRARY_PATH=$ICU_OUT/icu4c/lib
|
||||
|
@ -86,7 +86,7 @@ export CLDR_SRC=~/oss/cldr/mine/src
|
|||
export ICU_ROOT=~/oss/icu
|
||||
export ICU_SRC=$ICU_ROOT
|
||||
export ICU_OUT=$ICU_ROOT
|
||||
export ICUDT=icudt75b
|
||||
export ICUDT=icudt76b
|
||||
export ICU4C_DATA_IN=$ICU_SRC/icu4c/source/data/in
|
||||
export ICU4C_UNIDATA=$ICU_SRC/icu4c/source/data/unidata
|
||||
export LD_LIBRARY_PATH=$ICU_OUT/icu4c/lib
|
||||
|
@ -326,14 +326,14 @@ TODO
|
|||
output:
|
||||
...
|
||||
make[1]: Entering directory '/usr/local/google/home/mscherer/icu/uni/dbg/icu4c/data'
|
||||
mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt75b
|
||||
mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt75b
|
||||
LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt75l.dat ./out/icu4j/icudt75b.dat -s ./out/build/icudt75l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt75b
|
||||
mv ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt75b"
|
||||
jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt75b/
|
||||
mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt76b
|
||||
mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt76b
|
||||
LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt76l.dat ./out/icu4j/icudt76b.dat -s ./out/build/icudt76l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt76b
|
||||
mv ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt76b"
|
||||
jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt76b/
|
||||
mkdir -p /tmp/icu4j/main/shared/data
|
||||
cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data
|
||||
jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt75b/
|
||||
jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt76b/
|
||||
mkdir -p /tmp/icu4j/main/shared/data
|
||||
cp ./out/icu4j/icutzdata.jar /tmp/icu4j/main/shared/data
|
||||
make[1]: Leaving directory '/usr/local/google/home/mscherer/icu/uni/dbg/icu4c/data'
|
||||
|
@ -343,8 +343,8 @@ TODO
|
|||
cp -v com/ibm/icu/impl/data/$ICUDT/brkitr/* $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT/brkitr
|
||||
cp -v com/ibm/icu/impl/data/$ICUDT/confusables.cfu $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT
|
||||
cp -v com/ibm/icu/impl/data/$ICUDT/*.nrm $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT
|
||||
cd com/ibm/icu/impl/data/$ICUDT/
|
||||
ls *.icu | egrep -v "cnvalias.icu" | awk '{print "cp " $0 " $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT";}' | sh
|
||||
cd com/ibm/icu/impl/data/icudata/
|
||||
ls *.icu | egrep -v "cnvalias.icu" | awk '{print "cp " $0 " $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata";}' | sh
|
||||
- The procedure above is very conservative:
|
||||
It refreshes only the parts of the ICU4J data that we think are affected by a Unicode data update.
|
||||
It avoids dealing with any other discrepancies
|
||||
|
|
|
@ -46,6 +46,7 @@
|
|||
#include "ucol_imp.h"
|
||||
#include "ucol_swp.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "udataswp.h"
|
||||
#include "sprpimpl.h"
|
||||
#include "rbbidata.h"
|
||||
|
||||
|
|
|
@ -440,7 +440,7 @@ group: script_runs
|
|||
group: uchar
|
||||
uchar.o
|
||||
deps
|
||||
utrie2
|
||||
ucptrie utrie2
|
||||
|
||||
group: messagepattern # for MessageFormat and tools
|
||||
messagepattern.o
|
||||
|
|
|
@ -296,13 +296,21 @@ uprops_swap(const UDataSwapper *ds,
|
|||
// SCX const uint16_t scriptExtensions[2*(i7-i6)];
|
||||
ds->swapArray16(ds,
|
||||
inData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX],
|
||||
4*(dataIndexes[UPROPS_RESERVED_INDEX_7]-dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]),
|
||||
4*(dataIndexes[UPROPS_BLOCK_TRIE_INDEX]-dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]),
|
||||
outData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX],
|
||||
pErrorCode);
|
||||
|
||||
// Swap the Block UCPTrie=CodePointTrie.
|
||||
int32_t partOffset = dataIndexes[UPROPS_BLOCK_TRIE_INDEX];
|
||||
int32_t nextOffset = dataIndexes[UPROPS_RESERVED_INDEX_8];
|
||||
int32_t partLength = 4 * (nextOffset - partOffset);
|
||||
if (partLength >= 0) {
|
||||
utrie_swapAnyVersion(ds, inData32 + partOffset, partLength,
|
||||
outData32 + partOffset, pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
/* i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data */
|
||||
return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX_7];
|
||||
return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX_8];
|
||||
}
|
||||
|
||||
/* Unicode case mapping data swapping --------------------------------------- */
|
||||
|
|
|
@ -82,6 +82,9 @@ public final class CharacterPropertiesImpl {
|
|||
case UCharacterProperty.SRC_ID_COMPAT_MATH:
|
||||
UCharacterProperty.mathCompat_addPropertyStarts(incl);
|
||||
break;
|
||||
case UCharacterProperty.SRC_BLOCK:
|
||||
UCharacterProperty.INSTANCE.ublock_addPropertyStarts(incl);
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException("getInclusions(unknown src " + src + ")");
|
||||
}
|
||||
|
|
|
@ -114,8 +114,9 @@ public final class UCharacterProperty
|
|||
public static final int SRC_EMOJI=15;
|
||||
public static final int SRC_IDSU=16;
|
||||
public static final int SRC_ID_COMPAT_MATH=17;
|
||||
public static final int SRC_BLOCK=18;
|
||||
/** One more than the highest UPropertySource (SRC_) constant. */
|
||||
public static final int SRC_COUNT=18;
|
||||
public static final int SRC_COUNT=19;
|
||||
|
||||
private static final class LayoutProps {
|
||||
private static final class IsAcceptable implements ICUBinary.Authenticate {
|
||||
|
@ -736,7 +737,24 @@ public final class UCharacterProperty
|
|||
return UBiDiProps.INSTANCE.getClass(c);
|
||||
}
|
||||
},
|
||||
new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_),
|
||||
new IntProperty(SRC_BLOCK) { // BLOCK
|
||||
@Override
|
||||
int getValue(int c) {
|
||||
// We store Block values indexed by the code point shifted right 4 bits
|
||||
// and use a "small" UCPTrie=CodePointTrie for minimal data size.
|
||||
// This works because blocks have xxx0..xxxF ranges.
|
||||
int c4 = c;
|
||||
// Shift unless out of range, in which case we fetch the trie's error value.
|
||||
if (c4 <= 0x10ffff) {
|
||||
c4 >>= 4;
|
||||
}
|
||||
return m_blockTrie_.get(c4);
|
||||
}
|
||||
@Override
|
||||
int getMaxValue(int which) {
|
||||
return m_maxValuesOther_ & MAX_BLOCK;
|
||||
}
|
||||
},
|
||||
new CombiningClassIntProperty(SRC_NFC) { // CANONICAL_COMBINING_CLASS
|
||||
@Override
|
||||
int getValue(int c) {
|
||||
|
@ -1273,13 +1291,17 @@ public final class UCharacterProperty
|
|||
* Maximum values for script, bits used as in vector word
|
||||
* 0
|
||||
*/
|
||||
int m_maxJTGValue_;
|
||||
int m_maxJTGValue_;
|
||||
/** maximum values for other code values */
|
||||
int m_maxValuesOther_;
|
||||
|
||||
/**
|
||||
* Script_Extensions data
|
||||
*/
|
||||
public char[] m_scriptExtensions_;
|
||||
|
||||
CodePointTrie m_blockTrie_;
|
||||
|
||||
// private variables -------------------------------------------------
|
||||
|
||||
/**
|
||||
|
@ -1346,7 +1368,8 @@ public final class UCharacterProperty
|
|||
/*
|
||||
* Properties in vector word 0
|
||||
* Bits
|
||||
* 31..24 DerivedAge version major/minor one nibble each
|
||||
* 31..26 Age major version (0..63)
|
||||
* 25..24 Age minor version (0..3)
|
||||
* 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index
|
||||
* 3: Script value from Script_Extensions
|
||||
* 2: Script=Inherited
|
||||
|
@ -1354,7 +1377,7 @@ public final class UCharacterProperty
|
|||
* 0: Script=bits 21..20 & 7..0
|
||||
* 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions
|
||||
* 19..17 East Asian Width
|
||||
* 16.. 8 UBlockCode
|
||||
* 16.. 8 reserved since format version 9; was UBlockCode
|
||||
* 7.. 0 UScriptCode, or index to Script_Extensions
|
||||
*/
|
||||
|
||||
|
@ -1381,16 +1404,6 @@ public final class UCharacterProperty
|
|||
* Equivalent to icu4c UPROPS_EA_SHIFT
|
||||
*/
|
||||
private static final int EAST_ASIAN_SHIFT_ = 17;
|
||||
/**
|
||||
* Integer properties mask and shift values for blocks.
|
||||
* Equivalent to icu4c UPROPS_BLOCK_MASK
|
||||
*/
|
||||
private static final int BLOCK_MASK_ = 0x0001ff00;
|
||||
/**
|
||||
* Integer properties mask and shift values for blocks.
|
||||
* Equivalent to icu4c UPROPS_BLOCK_SHIFT
|
||||
*/
|
||||
private static final int BLOCK_SHIFT_ = 8;
|
||||
/**
|
||||
* Integer properties mask and shift values for scripts.
|
||||
* Equivalent to icu4c UPROPS_SHIFT_LOW_MASK.
|
||||
|
@ -1549,6 +1562,8 @@ public final class UCharacterProperty
|
|||
*/
|
||||
private static final int AGE_SHIFT_ = 24;
|
||||
|
||||
// Bits 9..0 in UPROPS_MAX_VALUES_OTHER_INDEX
|
||||
private static final int MAX_BLOCK = 0x3ff;
|
||||
|
||||
// private constructors --------------------------------------------------
|
||||
|
||||
|
@ -1577,12 +1592,13 @@ public final class UCharacterProperty
|
|||
int additionalVectorsOffset = bytes.getInt();
|
||||
m_additionalColumnsCount_ = bytes.getInt();
|
||||
int scriptExtensionsOffset = bytes.getInt();
|
||||
int reservedOffset7 = bytes.getInt();
|
||||
/* reservedOffset8 = */ bytes.getInt();
|
||||
int blockTrieOffset = bytes.getInt();
|
||||
int reservedOffset8 = bytes.getInt();
|
||||
/* dataTopOffset = */ bytes.getInt();
|
||||
m_maxBlockScriptValue_ = bytes.getInt();
|
||||
m_maxJTGValue_ = bytes.getInt();
|
||||
ICUBinary.skipBytes(bytes, (16 - 12) << 2);
|
||||
m_maxValuesOther_ = bytes.getInt();
|
||||
ICUBinary.skipBytes(bytes, (16 - 13) << 2);
|
||||
|
||||
// read the main properties trie
|
||||
m_trie_ = Trie2_16.createFromSerialized(bytes);
|
||||
|
@ -1614,10 +1630,20 @@ public final class UCharacterProperty
|
|||
}
|
||||
|
||||
// Script_Extensions
|
||||
int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
|
||||
int numChars = (blockTrieOffset - scriptExtensionsOffset) * 2;
|
||||
if(numChars > 0) {
|
||||
m_scriptExtensions_ = ICUBinary.getChars(bytes, numChars, 0);
|
||||
}
|
||||
|
||||
// Read the blockTrie.
|
||||
int partLength = (reservedOffset8 - blockTrieOffset) * 4;
|
||||
int triePosition = bytes.position();
|
||||
m_blockTrie_ = CodePointTrie.fromBinary(null, CodePointTrie.ValueWidth.BITS_16, bytes);
|
||||
trieLength = bytes.position() - triePosition;
|
||||
if (trieLength > partLength) {
|
||||
throw new ICUUncheckedIOException("uprops.icu: not enough bytes for blockTrie");
|
||||
}
|
||||
ICUBinary.skipBytes(bytes, partLength - trieLength); // skip padding after trie bytes
|
||||
}
|
||||
|
||||
private static final class IsAcceptable implements ICUBinary.Authenticate {
|
||||
|
@ -1794,6 +1820,19 @@ public final class UCharacterProperty
|
|||
}
|
||||
}
|
||||
|
||||
public void ublock_addPropertyStarts(UnicodeSet set) {
|
||||
// Add the start code point of each same-value range of the trie.
|
||||
// We store Block values indexed by the code point shifted right 4 bits;
|
||||
// see ublock_getCode().
|
||||
CodePointMap.Range range = new CodePointMap.Range();
|
||||
int start = 0;
|
||||
while (start < 0x11000 && // limit: (max code point + 1) >> 4
|
||||
m_blockTrie_.getRange(start, null, range)) {
|
||||
set.add(start << 4);
|
||||
start = range.getEnd() + 1;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasIDType(int c, int typeIndex) {
|
||||
if (typeIndex < 0 || typeIndex >= idTypeToEncoded.length) {
|
||||
return false;
|
||||
|
|
Binary file not shown.
|
@ -22,7 +22,9 @@
|
|||
#include <stdio.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucptrie.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/umutablecptrie.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/usetiter.h"
|
||||
|
@ -81,13 +83,14 @@ Formally, the file contains the following structures:
|
|||
i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
|
||||
|
||||
i6 scriptExtensionsIndex; -- 32-bit unit index to the Script_Extensions data
|
||||
i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data
|
||||
i8 reservedIndex8; -- for now: i7, i8 and i9 have the same values
|
||||
i7 blockTrieIndex; -- 32-bit unit index to the Block property trie (format version 9+)
|
||||
i8 reservedIndex8; -- top of the previous part of the data; i8 and i9 have the same values
|
||||
i9 dataTopIndex; -- size of the data file (number of 32-bit units after the header)
|
||||
|
||||
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
|
||||
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
|
||||
i12..i15 reservedIndexes; -- reserved values; 0 for now
|
||||
i12 maxValuesOther; -- additional maximum values, see uprops.h (format version 9+)
|
||||
i13..i15 reservedIndexes; -- reserved values; 0 for now
|
||||
|
||||
PT serialized properties trie, see utrie2.h (byte size: 4*(i0-16))
|
||||
|
||||
|
@ -114,6 +117,10 @@ Formally, the file contains the following structures:
|
|||
vs. another value (and the index is to a pair).
|
||||
(See UPROPS_SCRIPT_X_WITH_COMMON etc. in uprops.h.)
|
||||
|
||||
blockTrie serialized CodePointTrie/UCPTrie for the Block property (format version 9+)
|
||||
|
||||
Indexed by (code point >> 4). Takes advantage of each Block having xxx0..xxxF boundaries.
|
||||
|
||||
Trie lookup and properties:
|
||||
|
||||
In order to condense the data for the 21-bit code space, several properties of
|
||||
|
@ -299,6 +306,10 @@ ICU 75 uses the vector word 2 bits 31..26 for encoded Identifier_Type bit sets.
|
|||
|
||||
Age major:minor version bit fields changed from 4:4 to 6:2 so that age=16.0 fits.
|
||||
|
||||
Block data moved from props vector 0 into its own new CodePointTrie.
|
||||
Reserve 10 bits in the new indexes[UPROPS_MAX_VALUES_OTHER_INDEX] for the max Block value,
|
||||
although the trie can hold 16-bit values.
|
||||
|
||||
----------------------------------------------------------------------------- */
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
@ -343,7 +354,11 @@ private:
|
|||
UTrie2 *pTrie;
|
||||
UTrie2 *props2Trie;
|
||||
UPropsVectors *pv;
|
||||
UMutableCPTrie *mutableBlockTrie = nullptr;
|
||||
UCPTrie *blockTrie = nullptr;
|
||||
UnicodeString scriptExtensions;
|
||||
uint8_t blockTrieBytes[100000];
|
||||
int32_t blockTrieSize = 0;
|
||||
};
|
||||
|
||||
CorePropsBuilder::CorePropsBuilder(UErrorCode &errorCode)
|
||||
|
@ -358,12 +373,19 @@ CorePropsBuilder::CorePropsBuilder(UErrorCode &errorCode)
|
|||
fprintf(stderr, "genprops error: corepropsbuilder upvec_open() failed - %s\n",
|
||||
u_errorName(errorCode));
|
||||
}
|
||||
mutableBlockTrie = umutablecptrie_open(0, 0, &errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "genprops/Block error: umutablecptrie_open() failed: %s\n",
|
||||
u_errorName(errorCode));
|
||||
}
|
||||
}
|
||||
|
||||
CorePropsBuilder::~CorePropsBuilder() {
|
||||
utrie2_close(pTrie);
|
||||
utrie2_close(props2Trie);
|
||||
upvec_close(pv);
|
||||
umutablecptrie_close(mutableBlockTrie);
|
||||
ucptrie_close(blockTrie);
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -693,7 +715,6 @@ struct PropToEnum {
|
|||
|
||||
const PropToEnum
|
||||
propToEnums[]={
|
||||
{ UCHAR_BLOCK, 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK },
|
||||
{ UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
|
||||
{ UCHAR_DECOMPOSITION_TYPE, 2, 0, UPROPS_DT_MASK },
|
||||
{ UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
|
||||
|
@ -755,6 +776,16 @@ CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|||
0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK,
|
||||
&errorCode);
|
||||
}
|
||||
if (newValues.contains(UCHAR_BLOCK)) {
|
||||
uint32_t value = props.getIntProp(UCHAR_BLOCK);
|
||||
if ((start & 0xf) != 0 || (end & 0xf) != 0xf || value > UPROPS_MAX_BLOCK) {
|
||||
fprintf(stderr, "genprops error: %04lX..%04lX Block 0x%x cannot be encoded\n",
|
||||
(long)start, (long)end, (int)value);
|
||||
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
umutablecptrie_setRange(mutableBlockTrie, start >> 4, end >> 4, value, &errorCode);
|
||||
}
|
||||
|
||||
// Set the script value if the Script_Extensions revert to {Script}.
|
||||
// Otherwise we would have to duplicate the code for doing so.
|
||||
|
@ -895,6 +926,24 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
|
|||
scriptExtensions.append((char16_t)0);
|
||||
}
|
||||
|
||||
blockTrie = umutablecptrie_buildImmutable(
|
||||
mutableBlockTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_16, &errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
fprintf(stderr,
|
||||
"genprops/Block error: umutablecptrie_buildImmutable() failed: %s\n",
|
||||
u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
blockTrieSize = ucptrie_toBinary(blockTrie,
|
||||
blockTrieBytes, sizeof(blockTrieBytes), &errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
fprintf(stderr,
|
||||
"genprops/Block error: ucptrie_toBinary() failed: %s (length %ld)\n",
|
||||
u_errorName(errorCode), (long)trieSize);
|
||||
return;
|
||||
}
|
||||
U_ASSERT((blockTrieSize & 3) == 0); // multiple of 4 bytes
|
||||
|
||||
/* set indexes */
|
||||
int32_t offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
|
||||
offset+=trieSize>>2;
|
||||
|
@ -909,14 +958,14 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
|
|||
offset+=pvCount;
|
||||
indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=offset;
|
||||
offset+=scriptExtensions.length()/2;
|
||||
indexes[UPROPS_RESERVED_INDEX_7]=offset;
|
||||
indexes[UPROPS_BLOCK_TRIE_INDEX]=offset;
|
||||
offset+=blockTrieSize/4;
|
||||
indexes[UPROPS_RESERVED_INDEX_8]=offset;
|
||||
indexes[UPROPS_DATA_TOP_INDEX]=offset;
|
||||
totalSize=4*offset;
|
||||
|
||||
indexes[UPROPS_MAX_VALUES_INDEX]=
|
||||
(((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
|
||||
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
|
||||
(int32_t)splitScriptCodeOrIndex(USCRIPT_CODE_LIMIT-1);
|
||||
indexes[UPROPS_MAX_VALUES_2_INDEX]=
|
||||
(((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
|
||||
|
@ -924,6 +973,8 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
|
|||
(((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
|
||||
(((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
|
||||
((int32_t)U_DT_COUNT-1);
|
||||
indexes[UPROPS_MAX_VALUES_OTHER_INDEX]=
|
||||
(int32_t)UBLOCK_COUNT-1;
|
||||
|
||||
if(!beQuiet) {
|
||||
puts("* uprops.icu stats *");
|
||||
|
@ -932,6 +983,7 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
|
|||
printf("number of additional props vectors: %5u\n", (int)pvRows);
|
||||
printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
|
||||
printf("number of 16-bit scriptExtensions: %5u\n", (int)scriptExtensions.length());
|
||||
printf("size in bytes of Block trie: %5u\n", (int)blockTrieSize);
|
||||
printf("data size: %6ld\n", (long)totalSize);
|
||||
}
|
||||
}
|
||||
|
@ -988,6 +1040,8 @@ CorePropsBuilder::writeCSourceFile(const char *path, UErrorCode &errorCode) {
|
|||
"",
|
||||
"};\n\n");
|
||||
|
||||
usrc_writeUCPTrie(f, "block", blockTrie, UPRV_TARGET_SYNTAX_CCODE);
|
||||
|
||||
usrc_writeArray(f,
|
||||
"static const int32_t indexes[UPROPS_INDEX_COUNT]={",
|
||||
indexes, 32, UPROPS_INDEX_COUNT,
|
||||
|
@ -1018,6 +1072,7 @@ CorePropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorC
|
|||
udata_writeBlock(pData, props2TrieBlock, props2TrieSize);
|
||||
udata_writeBlock(pData, pvArray, pvCount*4);
|
||||
udata_writeBlock(pData, scriptExtensions.getBuffer(), scriptExtensions.length()*2);
|
||||
udata_writeBlock(pData, blockTrieBytes, blockTrieSize);
|
||||
|
||||
long dataLength=udata_finish(pData, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
|
|
Loading…
Add table
Reference in a new issue