mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 15:42:14 +00:00
ICU-22785 uprops.icu: coalesce scx+sc bits
This commit is contained in:
parent
0d8a3ccd11
commit
c439dcdf27
9 changed files with 585 additions and 653 deletions
|
@ -543,7 +543,7 @@ uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
|
|||
return USCRIPT_INVALID_CODE;
|
||||
}
|
||||
uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
|
||||
uint32_t codeOrIndex=uprops_mergeScriptCodeOrIndex(scriptX);
|
||||
uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT;
|
||||
if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
|
||||
return (UScriptCode)codeOrIndex;
|
||||
} else if(scriptX<UPROPS_SCRIPT_X_WITH_INHERITED) {
|
||||
|
@ -558,7 +558,7 @@ uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
|
|||
U_CAPI UBool U_EXPORT2
|
||||
uscript_hasScript(UChar32 c, UScriptCode sc) UPRV_NO_SANITIZE_UNDEFINED {
|
||||
uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
|
||||
uint32_t codeOrIndex=uprops_mergeScriptCodeOrIndex(scriptX);
|
||||
uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT;
|
||||
if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
|
||||
return sc==(UScriptCode)codeOrIndex;
|
||||
}
|
||||
|
@ -590,7 +590,7 @@ uscript_getScriptExtensions(UChar32 c,
|
|||
return 0;
|
||||
}
|
||||
uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
|
||||
uint32_t codeOrIndex=uprops_mergeScriptCodeOrIndex(scriptX);
|
||||
uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT;
|
||||
if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
|
||||
if(capacity==0) {
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -585,8 +585,7 @@ static int32_t getScript(const IntProperty &/*prop*/, UChar32 c, UProperty /*whi
|
|||
}
|
||||
|
||||
static int32_t scriptGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) {
|
||||
uint32_t scriptX=uprv_getMaxValues(0)&UPROPS_SCRIPT_X_MASK;
|
||||
return uprops_mergeScriptCodeOrIndex(scriptX);
|
||||
return uprv_getMaxValues(0)&UPROPS_MAX_SCRIPT;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -119,66 +119,40 @@ enum {
|
|||
/* number of properties vector words */
|
||||
#define UPROPS_VECTOR_WORDS 3
|
||||
|
||||
// TODO: merge scx+Script bit sets together
|
||||
/*
|
||||
* Properties in vector word 0
|
||||
* Bits
|
||||
* 31..26 Age major version (0..63)
|
||||
* 25..24 Age minor version (0..3)
|
||||
* 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index
|
||||
* 3: Script value from Script_Extensions
|
||||
* 2: Script=Inherited
|
||||
* 1: Script=Common
|
||||
* 0: Script=bits 21..20 & 7..0
|
||||
* 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions
|
||||
* 19..17 East Asian Width
|
||||
* 16.. 8 reserved since format version 9; was UBlockCode
|
||||
* 7.. 0 UScriptCode, or index to Script_Extensions
|
||||
*/
|
||||
|
||||
#define UPROPS_AGE_MASK 0xff000000
|
||||
#define UPROPS_AGE_SHIFT 24
|
||||
|
||||
/* Script_Extensions: mask includes Script */
|
||||
#define UPROPS_SCRIPT_X_MASK 0x00f000ff
|
||||
#define UPROPS_SCRIPT_X_SHIFT 22
|
||||
|
||||
// The UScriptCode or Script_Extensions index is split across two bit fields.
|
||||
// (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.)
|
||||
// Shift the high bits right by 12 to assemble the full value.
|
||||
#define UPROPS_SCRIPT_HIGH_MASK 0x00300000
|
||||
#define UPROPS_SCRIPT_HIGH_SHIFT 12
|
||||
#define UPROPS_MAX_SCRIPT 0x3ff
|
||||
|
||||
#define UPROPS_EA_MASK 0x000e0000
|
||||
#define UPROPS_EA_SHIFT 17
|
||||
|
||||
// #define UPROPS_BLOCK_MASK 0x0001ff00
|
||||
// #define UPROPS_BLOCK_SHIFT 8
|
||||
|
||||
#define UPROPS_SCRIPT_LOW_MASK 0x000000ff
|
||||
|
||||
/* UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */
|
||||
#define UPROPS_SCRIPT_X_WITH_COMMON 0x400000
|
||||
#define UPROPS_SCRIPT_X_WITH_INHERITED 0x800000
|
||||
#define UPROPS_SCRIPT_X_WITH_OTHER 0xc00000
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
namespace {
|
||||
|
||||
// Properties in vector word 0
|
||||
// Bits
|
||||
// 31..26 Age major version (major=0..63)
|
||||
// 25..24 Age minor version (minor=0..3)
|
||||
// 23..15 reserved
|
||||
// 14..12 East Asian Width
|
||||
// 11..10 3..1: Bits 9..0 = Script_Extensions index
|
||||
// 3: Script value from Script_Extensions
|
||||
// 2: Script=Inherited
|
||||
// 1: Script=Common
|
||||
// 0: Script=bits 9..0
|
||||
// 9.. 0 UScriptCode, or index to Script_Extensions
|
||||
|
||||
inline constexpr uint32_t UPROPS_AGE_MASK = 0xff000000;
|
||||
inline constexpr int32_t UPROPS_AGE_SHIFT = 24;
|
||||
|
||||
inline constexpr uint8_t UPROPS_AGE_MAJOR_MAX = 63;
|
||||
inline constexpr uint8_t UPROPS_AGE_MINOR_MAX = 3;
|
||||
|
||||
// Reassembles the script code or Script_Extensions index from scriptX,
// the props vector word 0 value already masked with UPROPS_SCRIPT_X_MASK.
// The value is stored in two separate bit fields: the bits selected by
// UPROPS_SCRIPT_HIGH_MASK are shifted right by UPROPS_SCRIPT_HIGH_SHIFT and
// OR-ed with the bits selected by UPROPS_SCRIPT_LOW_MASK.
// The Script_Extensions type bits of scriptX are not part of the result.
inline uint32_t uprops_mergeScriptCodeOrIndex(uint32_t scriptX) {
|
||||
return
|
||||
((scriptX & UPROPS_SCRIPT_HIGH_MASK) >> UPROPS_SCRIPT_HIGH_SHIFT) |
|
||||
(scriptX & UPROPS_SCRIPT_LOW_MASK);
|
||||
}
|
||||
inline constexpr uint32_t UPROPS_EA_MASK = 0x00007000;
|
||||
inline constexpr int32_t UPROPS_EA_SHIFT = 12;
|
||||
|
||||
} // namespace
|
||||
/** Script_Extensions: mask includes Script */
|
||||
inline constexpr uint32_t UPROPS_SCRIPT_X_MASK = 0x00000fff;
|
||||
|
||||
#endif // __cplusplus
|
||||
// UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions.
|
||||
inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_OTHER = 0xc00;
|
||||
inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_INHERITED = 0x800;
|
||||
inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_COMMON = 0x400;
|
||||
inline constexpr int32_t UPROPS_MAX_SCRIPT = 0x3ff;
|
||||
|
||||
/*
|
||||
* Properties in vector word 1
|
||||
|
@ -240,10 +214,6 @@ enum {
|
|||
* 4.. 0 Decomposition Type
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
namespace {
|
||||
|
||||
// https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type
|
||||
// The Identifier_Type maps each code point to a *set* of one or more values.
|
||||
// Some can be combined with others, some can only occur alone.
|
||||
|
|
Binary file not shown.
|
@ -803,8 +803,7 @@ public final class UCharacterProperty
|
|||
}
|
||||
@Override
|
||||
int getMaxValue(int which) {
|
||||
int scriptX=getMaxValues(0)&SCRIPT_X_MASK;
|
||||
return mergeScriptCodeOrIndex(scriptX);
|
||||
return getMaxValues(0)&MAX_SCRIPT;
|
||||
}
|
||||
},
|
||||
new IntProperty(SRC_PROPSVEC) { // HANGUL_SYLLABLE_TYPE
|
||||
|
@ -1365,62 +1364,31 @@ public final class UCharacterProperty
|
|||
NumericType.NUMERIC;
|
||||
}
|
||||
|
||||
/*
|
||||
* Properties in vector word 0
|
||||
* Bits
|
||||
* 31..26 Age major version (0..63)
|
||||
* 25..24 Age minor version (0..3)
|
||||
* 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index
|
||||
* 3: Script value from Script_Extensions
|
||||
* 2: Script=Inherited
|
||||
* 1: Script=Common
|
||||
* 0: Script=bits 21..20 & 7..0
|
||||
* 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions
|
||||
* 19..17 East Asian Width
|
||||
* 16.. 8 reserved since format version 9; was UBlockCode
|
||||
* 7.. 0 UScriptCode, or index to Script_Extensions
|
||||
*/
|
||||
// Properties in vector word 0
|
||||
// Bits
|
||||
// 31..26 Age major version (major=0..63)
|
||||
// 25..24 Age minor version (minor=0..3)
|
||||
// 23..15 reserved
|
||||
// 14..12 East Asian Width
|
||||
// 11..10 3..1: Bits 9..0 = Script_Extensions index
|
||||
// 3: Script value from Script_Extensions
|
||||
// 2: Script=Inherited
|
||||
// 1: Script=Common
|
||||
// 0: Script=bits 9..0
|
||||
// 9.. 0 UScriptCode, or index to Script_Extensions
|
||||
|
||||
/**
|
||||
* Script_Extensions: mask includes Script
|
||||
*/
|
||||
public static final int SCRIPT_X_MASK = 0x00f000ff;
|
||||
//private static final int SCRIPT_X_SHIFT = 22;
|
||||
private static final int EAST_ASIAN_MASK_ = 0x00007000;
|
||||
private static final int EAST_ASIAN_SHIFT_ = 12;
|
||||
|
||||
// The UScriptCode or Script_Extensions index is split across two bit fields.
|
||||
// (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.)
|
||||
// Shift the high bits right by 12 to assemble the full value.
|
||||
public static final int SCRIPT_HIGH_MASK = 0x00300000;
|
||||
public static final int SCRIPT_HIGH_SHIFT = 12;
|
||||
/** Script_Extensions: mask includes Script */
|
||||
public static final int SCRIPT_X_MASK = 0x00000fff;
|
||||
|
||||
// SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions.
|
||||
public static final int SCRIPT_X_WITH_OTHER = 0xc00;
|
||||
public static final int SCRIPT_X_WITH_INHERITED = 0x800;
|
||||
public static final int SCRIPT_X_WITH_COMMON = 0x400;
|
||||
public static final int MAX_SCRIPT = 0x3ff;
|
||||
|
||||
/**
|
||||
* Integer properties mask and shift values for East Asian cell width.
|
||||
* Equivalent to icu4c UPROPS_EA_MASK
|
||||
*/
|
||||
private static final int EAST_ASIAN_MASK_ = 0x000e0000;
|
||||
/**
|
||||
* Integer properties mask and shift values for East Asian cell width.
|
||||
* Equivalent to icu4c UPROPS_EA_SHIFT
|
||||
*/
|
||||
private static final int EAST_ASIAN_SHIFT_ = 17;
|
||||
/**
|
||||
* Integer properties mask and shift values for scripts.
|
||||
* Equivalent to icu4c UPROPS_SCRIPT_LOW_MASK.
|
||||
*/
|
||||
public static final int SCRIPT_LOW_MASK = 0x000000ff;
|
||||
|
||||
/* SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */
|
||||
public static final int SCRIPT_X_WITH_COMMON = 0x400000;
|
||||
public static final int SCRIPT_X_WITH_INHERITED = 0x800000;
|
||||
public static final int SCRIPT_X_WITH_OTHER = 0xc00000;
|
||||
|
||||
/**
 * Reassembles the script code or Script_Extensions index from a props
 * vector word 0 value that was masked with SCRIPT_X_MASK.
 * The bits selected by SCRIPT_HIGH_MASK are shifted right by
 * SCRIPT_HIGH_SHIFT and OR-ed with the bits selected by SCRIPT_LOW_MASK.
 *
 * @param scriptX the masked props vector word 0 value
 * @return the combined high and low bit fields as one integer
 */
public static final int mergeScriptCodeOrIndex(int scriptX) {
|
||||
return
|
||||
((scriptX & SCRIPT_HIGH_MASK) >> SCRIPT_HIGH_SHIFT) |
|
||||
(scriptX & SCRIPT_LOW_MASK);
|
||||
}
|
||||
|
||||
/**
|
||||
* Additional properties used in internal trie data
|
||||
*/
|
||||
|
|
|
@ -1227,7 +1227,7 @@ public final class UScript {
|
|||
public static final int getScript(int codepoint){
|
||||
if (codepoint >= UCharacter.MIN_VALUE & codepoint <= UCharacter.MAX_VALUE) {
|
||||
int scriptX=UCharacterProperty.INSTANCE.getAdditional(codepoint, 0)&UCharacterProperty.SCRIPT_X_MASK;
|
||||
int codeOrIndex=UCharacterProperty.mergeScriptCodeOrIndex(scriptX);
|
||||
int codeOrIndex=scriptX&UCharacterProperty.MAX_SCRIPT;
|
||||
if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
|
||||
return codeOrIndex;
|
||||
} else if(scriptX<UCharacterProperty.SCRIPT_X_WITH_INHERITED) {
|
||||
|
@ -1257,7 +1257,7 @@ public final class UScript {
|
|||
*/
|
||||
public static final boolean hasScript(int c, int sc) {
|
||||
int scriptX=UCharacterProperty.INSTANCE.getAdditional(c, 0)&UCharacterProperty.SCRIPT_X_MASK;
|
||||
int codeOrIndex=UCharacterProperty.mergeScriptCodeOrIndex(scriptX);
|
||||
int codeOrIndex=scriptX&UCharacterProperty.MAX_SCRIPT;
|
||||
if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
|
||||
return sc==codeOrIndex;
|
||||
}
|
||||
|
@ -1307,7 +1307,7 @@ public final class UScript {
|
|||
public static final int getScriptExtensions(int c, BitSet set) {
|
||||
set.clear();
|
||||
int scriptX=UCharacterProperty.INSTANCE.getAdditional(c, 0)&UCharacterProperty.SCRIPT_X_MASK;
|
||||
int codeOrIndex=UCharacterProperty.mergeScriptCodeOrIndex(scriptX);
|
||||
int codeOrIndex=scriptX&UCharacterProperty.MAX_SCRIPT;
|
||||
if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
|
||||
set.set(codeOrIndex);
|
||||
return codeOrIndex;
|
||||
|
|
Binary file not shown.
|
@ -310,6 +310,8 @@ Block data moved from props vector 0 into its own new CodePointTrie.
|
|||
Reserve 10 bits in the new indexes[UPROPS_MAX_VALUES_OTHER_INDEX] for the max Block value,
|
||||
although the trie can hold 16-bit values.
|
||||
|
||||
Props vector 0 bits shuffled so that script and script extensions bits are contiguous.
|
||||
|
||||
----------------------------------------------------------------------------- */
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
@ -331,12 +333,6 @@ UDataInfo dataInfo={
|
|||
{ 16, 0, 0, 0 } /* dataVersion */
|
||||
};
|
||||
|
||||
// Packs a script code or Script_Extensions index v into the two bit fields
// used in props vector word 0: v is shifted left by UPROPS_SCRIPT_HIGH_SHIFT
// and masked with UPROPS_SCRIPT_HIGH_MASK to place its upper bits, and the
// bits selected by UPROPS_SCRIPT_LOW_MASK are kept in their low position.
// Bits of v outside those two fields are dropped.
inline uint32_t splitScriptCodeOrIndex(uint32_t v) {
|
||||
return
|
||||
((v << UPROPS_SCRIPT_HIGH_SHIFT) & UPROPS_SCRIPT_HIGH_MASK) |
|
||||
(v & UPROPS_SCRIPT_LOW_MASK);
|
||||
}
|
||||
|
||||
class CorePropsBuilder : public PropsBuilder {
|
||||
public:
|
||||
CorePropsBuilder(UErrorCode &errorCode);
|
||||
|
@ -800,11 +796,10 @@ CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|||
!newValues.contains(UCHAR_SCRIPT);
|
||||
if(newValues.contains(UCHAR_SCRIPT) || revertToScript) {
|
||||
int32_t script=props.getIntProp(UCHAR_SCRIPT);
|
||||
uint32_t value=splitScriptCodeOrIndex(script);
|
||||
// Use UPROPS_SCRIPT_X_MASK:
|
||||
// When writing a Script code, remove Script_Extensions bits as well.
|
||||
// If needed, they will get written again.
|
||||
upvec_setValue(pv, start, pvecEnd, 0, value, UPROPS_SCRIPT_X_MASK, &errorCode);
|
||||
upvec_setValue(pv, start, pvecEnd, 0, script, UPROPS_SCRIPT_X_MASK, &errorCode);
|
||||
}
|
||||
// Write a new (Script, Script_Extensions) value if there are Script_Extensions
|
||||
// and either Script or Script_Extensions are new on the current line.
|
||||
|
@ -850,7 +845,7 @@ CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|||
errorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
return;
|
||||
}
|
||||
scriptX|=splitScriptCodeOrIndex(index);
|
||||
scriptX|=index;
|
||||
upvec_setValue(pv, start, pvecEnd, 0, scriptX, UPROPS_SCRIPT_X_MASK, &errorCode);
|
||||
}
|
||||
if(newValues.contains(UCHAR_IDENTIFIER_TYPE)) {
|
||||
|
@ -966,7 +961,7 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
|
|||
|
||||
indexes[UPROPS_MAX_VALUES_INDEX]=
|
||||
(((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
|
||||
(int32_t)splitScriptCodeOrIndex(USCRIPT_CODE_LIMIT-1);
|
||||
((int32_t)USCRIPT_CODE_LIMIT-1);
|
||||
indexes[UPROPS_MAX_VALUES_2_INDEX]=
|
||||
(((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
|
||||
(((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
|
||||
|
|
Loading…
Add table
Reference in a new issue