ICU-22785 uprops.icu: coalesce scx+sc bits

This commit is contained in:
Markus Scherer 2024-06-04 13:44:10 -07:00
parent 0d8a3ccd11
commit c439dcdf27
9 changed files with 585 additions and 653 deletions

View file

@ -543,7 +543,7 @@ uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
return USCRIPT_INVALID_CODE;
}
uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
uint32_t codeOrIndex=uprops_mergeScriptCodeOrIndex(scriptX);
uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT;
if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
return (UScriptCode)codeOrIndex;
} else if(scriptX<UPROPS_SCRIPT_X_WITH_INHERITED) {
@ -558,7 +558,7 @@ uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
U_CAPI UBool U_EXPORT2
uscript_hasScript(UChar32 c, UScriptCode sc) UPRV_NO_SANITIZE_UNDEFINED {
uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
uint32_t codeOrIndex=uprops_mergeScriptCodeOrIndex(scriptX);
uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT;
if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
return sc==(UScriptCode)codeOrIndex;
}
@ -590,7 +590,7 @@ uscript_getScriptExtensions(UChar32 c,
return 0;
}
uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
uint32_t codeOrIndex=uprops_mergeScriptCodeOrIndex(scriptX);
uint32_t codeOrIndex=scriptX&UPROPS_MAX_SCRIPT;
if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
if(capacity==0) {
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;

File diff suppressed because it is too large Load diff

View file

@ -585,8 +585,7 @@ static int32_t getScript(const IntProperty &/*prop*/, UChar32 c, UProperty /*whi
}
// Max-value getter for the Script int property (per its name); both parameters are unused.
// The result is derived from max-values word 0, i.e. uprv_getMaxValues(0).
// NOTE(review): two alternative bodies appear below, and as written the second
// return statement is unreachable. This looks like the old and the new
// implementation listed together - confirm which one applies.
static int32_t scriptGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) {
// Variant 1: keep the Script/Script_Extensions bits, then reassemble the value
// with uprops_mergeScriptCodeOrIndex().
uint32_t scriptX=uprv_getMaxValues(0)&UPROPS_SCRIPT_X_MASK;
return uprops_mergeScriptCodeOrIndex(scriptX);
// Variant 2: mask the word directly with UPROPS_MAX_SCRIPT.
return uprv_getMaxValues(0)&UPROPS_MAX_SCRIPT;
}
/*

View file

@ -119,66 +119,40 @@ enum {
/* number of properties vector words */
#define UPROPS_VECTOR_WORDS 3
// TODO: merge scx+Script bit sets together
/*
* Properties in vector word 0
* Bits
* 31..26 Age major version (0..63)
* 25..24 Age minor version (0..3)
* 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index
* 3: Script value from Script_Extensions
* 2: Script=Inherited
* 1: Script=Common
* 0: Script=bits 21..20 & 7..0
* 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions
* 19..17 East Asian Width
* 16.. 8 reserved since format version 9; was UBlockCode
* 7.. 0 UScriptCode, or index to Script_Extensions
*/
#define UPROPS_AGE_MASK 0xff000000
#define UPROPS_AGE_SHIFT 24
/* Script_Extensions: mask includes Script */
#define UPROPS_SCRIPT_X_MASK 0x00f000ff
#define UPROPS_SCRIPT_X_SHIFT 22
// The UScriptCode or Script_Extensions index is split across two bit fields.
// (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.)
// Shift the high bits right by 12 to assemble the full value.
#define UPROPS_SCRIPT_HIGH_MASK 0x00300000
#define UPROPS_SCRIPT_HIGH_SHIFT 12
#define UPROPS_MAX_SCRIPT 0x3ff
#define UPROPS_EA_MASK 0x000e0000
#define UPROPS_EA_SHIFT 17
// #define UPROPS_BLOCK_MASK 0x0001ff00
// #define UPROPS_BLOCK_SHIFT 8
#define UPROPS_SCRIPT_LOW_MASK 0x000000ff
/* UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */
#define UPROPS_SCRIPT_X_WITH_COMMON 0x400000
#define UPROPS_SCRIPT_X_WITH_INHERITED 0x800000
#define UPROPS_SCRIPT_X_WITH_OTHER 0xc00000
#ifdef __cplusplus
namespace {
// Properties in vector word 0
// Bits
// 31..26 Age major version (major=0..63)
// 25..24 Age minor version (minor=0..3)
// 23..15 reserved
// 14..12 East Asian Width
// 11..10 3..1: Bits 9..0 = Script_Extensions index
// 3: Script value from Script_Extensions
// 2: Script=Inherited
// 1: Script=Common
// 0: Script=bits 9..0
// 9.. 0 UScriptCode, or index to Script_Extensions
inline constexpr uint32_t UPROPS_AGE_MASK = 0xff000000;
inline constexpr int32_t UPROPS_AGE_SHIFT = 24;
inline constexpr uint8_t UPROPS_AGE_MAJOR_MAX = 63;
inline constexpr uint8_t UPROPS_AGE_MINOR_MAX = 3;
// Reassembles a single value from the two bit fields of scriptX:
// the bits selected by UPROPS_SCRIPT_HIGH_MASK are shifted right by
// UPROPS_SCRIPT_HIGH_SHIFT and OR-ed with the bits selected by
// UPROPS_SCRIPT_LOW_MASK.
inline uint32_t uprops_mergeScriptCodeOrIndex(uint32_t scriptX) {
    const uint32_t highPart = (scriptX & UPROPS_SCRIPT_HIGH_MASK) >> UPROPS_SCRIPT_HIGH_SHIFT;
    const uint32_t lowPart = scriptX & UPROPS_SCRIPT_LOW_MASK;
    return highPart | lowPart;
}
inline constexpr uint32_t UPROPS_EA_MASK = 0x00007000;
inline constexpr int32_t UPROPS_EA_SHIFT = 12;
} // namespace
/** Script_Extensions: mask includes Script */
inline constexpr uint32_t UPROPS_SCRIPT_X_MASK = 0x00000fff;
#endif // __cplusplus
// UPROPS_SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions.
inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_OTHER = 0xc00;
inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_INHERITED = 0x800;
inline constexpr uint32_t UPROPS_SCRIPT_X_WITH_COMMON = 0x400;
inline constexpr int32_t UPROPS_MAX_SCRIPT = 0x3ff;
/*
* Properties in vector word 1
@ -240,10 +214,6 @@ enum {
* 4.. 0 Decomposition Type
*/
#ifdef __cplusplus
namespace {
// https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type
// The Identifier_Type maps each code point to a *set* of one or more values.
// Some can be combined with others, some can only occur alone.

Binary file not shown.

View file

@ -803,8 +803,7 @@ public final class UCharacterProperty
}
// Max-value getter of this IntProperty; the `which` argument is not used.
// The result is derived from max-values word 0, i.e. getMaxValues(0).
// NOTE(review): two alternative bodies appear below, and as written the second
// return statement is unreachable. This looks like the old and the new
// implementation listed together - confirm which one applies.
@Override
int getMaxValue(int which) {
// Variant 1: keep the SCRIPT_X_MASK bits, then reassemble the value
// with mergeScriptCodeOrIndex().
int scriptX=getMaxValues(0)&SCRIPT_X_MASK;
return mergeScriptCodeOrIndex(scriptX);
// Variant 2: mask the word directly with MAX_SCRIPT.
return getMaxValues(0)&MAX_SCRIPT;
}
},
new IntProperty(SRC_PROPSVEC) { // HANGUL_SYLLABLE_TYPE
@ -1365,62 +1364,31 @@ public final class UCharacterProperty
NumericType.NUMERIC;
}
/*
* Properties in vector word 0
* Bits
* 31..26 Age major version (0..63)
* 25..24 Age minor version (0..3)
* 23..22 3..1: Bits 21..20 & 7..0 = Script_Extensions index
* 3: Script value from Script_Extensions
* 2: Script=Inherited
* 1: Script=Common
* 0: Script=bits 21..20 & 7..0
* 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions
* 19..17 East Asian Width
* 16.. 8 reserved since format version 9; was UBlockCode
* 7.. 0 UScriptCode, or index to Script_Extensions
*/
// Properties in vector word 0
// Bits
// 31..26 Age major version (major=0..63)
// 25..24 Age minor version (minor=0..3)
// 23..15 reserved
// 14..12 East Asian Width
// 11..10 3..1: Bits 9..0 = Script_Extensions index
// 3: Script value from Script_Extensions
// 2: Script=Inherited
// 1: Script=Common
// 0: Script=bits 9..0
// 9.. 0 UScriptCode, or index to Script_Extensions
/**
* Script_Extensions: mask includes Script
*/
public static final int SCRIPT_X_MASK = 0x00f000ff;
//private static final int SCRIPT_X_SHIFT = 22;
private static final int EAST_ASIAN_MASK_ = 0x00007000;
private static final int EAST_ASIAN_SHIFT_ = 12;
// The UScriptCode or Script_Extensions index is split across two bit fields.
// (Starting with Unicode 13/ICU 66/2019 due to more varied Script_Extensions.)
// Shift the high bits right by 12 to assemble the full value.
public static final int SCRIPT_HIGH_MASK = 0x00300000;
public static final int SCRIPT_HIGH_SHIFT = 12;
/** Script_Extensions: mask includes Script */
public static final int SCRIPT_X_MASK = 0x00000fff;
// SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions.
public static final int SCRIPT_X_WITH_OTHER = 0xc00;
public static final int SCRIPT_X_WITH_INHERITED = 0x800;
public static final int SCRIPT_X_WITH_COMMON = 0x400;
public static final int MAX_SCRIPT = 0x3ff;
/**
* Integer properties mask and shift values for East Asian cell width.
* Equivalent to icu4c UPROPS_EA_MASK
*/
private static final int EAST_ASIAN_MASK_ = 0x000e0000;
/**
* Integer properties mask and shift values for East Asian cell width.
* Equivalent to icu4c UPROPS_EA_SHIFT
*/
private static final int EAST_ASIAN_SHIFT_ = 17;
/**
* Integer properties mask and shift values for scripts.
* Equivalent to icu4c UPROPS_SCRIPT_LOW_MASK.
*/
public static final int SCRIPT_LOW_MASK = 0x000000ff;
/* SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */
public static final int SCRIPT_X_WITH_COMMON = 0x400000;
public static final int SCRIPT_X_WITH_INHERITED = 0x800000;
public static final int SCRIPT_X_WITH_OTHER = 0xc00000;
/**
 * Reassembles a single value from the two bit fields of scriptX:
 * the bits selected by SCRIPT_HIGH_MASK are shifted right by SCRIPT_HIGH_SHIFT
 * and OR-ed with the bits selected by SCRIPT_LOW_MASK.
 *
 * @param scriptX the masked Script/Script_Extensions bits of a properties word
 * @return the combined high and low parts
 */
public static final int mergeScriptCodeOrIndex(int scriptX) {
    final int highPart = (scriptX & SCRIPT_HIGH_MASK) >> SCRIPT_HIGH_SHIFT;
    final int lowPart = scriptX & SCRIPT_LOW_MASK;
    return highPart | lowPart;
}
/**
* Additional properties used in internal trie data
*/

View file

@ -1227,7 +1227,7 @@ public final class UScript {
public static final int getScript(int codepoint){
if (codepoint >= UCharacter.MIN_VALUE & codepoint <= UCharacter.MAX_VALUE) {
int scriptX=UCharacterProperty.INSTANCE.getAdditional(codepoint, 0)&UCharacterProperty.SCRIPT_X_MASK;
int codeOrIndex=UCharacterProperty.mergeScriptCodeOrIndex(scriptX);
int codeOrIndex=scriptX&UCharacterProperty.MAX_SCRIPT;
if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
return codeOrIndex;
} else if(scriptX<UCharacterProperty.SCRIPT_X_WITH_INHERITED) {
@ -1257,7 +1257,7 @@ public final class UScript {
*/
public static final boolean hasScript(int c, int sc) {
int scriptX=UCharacterProperty.INSTANCE.getAdditional(c, 0)&UCharacterProperty.SCRIPT_X_MASK;
int codeOrIndex=UCharacterProperty.mergeScriptCodeOrIndex(scriptX);
int codeOrIndex=scriptX&UCharacterProperty.MAX_SCRIPT;
if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
return sc==codeOrIndex;
}
@ -1307,7 +1307,7 @@ public final class UScript {
public static final int getScriptExtensions(int c, BitSet set) {
set.clear();
int scriptX=UCharacterProperty.INSTANCE.getAdditional(c, 0)&UCharacterProperty.SCRIPT_X_MASK;
int codeOrIndex=UCharacterProperty.mergeScriptCodeOrIndex(scriptX);
int codeOrIndex=scriptX&UCharacterProperty.MAX_SCRIPT;
if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
set.set(codeOrIndex);
return codeOrIndex;

View file

@ -310,6 +310,8 @@ Block data moved from props vector 0 into its own new CodePointTrie.
Reserve 10 bits in the new indexes[UPROPS_MAX_VALUES_OTHER_INDEX] for the max Block value,
although the trie can hold 16-bit values.
Props vector 0 bits shuffled so that script and script extensions bits are contiguous.
----------------------------------------------------------------------------- */
U_NAMESPACE_USE
@ -331,12 +333,6 @@ UDataInfo dataInfo={
{ 16, 0, 0, 0 } /* dataVersion */
};
// Spreads v across two bit fields: v shifted left by UPROPS_SCRIPT_HIGH_SHIFT
// and limited to UPROPS_SCRIPT_HIGH_MASK supplies the high part, and
// v limited to UPROPS_SCRIPT_LOW_MASK supplies the low part.
inline uint32_t splitScriptCodeOrIndex(uint32_t v) {
    const uint32_t highField = (v << UPROPS_SCRIPT_HIGH_SHIFT) & UPROPS_SCRIPT_HIGH_MASK;
    const uint32_t lowField = v & UPROPS_SCRIPT_LOW_MASK;
    return highField | lowField;
}
class CorePropsBuilder : public PropsBuilder {
public:
CorePropsBuilder(UErrorCode &errorCode);
@ -800,11 +796,10 @@ CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
!newValues.contains(UCHAR_SCRIPT);
if(newValues.contains(UCHAR_SCRIPT) || revertToScript) {
int32_t script=props.getIntProp(UCHAR_SCRIPT);
uint32_t value=splitScriptCodeOrIndex(script);
// Use UPROPS_SCRIPT_X_MASK:
// When writing a Script code, remove Script_Extensions bits as well.
// If needed, they will get written again.
upvec_setValue(pv, start, pvecEnd, 0, value, UPROPS_SCRIPT_X_MASK, &errorCode);
upvec_setValue(pv, start, pvecEnd, 0, script, UPROPS_SCRIPT_X_MASK, &errorCode);
}
// Write a new (Script, Script_Extensions) value if there are Script_Extensions
// and either Script or Script_Extensions are new on the current line.
@ -850,7 +845,7 @@ CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
errorCode=U_BUFFER_OVERFLOW_ERROR;
return;
}
scriptX|=splitScriptCodeOrIndex(index);
scriptX|=index;
upvec_setValue(pv, start, pvecEnd, 0, scriptX, UPROPS_SCRIPT_X_MASK, &errorCode);
}
if(newValues.contains(UCHAR_IDENTIFIER_TYPE)) {
@ -966,7 +961,7 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
indexes[UPROPS_MAX_VALUES_INDEX]=
(((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
(int32_t)splitScriptCodeOrIndex(USCRIPT_CODE_LIMIT-1);
((int32_t)USCRIPT_CODE_LIMIT-1);
indexes[UPROPS_MAX_VALUES_2_INDEX]=
(((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
(((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|