mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-13630 ucase.icu formatVersion 4: more compressible exceptions, and more room for future exceptions growth
X-SVN-Rev: 41093
This commit is contained in:
parent
1752b5c8c9
commit
b3aec18a3c
10 changed files with 854 additions and 710 deletions
|
@ -138,6 +138,11 @@ ucase_tolower(UChar32 c) {
|
|||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
uint16_t excWord=*pe++;
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
|
||||
int32_t delta;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
|
||||
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
|
||||
}
|
||||
|
@ -155,6 +160,11 @@ ucase_toupper(UChar32 c) {
|
|||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
uint16_t excWord=*pe++;
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
int32_t delta;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
|
||||
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
|
||||
}
|
||||
|
@ -172,6 +182,11 @@ ucase_totitle(UChar32 c) {
|
|||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
uint16_t excWord=*pe++;
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
int32_t delta;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
|
||||
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
int32_t idx;
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
|
||||
idx=UCASE_EXC_TITLE;
|
||||
|
@ -254,6 +269,11 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
|
|||
sa->add(sa->set, c);
|
||||
}
|
||||
}
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
|
||||
int32_t delta;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
|
||||
sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
|
||||
}
|
||||
|
||||
/* get the closure string pointer & length */
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
|
||||
|
@ -590,7 +610,12 @@ ucase_isSoftDotted(UChar32 c) {
|
|||
U_CAPI UBool U_EXPORT2
|
||||
ucase_isCaseSensitive(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
return (UBool)((props&UCASE_SENSITIVE)!=0);
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
return (UBool)((props&UCASE_SENSITIVE)!=0);
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
|
||||
}
|
||||
}
|
||||
|
||||
/* string casing ------------------------------------------------------------ */
|
||||
|
@ -1140,6 +1165,11 @@ ucase_toFullLower(UChar32 c,
|
|||
}
|
||||
}
|
||||
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
|
||||
int32_t delta;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
|
||||
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
|
||||
}
|
||||
|
@ -1229,6 +1259,11 @@ toUpperOrTitle(UChar32 c,
|
|||
}
|
||||
}
|
||||
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
int32_t delta;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
|
||||
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
|
||||
idx=UCASE_EXC_TITLE;
|
||||
} else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
|
||||
|
@ -1334,6 +1369,14 @@ ucase_fold(UChar32 c, uint32_t options) {
|
|||
}
|
||||
}
|
||||
}
|
||||
if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
|
||||
return c;
|
||||
}
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
|
||||
int32_t delta;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
|
||||
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
|
||||
idx=UCASE_EXC_FOLD;
|
||||
} else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
|
||||
|
@ -1421,6 +1464,14 @@ ucase_toFullFolding(UChar32 c,
|
|||
}
|
||||
}
|
||||
|
||||
if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
|
||||
return ~c;
|
||||
}
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
|
||||
int32_t delta;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
|
||||
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
|
||||
idx=UCASE_EXC_FOLD;
|
||||
} else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
|
||||
|
|
|
@ -354,8 +354,8 @@ enum {
|
|||
#define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2)
|
||||
|
||||
#define UCASE_IGNORABLE 4
|
||||
#define UCASE_SENSITIVE 8
|
||||
#define UCASE_EXCEPTION 0x10
|
||||
#define UCASE_EXCEPTION 8
|
||||
#define UCASE_SENSITIVE 0x10
|
||||
|
||||
#define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
|
||||
|
||||
|
@ -379,9 +379,9 @@ enum {
|
|||
# define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT))
|
||||
#endif
|
||||
|
||||
/* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
|
||||
#define UCASE_EXC_SHIFT 5
|
||||
#define UCASE_EXC_MASK 0xffe0
|
||||
/* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
|
||||
#define UCASE_EXC_SHIFT 4
|
||||
#define UCASE_EXC_MASK 0xfff0
|
||||
#define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1)
|
||||
|
||||
/* definitions for 16-bit main exceptions word ------------------------------ */
|
||||
|
@ -392,7 +392,7 @@ enum {
|
|||
UCASE_EXC_FOLD,
|
||||
UCASE_EXC_UPPER,
|
||||
UCASE_EXC_TITLE,
|
||||
UCASE_EXC_4, /* reserved */
|
||||
UCASE_EXC_DELTA,
|
||||
UCASE_EXC_5, /* reserved */
|
||||
UCASE_EXC_CLOSURE,
|
||||
UCASE_EXC_FULL_MAPPINGS,
|
||||
|
@ -402,7 +402,11 @@ enum {
|
|||
/* each slot is 2 uint16_t instead of 1 */
|
||||
#define UCASE_EXC_DOUBLE_SLOTS 0x100
|
||||
|
||||
/* reserved: exception bits 11..9 */
|
||||
enum {
|
||||
UCASE_EXC_NO_SIMPLE_CASE_FOLDING=0x200,
|
||||
UCASE_EXC_DELTA_IS_NEGATIVE=0x400,
|
||||
UCASE_EXC_SENSITIVE=0x800
|
||||
};
|
||||
|
||||
/* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT */
|
||||
#define UCASE_EXC_DOT_SHIFT 7
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -43,7 +43,7 @@ $E_Modifier = [\p{Grapheme_Cluster_Break = EM}];
|
|||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
$E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}];
|
||||
$E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}\uFDD0];
|
||||
$EmojiNRK = [[\p{Emoji}] - [\p{Grapheme_Cluster_Break = Regional_Indicator}*\u00230-9©®™〰〽]];
|
||||
|
||||
## -------------------------------------------------
|
||||
|
|
|
@ -51,7 +51,7 @@ $E_Modifier = [\p{Word_Break = EM}];
|
|||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
$EBG = [\p{Word_Break = EBG}];
|
||||
$EBG = [\p{Word_Break = EBG}\uFDD0];
|
||||
$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
|
||||
|
||||
$Han = [:Han:];
|
||||
|
|
|
@ -51,7 +51,7 @@ $E_Modifier = [\p{Word_Break = EM}];
|
|||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
$EBG = [\p{Word_Break = EBG}];
|
||||
$EBG = [\p{Word_Break = EBG}\uFDD0];
|
||||
$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
|
||||
|
||||
$Han = [:Han:];
|
||||
|
|
Binary file not shown.
|
@ -240,6 +240,21 @@ set(ICU4C_SRC_DIR /usr/local/google/home/mscherer/svn.icu/uni/src/icu4c)
|
|||
genuca/genuca --hanOrder radical-stroke $ICU_SRC/icu4c
|
||||
- rebuild ICU (make install) & tools
|
||||
|
||||
* Fix case props
|
||||
genprops error: casepropsbuilder: too many exceptions words
|
||||
genprops error: failure finalizing the data - U_BUFFER_OVERFLOW_ERROR
|
||||
- With the addition of Georgian Mtavruli capital letters,
|
||||
there are now too many simple case mappings with big mapping deltas
|
||||
that yield uncompressible exceptions.
|
||||
- Changing the data structure (now formatVersion 4),
|
||||
adding one bit for no-simple-case-folding (for Cherokee), and
|
||||
one optional slot for a big delta (for most faraway mappings),
|
||||
together with another bit for whether that is negative.
|
||||
This makes most Cherokee & Georgian etc. case mappings compressible,
|
||||
reducing the number of exceptions words.
|
||||
- Further changes to gain one more bit for the exceptions index,
|
||||
for future growth. For details, see casepropsbuilder.cpp.
|
||||
|
||||
* update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to
|
||||
sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar)
|
||||
- grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters
|
||||
|
@ -249,6 +264,13 @@ set(ICU4C_SRC_DIR /usr/local/google/home/mscherer/svn.icu/uni/src/icu4c)
|
|||
* run & fix ICU4C tests
|
||||
- Andy handles RBBI & spoof check test failures
|
||||
|
||||
TODO:
|
||||
- Errors in char.txt, word.txt, word_POSIX.txt like
|
||||
createRuleBasedBreakIterator: ICU Error "U_BRK_RULE_EMPTY_SET" at line 46, column 16
|
||||
because \p{Grapheme_Cluster_Break = EBG} and \p{Word_Break = EBG} are empty.
|
||||
-> Temporary(!) workaround: Add an arbitrary code point to these sets to make them
|
||||
not empty, just to get ICU building.
|
||||
|
||||
* collation: CLDR collation root, UCA DUCET
|
||||
|
||||
- UCA DUCET goes into Mark's Unicode tools, see
|
||||
|
|
|
@ -193,6 +193,10 @@ public final class UCaseProps {
|
|||
} else {
|
||||
int excOffset=getExceptionsOffset(props);
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) {
|
||||
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
|
||||
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(hasSlot(excWord, EXC_LOWER)) {
|
||||
c=getSlotValue(excWord, EXC_LOWER, excOffset);
|
||||
}
|
||||
|
@ -209,6 +213,10 @@ public final class UCaseProps {
|
|||
} else {
|
||||
int excOffset=getExceptionsOffset(props);
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
if(hasSlot(excWord, EXC_DELTA) && getTypeFromProps(props)==LOWER) {
|
||||
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
|
||||
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(hasSlot(excWord, EXC_UPPER)) {
|
||||
c=getSlotValue(excWord, EXC_UPPER, excOffset);
|
||||
}
|
||||
|
@ -225,6 +233,10 @@ public final class UCaseProps {
|
|||
} else {
|
||||
int excOffset=getExceptionsOffset(props);
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
if(hasSlot(excWord, EXC_DELTA) && getTypeFromProps(props)==LOWER) {
|
||||
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
|
||||
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
int index;
|
||||
if(hasSlot(excWord, EXC_TITLE)) {
|
||||
index=EXC_TITLE;
|
||||
|
@ -305,6 +317,10 @@ public final class UCaseProps {
|
|||
set.add(c);
|
||||
}
|
||||
}
|
||||
if(hasSlot(excWord, EXC_DELTA)) {
|
||||
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
|
||||
set.add((excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
|
||||
}
|
||||
|
||||
/* get the closure string pointer & length */
|
||||
if(hasSlot(excWord, EXC_CLOSURE)) {
|
||||
|
@ -479,7 +495,12 @@ public final class UCaseProps {
|
|||
}
|
||||
|
||||
public final boolean isCaseSensitive(int c) {
|
||||
return (trie.get(c)&SENSITIVE)!=0;
|
||||
int props=trie.get(c);
|
||||
if(!propsHasException(props)) {
|
||||
return (props&SENSITIVE)!=0;
|
||||
} else {
|
||||
return (exceptions.charAt(getExceptionsOffset(props))&EXC_SENSITIVE)!=0;
|
||||
}
|
||||
}
|
||||
|
||||
// string casing ------------------------------------------------------- ***
|
||||
|
@ -1109,6 +1130,10 @@ public final class UCaseProps {
|
|||
}
|
||||
}
|
||||
|
||||
if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) {
|
||||
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
|
||||
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(hasSlot(excWord, EXC_LOWER)) {
|
||||
result=getSlotValue(excWord, EXC_LOWER, excOffset2);
|
||||
}
|
||||
|
@ -1201,6 +1226,10 @@ public final class UCaseProps {
|
|||
}
|
||||
}
|
||||
|
||||
if(hasSlot(excWord, EXC_DELTA) && getTypeFromProps(props)==LOWER) {
|
||||
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
|
||||
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {
|
||||
index=EXC_TITLE;
|
||||
} else if(hasSlot(excWord, EXC_UPPER)) {
|
||||
|
@ -1314,6 +1343,13 @@ public final class UCaseProps {
|
|||
}
|
||||
}
|
||||
}
|
||||
if((excWord&EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
|
||||
return c;
|
||||
}
|
||||
if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) {
|
||||
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
|
||||
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(hasSlot(excWord, EXC_FOLD)) {
|
||||
index=EXC_FOLD;
|
||||
} else if(hasSlot(excWord, EXC_LOWER)) {
|
||||
|
@ -1408,6 +1444,13 @@ public final class UCaseProps {
|
|||
}
|
||||
}
|
||||
|
||||
if((excWord&EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
|
||||
return ~c;
|
||||
}
|
||||
if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) {
|
||||
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
|
||||
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
|
||||
}
|
||||
if(hasSlot(excWord, EXC_FOLD)) {
|
||||
index=EXC_FOLD;
|
||||
} else if(hasSlot(excWord, EXC_LOWER)) {
|
||||
|
@ -1534,8 +1577,8 @@ public final class UCaseProps {
|
|||
}
|
||||
|
||||
static final int IGNORABLE=4;
|
||||
private static final int SENSITIVE= 8;
|
||||
private static final int EXCEPTION= 0x10;
|
||||
private static final int EXCEPTION= 8;
|
||||
private static final int SENSITIVE= 0x10;
|
||||
|
||||
private static final int DOT_MASK= 0x60;
|
||||
//private static final int NO_DOT= 0; /* normal characters with cc=0 */
|
||||
|
@ -1553,9 +1596,9 @@ public final class UCaseProps {
|
|||
return (short)props>>DELTA_SHIFT;
|
||||
}
|
||||
|
||||
/* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
|
||||
private static final int EXC_SHIFT= 5;
|
||||
//private static final int EXC_MASK= 0xffe0;
|
||||
/* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
|
||||
private static final int EXC_SHIFT= 4;
|
||||
//private static final int EXC_MASK= 0xfff0;
|
||||
//private static final int MAX_EXCEPTIONS=((EXC_MASK>>EXC_SHIFT)+1);
|
||||
|
||||
/* definitions for 16-bit main exceptions word ------------------------------ */
|
||||
|
@ -1565,7 +1608,7 @@ public final class UCaseProps {
|
|||
private static final int EXC_FOLD=1;
|
||||
private static final int EXC_UPPER=2;
|
||||
private static final int EXC_TITLE=3;
|
||||
//private static final int EXC_4=4; /* reserved */
|
||||
private static final int EXC_DELTA=4;
|
||||
//private static final int EXC_5=5; /* reserved */
|
||||
private static final int EXC_CLOSURE=6;
|
||||
private static final int EXC_FULL_MAPPINGS=7;
|
||||
|
@ -1574,7 +1617,9 @@ public final class UCaseProps {
|
|||
/* each slot is 2 uint16_t instead of 1 */
|
||||
private static final int EXC_DOUBLE_SLOTS= 0x100;
|
||||
|
||||
/* reserved: exception bits 11..9 */
|
||||
private static final int EXC_NO_SIMPLE_CASE_FOLDING=0x200;
|
||||
private static final int EXC_DELTA_IS_NEGATIVE=0x400;
|
||||
private static final int EXC_SENSITIVE=0x800;
|
||||
|
||||
/* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */
|
||||
private static final int EXC_DOT_SHIFT=7;
|
||||
|
|
|
@ -49,7 +49,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
|
|||
precedes the actual data. It contains platform properties values and the
|
||||
file format version.
|
||||
|
||||
The following is a description of format version 3.0 .
|
||||
The following is a description of format version 4.0 .
|
||||
|
||||
Format version 1.1 adds data for case closure.
|
||||
|
||||
|
@ -63,6 +63,16 @@ It moves the Case_Ignorable flag from sometimes-trie-bit 6, sometimes-exception-
|
|||
to always-trie-bit 2 and adjusts the higher trie bits accordingly.
|
||||
Exception index reduced from 12 bits to 11, simple case mapping delta reduced from 10 bits to 9.
|
||||
|
||||
Format version 4.0 (ICU 62) swaps trie data bits 3 and 4, exception vs. case-sensitive,
|
||||
and when exception=1 then data bits 15..4 (not 15..5) are used for the exception index,
|
||||
and the case-sensitive bit is moved into the excWord. This will allow for more exceptions words.
|
||||
Also, an additional optional exception slot is used for a 16-bit delta,
|
||||
with one more excWord bit if the delta is actually negative,
|
||||
for a reasonably compact, and compressible, encoding of simple case mappings
|
||||
between distant blocks for Cherokee, Georgian, and similar.
|
||||
Another excWord bit is used to indicate that the character has no simple case folding,
|
||||
even if it has a simple lowercase mapping.
|
||||
|
||||
The file contains the following structures:
|
||||
|
||||
const int32_t indexes[i0] with values i0, i1, ...:
|
||||
|
@ -89,7 +99,7 @@ The file contains the following structures:
|
|||
Trie data word:
|
||||
Bits
|
||||
if(exception) {
|
||||
15..5 unsigned exception index
|
||||
15..4 unsigned exception index
|
||||
} else {
|
||||
if(not uncased) {
|
||||
15..7 signed delta to simple case mapping code point
|
||||
|
@ -103,8 +113,8 @@ if(exception) {
|
|||
3 other cc
|
||||
The runtime code relies on these two bits to be adjacent with this encoding.
|
||||
}
|
||||
4 exception
|
||||
3 case-sensitive
|
||||
4 case-sensitive
|
||||
3 exception
|
||||
2 case-ignorable
|
||||
1..0 0 uncased
|
||||
1 lowercase
|
||||
|
@ -132,10 +142,9 @@ Bits
|
|||
1 soft-dotted character
|
||||
2 cc=230
|
||||
3 other cc
|
||||
11 reserved
|
||||
(was used in formatVersion 1.2..2.0:
|
||||
case-ignorable (used when the character is cased or has another exception))
|
||||
10.. 9 reserved
|
||||
11 same as non-exception case-sensitive bit
|
||||
10 the delta in the optional value slot is negative
|
||||
9 no simple case folding, even if there is a simple lowercase mapping
|
||||
8 if set, then for each optional-value slot there are 2 uint16_t values
|
||||
(high and low parts of 32-bit values)
|
||||
instead of single ones
|
||||
|
@ -146,7 +155,8 @@ Optional-value slots:
|
|||
1 case folding (code point)
|
||||
2 uppercase mapping (code point)
|
||||
3 titlecase mapping (code point)
|
||||
4 reserved
|
||||
4 delta to simple case mapping code point
|
||||
(add delta to input code point, or subtract if excWord bit 10 is set)
|
||||
5 reserved
|
||||
6 closure mappings (new in format version 1.1)
|
||||
7 there is at least one full (string) case mapping
|
||||
|
@ -214,8 +224,8 @@ static UDataInfo dataInfo={
|
|||
|
||||
/* dataFormat="cAsE" */
|
||||
{ UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
|
||||
{ 3, 0, 0, 0 }, /* formatVersion */
|
||||
{ 6, 0, 0, 0 } /* dataVersion */
|
||||
{ 4, 0, 0, 0 }, /* formatVersion */
|
||||
{ 11, 0, 0, 0 } /* dataVersion */
|
||||
};
|
||||
|
||||
#define UGENCASE_EXC_SHIFT 20
|
||||
|
@ -226,16 +236,20 @@ enum {
|
|||
};
|
||||
|
||||
struct ExcProps {
|
||||
ExcProps()
|
||||
: hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE) {}
|
||||
ExcProps(const UniProps &otherProps)
|
||||
: props(otherProps),
|
||||
hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE) {}
|
||||
ExcProps() :
|
||||
delta(0), hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE),
|
||||
hasNoSimpleCaseFolding(FALSE) {}
|
||||
ExcProps(const UniProps &otherProps) :
|
||||
props(otherProps),
|
||||
delta(0), hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE),
|
||||
hasNoSimpleCaseFolding(FALSE) {}
|
||||
|
||||
UniProps props;
|
||||
UnicodeSet closure;
|
||||
int32_t delta;
|
||||
UBool hasConditionalCaseMappings;
|
||||
UBool hasTurkicCaseFolding;
|
||||
UBool hasNoSimpleCaseFolding;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -385,6 +399,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|||
|
||||
/* default: map to self */
|
||||
int32_t delta=0;
|
||||
UBool noDelta=FALSE;
|
||||
|
||||
uint32_t type;
|
||||
if(props.binProps[UCHAR_LOWERCASE]) {
|
||||
|
@ -398,6 +413,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|||
}
|
||||
uint32_t value=type;
|
||||
|
||||
// Examine simple case mappings.
|
||||
UBool hasMapping=FALSE;
|
||||
if(props.suc>=0) {
|
||||
/* uppercase mapping as delta if the character is lowercase */
|
||||
|
@ -405,6 +421,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|||
if(type==UCASE_LOWER) {
|
||||
delta=props.suc-start;
|
||||
} else {
|
||||
noDelta=TRUE;
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
}
|
||||
|
@ -414,6 +431,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|||
if(type>=UCASE_UPPER) {
|
||||
delta=props.slc-start;
|
||||
} else {
|
||||
noDelta=TRUE;
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
}
|
||||
|
@ -421,40 +439,52 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|||
hasMapping=TRUE;
|
||||
}
|
||||
if(props.suc!=props.stc) {
|
||||
noDelta=TRUE;
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
|
||||
// Simple case folding falls back to simple lowercasing.
|
||||
// If they differ, then store them separately.
|
||||
UChar32 scf=props.scf;
|
||||
if(scf>=0 && scf!=props.slc) {
|
||||
hasMapping=noDelta=TRUE;
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
|
||||
// If there is no case folding but there is a lowercase mapping,
|
||||
// then set a bit for that.
|
||||
// For example: Cherokee uppercase syllables since Unicode 8.
|
||||
// (Full case folding falls back to simple case folding,
|
||||
// not to full lowercasing, so we need not also handle it specially
|
||||
// for such cases.)
|
||||
UBool hasNoSimpleCaseFolding=FALSE;
|
||||
if(scf<0 && props.slc>=0) {
|
||||
hasNoSimpleCaseFolding=TRUE;
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
|
||||
if(noDelta) {
|
||||
delta=0;
|
||||
} else if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
|
||||
// The case mapping delta is too big for the main data word.
|
||||
// Store it in an exceptions slot.
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
|
||||
// Examine full case mappings.
|
||||
if(!props.lc.isEmpty() || !props.uc.isEmpty() || !props.tc.isEmpty() ||
|
||||
newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS)
|
||||
) {
|
||||
hasMapping=TRUE;
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
if( (props.scf>=0 && props.scf!=props.slc) ||
|
||||
(!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) ||
|
||||
if( (!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) ||
|
||||
newValues.contains(PPUCD_TURKIC_CASE_FOLDING)
|
||||
) {
|
||||
hasMapping=TRUE;
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
|
||||
// Simple case folding falls back to simple lowercasing.
|
||||
// If there is no case folding but there is a lowercase mapping,
|
||||
// then add a case folding mapping to the code point.
|
||||
// For example: Cherokee uppercase syllables since Unicode 8.
|
||||
// (Full case folding falls back to simple case folding,
|
||||
// not to full lowercasing, so we need not also handle it specially
|
||||
// for such cases.)
|
||||
UChar32 scf=props.scf;
|
||||
if(scf<0 && props.slc>=0) {
|
||||
scf=start;
|
||||
hasMapping=TRUE;
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
|
||||
if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
|
||||
if(props.binProps[UCHAR_SOFT_DOTTED]) {
|
||||
value|=UCASE_SOFT_DOTTED;
|
||||
}
|
||||
|
@ -502,8 +532,10 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|||
return;
|
||||
}
|
||||
newExcProps->props.scf=scf;
|
||||
newExcProps->delta=delta;
|
||||
newExcProps->hasConditionalCaseMappings=newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS);
|
||||
newExcProps->hasTurkicCaseFolding=newValues.contains(PPUCD_TURKIC_CASE_FOLDING);
|
||||
newExcProps->hasNoSimpleCaseFolding=hasNoSimpleCaseFolding;
|
||||
value|=(uint32_t)excPropsCount<<UGENCASE_EXC_SHIFT;
|
||||
excProps[excPropsCount++]=newExcProps;
|
||||
} else {
|
||||
|
@ -880,8 +912,8 @@ CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorC
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* copy and shift the soft-dotted bits */
|
||||
UChar excWord=(UChar)((value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT);
|
||||
/* copy and shift the soft-dotted and case-sensitive bits */
|
||||
UChar excWord=(UChar)((value&(UCASE_DOT_MASK|UCASE_SENSITIVE))<<UCASE_EXC_DOT_SHIFT);
|
||||
|
||||
UniProps &p=ep.props;
|
||||
|
||||
|
@ -896,6 +928,9 @@ CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorC
|
|||
excWord|=UCASE_EXC_CONDITIONAL_FOLD;
|
||||
p.cf.remove();
|
||||
}
|
||||
if(ep.hasNoSimpleCaseFolding) {
|
||||
excWord|=UCASE_EXC_NO_SIMPLE_CASE_FOLDING;
|
||||
}
|
||||
|
||||
/* remove redundant data */
|
||||
/* do not store full mappings if they are the same as the simple ones */
|
||||
|
@ -917,36 +952,48 @@ CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorC
|
|||
uint32_t slotBits=0;
|
||||
int32_t count=0;
|
||||
|
||||
if(p.slc>=0) {
|
||||
slots[count]=(uint32_t)p.slc;
|
||||
slotBits|=slots[count];
|
||||
++count;
|
||||
excWord|=U_MASK(UCASE_EXC_LOWER);
|
||||
}
|
||||
if( p.scf>=0 &&
|
||||
(p.slc>=0 ?
|
||||
p.scf!=p.slc :
|
||||
p.scf!=c)) {
|
||||
slots[count]=(uint32_t)p.scf;
|
||||
slotBits|=slots[count];
|
||||
++count;
|
||||
excWord|=U_MASK(UCASE_EXC_FOLD);
|
||||
}
|
||||
if(p.suc>=0) {
|
||||
slots[count]=(uint32_t)p.suc;
|
||||
slotBits|=slots[count];
|
||||
++count;
|
||||
excWord|=U_MASK(UCASE_EXC_UPPER);
|
||||
}
|
||||
if(p.suc!=p.stc) {
|
||||
if(p.stc>=0) {
|
||||
slots[count]=(uint32_t)p.stc;
|
||||
} else {
|
||||
slots[count]=(uint32_t)c;
|
||||
if(ep.delta!=0) {
|
||||
int32_t delta=ep.delta;
|
||||
if(delta<0) {
|
||||
excWord|=UCASE_EXC_DELTA_IS_NEGATIVE;
|
||||
delta=-delta;
|
||||
}
|
||||
slots[count]=(uint32_t)delta;
|
||||
slotBits|=slots[count];
|
||||
++count;
|
||||
excWord|=U_MASK(UCASE_EXC_TITLE);
|
||||
excWord|=U_MASK(UCASE_EXC_DELTA);
|
||||
} else {
|
||||
if(p.slc>=0) {
|
||||
slots[count]=(uint32_t)p.slc;
|
||||
slotBits|=slots[count];
|
||||
++count;
|
||||
excWord|=U_MASK(UCASE_EXC_LOWER);
|
||||
}
|
||||
if( p.scf>=0 &&
|
||||
(p.slc>=0 ?
|
||||
p.scf!=p.slc :
|
||||
p.scf!=c)) {
|
||||
slots[count]=(uint32_t)p.scf;
|
||||
slotBits|=slots[count];
|
||||
++count;
|
||||
excWord|=U_MASK(UCASE_EXC_FOLD);
|
||||
}
|
||||
if(p.suc>=0) {
|
||||
slots[count]=(uint32_t)p.suc;
|
||||
slotBits|=slots[count];
|
||||
++count;
|
||||
excWord|=U_MASK(UCASE_EXC_UPPER);
|
||||
}
|
||||
if(p.suc!=p.stc) {
|
||||
if(p.stc>=0) {
|
||||
slots[count]=(uint32_t)p.stc;
|
||||
} else {
|
||||
slots[count]=(uint32_t)c;
|
||||
}
|
||||
slotBits|=slots[count];
|
||||
++count;
|
||||
excWord|=U_MASK(UCASE_EXC_TITLE);
|
||||
}
|
||||
}
|
||||
|
||||
/* length of case closure */
|
||||
|
@ -994,33 +1041,43 @@ CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorC
|
|||
return excIndex;
|
||||
} else {
|
||||
/* write slots */
|
||||
int32_t excIndex=exceptions.length();
|
||||
exceptions.append((UChar)0); /* placeholder for excWord which will be stored at excIndex */
|
||||
UnicodeString excString;
|
||||
excString.append((UChar)0); /* placeholder for excWord which will be stored at excIndex */
|
||||
|
||||
if(slotBits<=0xffff) {
|
||||
for(int32_t i=0; i<count; ++i) {
|
||||
exceptions.append((UChar)slots[i]);
|
||||
excString.append((UChar)slots[i]);
|
||||
}
|
||||
} else {
|
||||
excWord|=UCASE_EXC_DOUBLE_SLOTS;
|
||||
for(int32_t i=0; i<count; ++i) {
|
||||
exceptions.append((UChar)(slots[i]>>16));
|
||||
exceptions.append((UChar)slots[i]);
|
||||
excString.append((UChar)(slots[i]>>16));
|
||||
excString.append((UChar)slots[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/* write the full case mapping strings */
|
||||
exceptions.append(p.lc);
|
||||
exceptions.append(p.cf);
|
||||
exceptions.append(p.uc);
|
||||
exceptions.append(p.tc);
|
||||
excString.append(p.lc);
|
||||
excString.append(p.cf);
|
||||
excString.append(p.uc);
|
||||
excString.append(p.tc);
|
||||
|
||||
/* write the closure data */
|
||||
exceptions.append(closureString);
|
||||
excString.append(closureString);
|
||||
|
||||
/* write the main exceptions word */
|
||||
exceptions.setCharAt(excIndex, (UChar)excWord);
|
||||
excString.setCharAt(0, (UChar)excWord);
|
||||
|
||||
// Try to share data.
|
||||
if(count==1 && ep.delta!=0) {
|
||||
int32_t excIndex=exceptions.indexOf(excString);
|
||||
if(excIndex>=0) {
|
||||
printf("share delta: U+%04lx %ld\n", (long)c, (long)ep.delta);
|
||||
return excIndex;
|
||||
}
|
||||
}
|
||||
int32_t excIndex=exceptions.length();
|
||||
exceptions.append(excString);
|
||||
return excIndex;
|
||||
}
|
||||
}
|
||||
|
@ -1065,7 +1122,6 @@ CasePropsBuilder::build(UErrorCode &errorCode) {
|
|||
}
|
||||
|
||||
makeCaseClosure(errorCode);
|
||||
makeExceptions(errorCode);
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
|
||||
/*
|
||||
|
@ -1090,6 +1146,9 @@ CasePropsBuilder::build(UErrorCode &errorCode) {
|
|||
return;
|
||||
}
|
||||
|
||||
makeExceptions(errorCode);
|
||||
if(U_FAILURE(errorCode)) { return; }
|
||||
|
||||
utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "genprops/case error: utrie2_freeze() failed: %s\n",
|
||||
|
|
Loading…
Add table
Reference in a new issue