ICU-13630 ucase.icu formatVersion 4: more compressible exceptions, and more room for future exceptions growth

X-SVN-Rev: 41093
This commit is contained in:
Markus Scherer 2018-03-12 00:15:40 +00:00
parent 1752b5c8c9
commit b3aec18a3c
10 changed files with 854 additions and 710 deletions

View file

@ -138,6 +138,11 @@ ucase_tolower(UChar32 c) {
} else {
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
uint16_t excWord=*pe++;
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
int32_t delta;
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
}
@ -155,6 +160,11 @@ ucase_toupper(UChar32 c) {
} else {
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
uint16_t excWord=*pe++;
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
int32_t delta;
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
}
@ -172,6 +182,11 @@ ucase_totitle(UChar32 c) {
} else {
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
uint16_t excWord=*pe++;
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
int32_t delta;
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
int32_t idx;
if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
idx=UCASE_EXC_TITLE;
@ -254,6 +269,11 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
sa->add(sa->set, c);
}
}
if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
int32_t delta;
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
}
/* get the closure string pointer & length */
if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
@ -590,7 +610,12 @@ ucase_isSoftDotted(UChar32 c) {
/**
 * Returns whether the case-sensitive flag is set for code point c.
 *
 * Without an exception, the flag is read from the trie data word
 * (UCASE_SENSITIVE). With an exception, the upper bits of the trie word hold
 * the exception index instead, so the flag is read from the first exceptions
 * word (UCASE_EXC_SENSITIVE).
 *
 * @param c the code point to look up
 * @return TRUE if the case-sensitive flag is set for c
 */
U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(UChar32 c) {
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        /* plain trie word: the flag is stored directly in props */
        return (UBool)((props&UCASE_SENSITIVE)!=0);
    } else {
        /* exception: do not test props here, its bit 4 is part of the index */
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
        return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
    }
}
/* string casing ------------------------------------------------------------ */
@ -1140,6 +1165,11 @@ ucase_toFullLower(UChar32 c,
}
}
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
int32_t delta;
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
}
@ -1229,6 +1259,11 @@ toUpperOrTitle(UChar32 c,
}
}
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
int32_t delta;
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
idx=UCASE_EXC_TITLE;
} else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
@ -1334,6 +1369,14 @@ ucase_fold(UChar32 c, uint32_t options) {
}
}
}
if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
return c;
}
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
int32_t delta;
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
idx=UCASE_EXC_FOLD;
} else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
@ -1421,6 +1464,14 @@ ucase_toFullFolding(UChar32 c,
}
}
if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
return ~c;
}
if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
int32_t delta;
GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
idx=UCASE_EXC_FOLD;
} else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {

View file

@ -354,8 +354,8 @@ enum {
/* type bit 1 is set for both uppercase and titlecase */
#define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2)
#define UCASE_IGNORABLE 4
/*
 * formatVersion 4: the exception flag is bit 3 and the case-sensitive flag is
 * bit 4 (swapped relative to formatVersion 3), so that an exception index can
 * occupy bits 15..4.
 */
#define UCASE_EXCEPTION 8
#define UCASE_SENSITIVE 0x10
/* nonzero if the trie data word refers to an exceptions entry */
#define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
@ -379,9 +379,9 @@ enum {
# define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT))
#endif
/* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
#define UCASE_EXC_SHIFT 4
#define UCASE_EXC_MASK 0xfff0
/* number of distinct index values that fit into the exception index bits */
#define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1)
/* definitions for 16-bit main exceptions word ------------------------------ */
@ -392,7 +392,7 @@ enum {
UCASE_EXC_FOLD,
UCASE_EXC_UPPER,
UCASE_EXC_TITLE,
UCASE_EXC_4, /* reserved */
UCASE_EXC_DELTA,
UCASE_EXC_5, /* reserved */
UCASE_EXC_CLOSURE,
UCASE_EXC_FULL_MAPPINGS,
@ -402,7 +402,11 @@ enum {
/* each slot is 2 uint16_t instead of 1 */
#define UCASE_EXC_DOUBLE_SLOTS 0x100
/* exception bits 11..9 (these were reserved before formatVersion 4) */
enum {
/* bit 9: no simple case folding, even if there is a simple lowercase mapping */
UCASE_EXC_NO_SIMPLE_CASE_FOLDING=0x200,
/* bit 10: the value in the UCASE_EXC_DELTA slot is subtracted, not added */
UCASE_EXC_DELTA_IS_NEGATIVE=0x400,
/* bit 11: same meaning as the non-exception case-sensitive trie bit */
UCASE_EXC_SENSITIVE=0x800
};
/* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT */
#define UCASE_EXC_DOT_SHIFT 7

File diff suppressed because it is too large Load diff

View file

@ -43,7 +43,7 @@ $E_Modifier = [\p{Grapheme_Cluster_Break = EM}];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
$E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}];
$E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}\uFDD0];
$EmojiNRK = [[\p{Emoji}] - [\p{Grapheme_Cluster_Break = Regional_Indicator}*\u00230-9©®™〰〽]];
## -------------------------------------------------

View file

@ -51,7 +51,7 @@ $E_Modifier = [\p{Word_Break = EM}];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
$EBG = [\p{Word_Break = EBG}];
$EBG = [\p{Word_Break = EBG}\uFDD0];
$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
$Han = [:Han:];

View file

@ -51,7 +51,7 @@ $E_Modifier = [\p{Word_Break = EM}];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
$EBG = [\p{Word_Break = EBG}];
$EBG = [\p{Word_Break = EBG}\uFDD0];
$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
$Han = [:Han:];

Binary file not shown.

View file

@ -240,6 +240,21 @@ set(ICU4C_SRC_DIR /usr/local/google/home/mscherer/svn.icu/uni/src/icu4c)
genuca/genuca --hanOrder radical-stroke $ICU_SRC/icu4c
- rebuild ICU (make install) & tools
* Fix case props
genprops error: casepropsbuilder: too many exceptions words
genprops error: failure finalizing the data - U_BUFFER_OVERFLOW_ERROR
- With the addition of Georgian Mtavruli capital letters,
there are now too many simple case mappings with big mapping deltas
that yield uncompressible exceptions.
- Changing the data structure (now formatVersion 4),
adding one bit for no-simple-case-folding (for Cherokee), and
one optional slot for a big delta (for most faraway mappings),
together with another bit for whether that is negative.
This makes most Cherokee & Georgian etc. case mappings compressible,
reducing the number of exceptions words.
- Further changes to gain one more bit for the exceptions index,
for future growth. For details, see casepropsbuilder.cpp.
* update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to
sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar)
- grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters
@ -249,6 +264,13 @@ set(ICU4C_SRC_DIR /usr/local/google/home/mscherer/svn.icu/uni/src/icu4c)
* run & fix ICU4C tests
- Andy handles RBBI & spoof check test failures
TODO:
- Errors in char.txt, word.txt, word_POSIX.txt like
createRuleBasedBreakIterator: ICU Error "U_BRK_RULE_EMPTY_SET" at line 46, column 16
because \p{Grapheme_Cluster_Break = EBG} and \p{Word_Break = EBG} are empty.
-> Temporary(!) workaround: Add an arbitrary code point to these sets to make them
not empty, just to get ICU building.
* collation: CLDR collation root, UCA DUCET
- UCA DUCET goes into Mark's Unicode tools, see

View file

@ -193,6 +193,10 @@ public final class UCaseProps {
} else {
int excOffset=getExceptionsOffset(props);
int excWord=exceptions.charAt(excOffset++);
if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) {
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(hasSlot(excWord, EXC_LOWER)) {
c=getSlotValue(excWord, EXC_LOWER, excOffset);
}
@ -209,6 +213,10 @@ public final class UCaseProps {
} else {
int excOffset=getExceptionsOffset(props);
int excWord=exceptions.charAt(excOffset++);
if(hasSlot(excWord, EXC_DELTA) && getTypeFromProps(props)==LOWER) {
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(hasSlot(excWord, EXC_UPPER)) {
c=getSlotValue(excWord, EXC_UPPER, excOffset);
}
@ -225,6 +233,10 @@ public final class UCaseProps {
} else {
int excOffset=getExceptionsOffset(props);
int excWord=exceptions.charAt(excOffset++);
if(hasSlot(excWord, EXC_DELTA) && getTypeFromProps(props)==LOWER) {
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
int index;
if(hasSlot(excWord, EXC_TITLE)) {
index=EXC_TITLE;
@ -305,6 +317,10 @@ public final class UCaseProps {
set.add(c);
}
}
if(hasSlot(excWord, EXC_DELTA)) {
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
set.add((excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
}
/* get the closure string pointer & length */
if(hasSlot(excWord, EXC_CLOSURE)) {
@ -479,7 +495,12 @@ public final class UCaseProps {
}
/**
 * Returns whether the case-sensitive flag is set for code point c.
 *
 * Without an exception, the flag is read from the trie data word (SENSITIVE).
 * With an exception, the flag is read from the first exceptions word
 * (EXC_SENSITIVE), because the upper trie bits then hold the exception index.
 *
 * @param c the code point to look up
 * @return true if the case-sensitive flag is set for c
 */
public final boolean isCaseSensitive(int c) {
    int props=trie.get(c);
    if(!propsHasException(props)) {
        return (props&SENSITIVE)!=0;
    } else {
        // Do not test props here: its bit 4 is part of the exception index.
        return (exceptions.charAt(getExceptionsOffset(props))&EXC_SENSITIVE)!=0;
    }
}
// string casing ------------------------------------------------------- ***
@ -1109,6 +1130,10 @@ public final class UCaseProps {
}
}
if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) {
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(hasSlot(excWord, EXC_LOWER)) {
result=getSlotValue(excWord, EXC_LOWER, excOffset2);
}
@ -1201,6 +1226,10 @@ public final class UCaseProps {
}
}
if(hasSlot(excWord, EXC_DELTA) && getTypeFromProps(props)==LOWER) {
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {
index=EXC_TITLE;
} else if(hasSlot(excWord, EXC_UPPER)) {
@ -1314,6 +1343,13 @@ public final class UCaseProps {
}
}
}
if((excWord&EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
return c;
}
if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) {
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(hasSlot(excWord, EXC_FOLD)) {
index=EXC_FOLD;
} else if(hasSlot(excWord, EXC_LOWER)) {
@ -1408,6 +1444,13 @@ public final class UCaseProps {
}
}
if((excWord&EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
return ~c;
}
if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) {
int delta=getSlotValue(excWord, EXC_DELTA, excOffset);
return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
}
if(hasSlot(excWord, EXC_FOLD)) {
index=EXC_FOLD;
} else if(hasSlot(excWord, EXC_LOWER)) {
@ -1534,8 +1577,8 @@ public final class UCaseProps {
}
static final int IGNORABLE=4;
// formatVersion 4: exception flag is bit 3, case-sensitive flag is bit 4
// (swapped relative to formatVersion 3).
private static final int EXCEPTION= 8;
private static final int SENSITIVE= 0x10;
private static final int DOT_MASK= 0x60;
//private static final int NO_DOT= 0; /* normal characters with cc=0 */
@ -1553,9 +1596,9 @@ public final class UCaseProps {
return (short)props>>DELTA_SHIFT;
}
/* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
private static final int EXC_SHIFT= 4;
//private static final int EXC_MASK= 0xfff0;
//private static final int MAX_EXCEPTIONS=((EXC_MASK>>EXC_SHIFT)+1);
/* definitions for 16-bit main exceptions word ------------------------------ */
@ -1565,7 +1608,7 @@ public final class UCaseProps {
private static final int EXC_FOLD=1;
private static final int EXC_UPPER=2;
private static final int EXC_TITLE=3;
//private static final int EXC_4=4; /* reserved */
private static final int EXC_DELTA=4;
//private static final int EXC_5=5; /* reserved */
private static final int EXC_CLOSURE=6;
private static final int EXC_FULL_MAPPINGS=7;
@ -1574,7 +1617,9 @@ public final class UCaseProps {
/* each slot is 2 uint16_t instead of 1 */
private static final int EXC_DOUBLE_SLOTS= 0x100;
/* exception bits 11..9 (these were reserved before formatVersion 4) */
// bit 9: no simple case folding, even if there is a simple lowercase mapping
private static final int EXC_NO_SIMPLE_CASE_FOLDING=0x200;
// bit 10: the value in the EXC_DELTA slot is subtracted, not added
private static final int EXC_DELTA_IS_NEGATIVE=0x400;
// bit 11: same meaning as the non-exception case-sensitive trie bit
private static final int EXC_SENSITIVE=0x800;
/* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */
private static final int EXC_DOT_SHIFT=7;

View file

@ -49,7 +49,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
precedes the actual data. It contains platform properties values and the
file format version.
The following is a description of format version 3.0 .
The following is a description of format version 4.0 .
Format version 1.1 adds data for case closure.
@ -63,6 +63,16 @@ It moves the Case_Ignorable flag from sometimes-trie-bit 6, sometimes-exception-
to always-trie-bit 2 and adjusts the higher trie bits accordingly.
Exception index reduced from 12 bits to 11, simple case mapping delta reduced from 10 bits to 9.
Format version 4.0 (ICU 62) swaps trie data bits 3 and 4, exception vs. case-sensitive,
and when exception=1 then data bits 15..4 (not 15..5) are used for the exception index,
and the case-sensitive bit is moved into the excWord. This will allow for more exceptions words.
Also, an additional optional exception slot is used for a 16-bit delta,
with one more excWord bit if the delta is actually negative,
for a reasonably compact, and compressible, encoding of simple case mappings
between distant blocks for Cherokee, Georgian, and similar.
Another excWord bit is used to indicate that the character has no simple case folding,
even if it has a simple lowercase mapping.
The file contains the following structures:
const int32_t indexes[i0] with values i0, i1, ...:
@ -89,7 +99,7 @@ The file contains the following structures:
Trie data word:
Bits
if(exception) {
15..5 unsigned exception index
15..4 unsigned exception index
} else {
if(not uncased) {
15..7 signed delta to simple case mapping code point
@ -103,8 +113,8 @@ if(exception) {
3 other cc
The runtime code relies on these two bits to be adjacent with this encoding.
}
4 exception
3 case-sensitive
4 case-sensitive
3 exception
2 case-ignorable
1..0 0 uncased
1 lowercase
@ -132,10 +142,9 @@ Bits
1 soft-dotted character
2 cc=230
3 other cc
11 reserved
(was used in formatVersion 1.2..2.0:
case-ignorable (used when the character is cased or has another exception))
10.. 9 reserved
11 same as non-exception case-sensitive bit
10 the delta in the optional value slot is negative
9 no simple case folding, even if there is a simple lowercase mapping
8 if set, then for each optional-value slot there are 2 uint16_t values
(high and low parts of 32-bit values)
instead of single ones
@ -146,7 +155,8 @@ Optional-value slots:
1 case folding (code point)
2 uppercase mapping (code point)
3 titlecase mapping (code point)
4 reserved
4 delta to simple case mapping code point
(add delta to input code point, or subtract if excWord bit 10 is set)
5 reserved
6 closure mappings (new in format version 1.1)
7 there is at least one full (string) case mapping
@ -214,8 +224,8 @@ static UDataInfo dataInfo={
/* dataFormat="cAsE" */
{ UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
{ 3, 0, 0, 0 }, /* formatVersion */
{ 6, 0, 0, 0 } /* dataVersion */
{ 4, 0, 0, 0 }, /* formatVersion */
{ 11, 0, 0, 0 } /* dataVersion */
};
#define UGENCASE_EXC_SHIFT 20
@ -226,16 +236,20 @@ enum {
};
/**
 * Per-code-point data collected for characters whose case properties
 * need an entry in the exceptions array.
 */
struct ExcProps {
    /** Creates an entry with zero delta and all flags cleared. */
    ExcProps() :
            delta(0), hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE),
            hasNoSimpleCaseFolding(FALSE) {}
    /** Creates an entry that copies otherProps, with zero delta and all flags cleared. */
    ExcProps(const UniProps &otherProps) :
            props(otherProps),
            delta(0), hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE),
            hasNoSimpleCaseFolding(FALSE) {}

    UniProps props;
    UnicodeSet closure;
    /* simple case mapping delta; when nonzero, makeException() writes it to the UCASE_EXC_DELTA slot */
    int32_t delta;
    /* set from PPUCD_CONDITIONAL_CASE_MAPPINGS */
    UBool hasConditionalCaseMappings;
    /* set from PPUCD_TURKIC_CASE_FOLDING */
    UBool hasTurkicCaseFolding;
    /* there is a simple lowercase mapping but no simple case folding */
    UBool hasNoSimpleCaseFolding;
};
/*
@ -385,6 +399,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
/* default: map to self */
int32_t delta=0;
UBool noDelta=FALSE;
uint32_t type;
if(props.binProps[UCHAR_LOWERCASE]) {
@ -398,6 +413,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
}
uint32_t value=type;
// Examine simple case mappings.
UBool hasMapping=FALSE;
if(props.suc>=0) {
/* uppercase mapping as delta if the character is lowercase */
@ -405,6 +421,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
if(type==UCASE_LOWER) {
delta=props.suc-start;
} else {
noDelta=TRUE;
value|=UCASE_EXCEPTION;
}
}
@ -414,6 +431,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
if(type>=UCASE_UPPER) {
delta=props.slc-start;
} else {
noDelta=TRUE;
value|=UCASE_EXCEPTION;
}
}
@ -421,40 +439,52 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
hasMapping=TRUE;
}
if(props.suc!=props.stc) {
noDelta=TRUE;
value|=UCASE_EXCEPTION;
}
// Simple case folding falls back to simple lowercasing.
// If they differ, then store them separately.
UChar32 scf=props.scf;
if(scf>=0 && scf!=props.slc) {
hasMapping=noDelta=TRUE;
value|=UCASE_EXCEPTION;
}
// If there is no case folding but there is a lowercase mapping,
// then set a bit for that.
// For example: Cherokee uppercase syllables since Unicode 8.
// (Full case folding falls back to simple case folding,
// not to full lowercasing, so we need not also handle it specially
// for such cases.)
UBool hasNoSimpleCaseFolding=FALSE;
if(scf<0 && props.slc>=0) {
hasNoSimpleCaseFolding=TRUE;
value|=UCASE_EXCEPTION;
}
if(noDelta) {
delta=0;
} else if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
// The case mapping delta is too big for the main data word.
// Store it in an exceptions slot.
value|=UCASE_EXCEPTION;
}
// Examine full case mappings.
if(!props.lc.isEmpty() || !props.uc.isEmpty() || !props.tc.isEmpty() ||
newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS)
) {
hasMapping=TRUE;
value|=UCASE_EXCEPTION;
}
if( (props.scf>=0 && props.scf!=props.slc) ||
(!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) ||
if( (!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) ||
newValues.contains(PPUCD_TURKIC_CASE_FOLDING)
) {
hasMapping=TRUE;
value|=UCASE_EXCEPTION;
}
// Simple case folding falls back to simple lowercasing.
// If there is no case folding but there is a lowercase mapping,
// then add a case folding mapping to the code point.
// For example: Cherokee uppercase syllables since Unicode 8.
// (Full case folding falls back to simple case folding,
// not to full lowercasing, so we need not also handle it specially
// for such cases.)
UChar32 scf=props.scf;
if(scf<0 && props.slc>=0) {
scf=start;
hasMapping=TRUE;
value|=UCASE_EXCEPTION;
}
if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
value|=UCASE_EXCEPTION;
}
if(props.binProps[UCHAR_SOFT_DOTTED]) {
value|=UCASE_SOFT_DOTTED;
}
@ -502,8 +532,10 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
return;
}
newExcProps->props.scf=scf;
newExcProps->delta=delta;
newExcProps->hasConditionalCaseMappings=newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS);
newExcProps->hasTurkicCaseFolding=newValues.contains(PPUCD_TURKIC_CASE_FOLDING);
newExcProps->hasNoSimpleCaseFolding=hasNoSimpleCaseFolding;
value|=(uint32_t)excPropsCount<<UGENCASE_EXC_SHIFT;
excProps[excPropsCount++]=newExcProps;
} else {
@ -880,8 +912,8 @@ CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorC
return 0;
}
/* copy and shift the soft-dotted bits */
UChar excWord=(UChar)((value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT);
/* copy and shift the soft-dotted and case-sensitive bits */
UChar excWord=(UChar)((value&(UCASE_DOT_MASK|UCASE_SENSITIVE))<<UCASE_EXC_DOT_SHIFT);
UniProps &p=ep.props;
@ -896,6 +928,9 @@ CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorC
excWord|=UCASE_EXC_CONDITIONAL_FOLD;
p.cf.remove();
}
if(ep.hasNoSimpleCaseFolding) {
excWord|=UCASE_EXC_NO_SIMPLE_CASE_FOLDING;
}
/* remove redundant data */
/* do not store full mappings if they are the same as the simple ones */
@ -917,36 +952,48 @@ CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorC
uint32_t slotBits=0;
int32_t count=0;
if(p.slc>=0) {
slots[count]=(uint32_t)p.slc;
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_LOWER);
}
if( p.scf>=0 &&
(p.slc>=0 ?
p.scf!=p.slc :
p.scf!=c)) {
slots[count]=(uint32_t)p.scf;
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_FOLD);
}
if(p.suc>=0) {
slots[count]=(uint32_t)p.suc;
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_UPPER);
}
if(p.suc!=p.stc) {
if(p.stc>=0) {
slots[count]=(uint32_t)p.stc;
} else {
slots[count]=(uint32_t)c;
if(ep.delta!=0) {
int32_t delta=ep.delta;
if(delta<0) {
excWord|=UCASE_EXC_DELTA_IS_NEGATIVE;
delta=-delta;
}
slots[count]=(uint32_t)delta;
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_TITLE);
excWord|=U_MASK(UCASE_EXC_DELTA);
} else {
if(p.slc>=0) {
slots[count]=(uint32_t)p.slc;
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_LOWER);
}
if( p.scf>=0 &&
(p.slc>=0 ?
p.scf!=p.slc :
p.scf!=c)) {
slots[count]=(uint32_t)p.scf;
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_FOLD);
}
if(p.suc>=0) {
slots[count]=(uint32_t)p.suc;
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_UPPER);
}
if(p.suc!=p.stc) {
if(p.stc>=0) {
slots[count]=(uint32_t)p.stc;
} else {
slots[count]=(uint32_t)c;
}
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_TITLE);
}
}
/* length of case closure */
@ -994,33 +1041,43 @@ CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorC
return excIndex;
} else {
/* write slots */
int32_t excIndex=exceptions.length();
exceptions.append((UChar)0); /* placeholder for excWord which will be stored at excIndex */
UnicodeString excString;
excString.append((UChar)0); /* placeholder for excWord which will be stored at excIndex */
if(slotBits<=0xffff) {
for(int32_t i=0; i<count; ++i) {
exceptions.append((UChar)slots[i]);
excString.append((UChar)slots[i]);
}
} else {
excWord|=UCASE_EXC_DOUBLE_SLOTS;
for(int32_t i=0; i<count; ++i) {
exceptions.append((UChar)(slots[i]>>16));
exceptions.append((UChar)slots[i]);
excString.append((UChar)(slots[i]>>16));
excString.append((UChar)slots[i]);
}
}
/* write the full case mapping strings */
exceptions.append(p.lc);
exceptions.append(p.cf);
exceptions.append(p.uc);
exceptions.append(p.tc);
excString.append(p.lc);
excString.append(p.cf);
excString.append(p.uc);
excString.append(p.tc);
/* write the closure data */
exceptions.append(closureString);
excString.append(closureString);
/* write the main exceptions word */
exceptions.setCharAt(excIndex, (UChar)excWord);
excString.setCharAt(0, (UChar)excWord);
// Try to share data.
if(count==1 && ep.delta!=0) {
int32_t excIndex=exceptions.indexOf(excString);
if(excIndex>=0) {
printf("share delta: U+%04lx %ld\n", (long)c, (long)ep.delta);
return excIndex;
}
}
int32_t excIndex=exceptions.length();
exceptions.append(excString);
return excIndex;
}
}
@ -1065,7 +1122,6 @@ CasePropsBuilder::build(UErrorCode &errorCode) {
}
makeCaseClosure(errorCode);
makeExceptions(errorCode);
if(U_FAILURE(errorCode)) { return; }
/*
@ -1090,6 +1146,9 @@ CasePropsBuilder::build(UErrorCode &errorCode) {
return;
}
makeExceptions(errorCode);
if(U_FAILURE(errorCode)) { return; }
utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops/case error: utrie2_freeze() failed: %s\n",