diff --git a/icu4c/source/common/ucase.cpp b/icu4c/source/common/ucase.cpp index 95b27acb754..cbd5a6efb56 100644 --- a/icu4c/source/common/ucase.cpp +++ b/icu4c/source/common/ucase.cpp @@ -138,6 +138,11 @@ ucase_tolower(UChar32 c) { } else { const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); uint16_t excWord=*pe++; + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c); } @@ -155,6 +160,11 @@ ucase_toupper(UChar32 c) { } else { const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); uint16_t excWord=*pe++; + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c); } @@ -172,6 +182,11 @@ ucase_totitle(UChar32 c) { } else { const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); uint16_t excWord=*pe++; + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } int32_t idx; if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) { idx=UCASE_EXC_TITLE; @@ -254,6 +269,11 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) { sa->add(sa->set, c); } } + if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? 
c+delta : c-delta); + } /* get the closure string pointer & length */ if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) { @@ -590,7 +610,12 @@ ucase_isSoftDotted(UChar32 c) { U_CAPI UBool U_EXPORT2 ucase_isCaseSensitive(UChar32 c) { uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); - return (UBool)((props&UCASE_SENSITIVE)!=0); + if(!UCASE_HAS_EXCEPTION(props)) { + return (UBool)((props&UCASE_SENSITIVE)!=0); + } else { + const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); + return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0); + } } /* string casing ------------------------------------------------------------ */ @@ -1140,6 +1165,11 @@ ucase_toFullLower(UChar32 c, } } + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta); /* pe2: same slot base as the LOWER lookup below */ + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result); } @@ -1229,6 +1259,11 @@ toUpperOrTitle(UChar32 c, } } + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta); /* pe2, as for the slot lookups in ucase_toFullLower() */ + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { idx=UCASE_EXC_TITLE; } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { @@ -1334,6 +1369,14 @@ ucase_fold(UChar32 c, uint32_t options) { } } } + if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) { + return c; + } + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? 
c+delta : c-delta; + } if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { idx=UCASE_EXC_FOLD; } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { @@ -1421,6 +1464,14 @@ ucase_toFullFolding(UChar32 c, } } + if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) { + return ~c; + } + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta); /* pe2, as for the slot lookups in ucase_toFullLower() */ + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { idx=UCASE_EXC_FOLD; } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { diff --git a/icu4c/source/common/ucase.h b/icu4c/source/common/ucase.h index a7a8c9f00d1..b0a453b87e8 100644 --- a/icu4c/source/common/ucase.h +++ b/icu4c/source/common/ucase.h @@ -354,8 +354,8 @@ enum { #define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2) #define UCASE_IGNORABLE 4 -#define UCASE_SENSITIVE 8 -#define UCASE_EXCEPTION 0x10 +#define UCASE_EXCEPTION 8 +#define UCASE_SENSITIVE 0x10 #define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) @@ -379,9 +379,9 @@ enum { # define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? 
(((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT)) #endif -/* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */ -#define UCASE_EXC_SHIFT 5 -#define UCASE_EXC_MASK 0xffe0 +/* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */ +#define UCASE_EXC_SHIFT 4 +#define UCASE_EXC_MASK 0xfff0 #define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1) /* definitions for 16-bit main exceptions word ------------------------------ */ @@ -392,7 +392,7 @@ enum { UCASE_EXC_FOLD, UCASE_EXC_UPPER, UCASE_EXC_TITLE, - UCASE_EXC_4, /* reserved */ + UCASE_EXC_DELTA, UCASE_EXC_5, /* reserved */ UCASE_EXC_CLOSURE, UCASE_EXC_FULL_MAPPINGS, @@ -402,7 +402,11 @@ enum { /* each slot is 2 uint16_t instead of 1 */ #define UCASE_EXC_DOUBLE_SLOTS 0x100 -/* reserved: exception bits 11..9 */ +enum { + UCASE_EXC_NO_SIMPLE_CASE_FOLDING=0x200, + UCASE_EXC_DELTA_IS_NEGATIVE=0x400, + UCASE_EXC_SENSITIVE=0x800 +}; /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<>DELTA_SHIFT; } - /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */ - private static final int EXC_SHIFT= 5; - //private static final int EXC_MASK= 0xffe0; + /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */ + private static final int EXC_SHIFT= 4; + //private static final int EXC_MASK= 0xfff0; //private static final int MAX_EXCEPTIONS=((EXC_MASK>>EXC_SHIFT)+1); /* definitions for 16-bit main exceptions word ------------------------------ */ @@ -1565,7 +1608,7 @@ public final class UCaseProps { private static final int EXC_FOLD=1; private static final int EXC_UPPER=2; private static final int EXC_TITLE=3; - //private static final int EXC_4=4; /* reserved */ + private static final int EXC_DELTA=4; //private static final int EXC_5=5; /* reserved */ private static final int EXC_CLOSURE=6; private static final int EXC_FULL_MAPPINGS=7; @@ -1574,7 +1617,9 @@ public final class UCaseProps { /* each slot is 2 
uint16_t instead of 1 */ private static final int EXC_DOUBLE_SLOTS= 0x100; - /* reserved: exception bits 11..9 */ + private static final int EXC_NO_SIMPLE_CASE_FOLDING=0x200; + private static final int EXC_DELTA_IS_NEGATIVE=0x400; + private static final int EXC_SENSITIVE=0x800; /* EXC_DOT_MASK=DOT_MASK<=0) { /* uppercase mapping as delta if the character is lowercase */ @@ -405,6 +421,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues, if(type==UCASE_LOWER) { delta=props.suc-start; } else { + noDelta=TRUE; value|=UCASE_EXCEPTION; } } @@ -414,6 +431,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues, if(type>=UCASE_UPPER) { delta=props.slc-start; } else { + noDelta=TRUE; value|=UCASE_EXCEPTION; } } @@ -421,40 +439,52 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues, hasMapping=TRUE; } if(props.suc!=props.stc) { + noDelta=TRUE; value|=UCASE_EXCEPTION; } + + // Simple case folding falls back to simple lowercasing. + // If they differ, then store them separately. + UChar32 scf=props.scf; + if(scf>=0 && scf!=props.slc) { + hasMapping=noDelta=TRUE; + value|=UCASE_EXCEPTION; + } + + // If there is no case folding but there is a lowercase mapping, + // then set a bit for that. + // For example: Cherokee uppercase syllables since Unicode 8. + // (Full case folding falls back to simple case folding, + // not to full lowercasing, so we need not also handle it specially + // for such cases.) + UBool hasNoSimpleCaseFolding=FALSE; + if(scf<0 && props.slc>=0) { + hasNoSimpleCaseFolding=TRUE; + value|=UCASE_EXCEPTION; + } + + if(noDelta) { + delta=0; + } else if(delta=0 && props.scf!=props.slc) || - (!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) || + if( (!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) || newValues.contains(PPUCD_TURKIC_CASE_FOLDING) ) { hasMapping=TRUE; value|=UCASE_EXCEPTION; } - // Simple case folding falls back to simple lowercasing. 
- // If there is no case folding but there is a lowercase mapping, - // then add a case folding mapping to the code point. - // For example: Cherokee uppercase syllables since Unicode 8. - // (Full case folding falls back to simple case folding, - // not to full lowercasing, so we need not also handle it specially - // for such cases.) - UChar32 scf=props.scf; - if(scf<0 && props.slc>=0) { - scf=start; - hasMapping=TRUE; - value|=UCASE_EXCEPTION; - } - - if(deltaprops.scf=scf; + newExcProps->delta=delta; newExcProps->hasConditionalCaseMappings=newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS); newExcProps->hasTurkicCaseFolding=newValues.contains(PPUCD_TURKIC_CASE_FOLDING); + newExcProps->hasNoSimpleCaseFolding=hasNoSimpleCaseFolding; value|=(uint32_t)excPropsCount<=0) { - slots[count]=(uint32_t)p.slc; - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_LOWER); - } - if( p.scf>=0 && - (p.slc>=0 ? - p.scf!=p.slc : - p.scf!=c)) { - slots[count]=(uint32_t)p.scf; - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_FOLD); - } - if(p.suc>=0) { - slots[count]=(uint32_t)p.suc; - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_UPPER); - } - if(p.suc!=p.stc) { - if(p.stc>=0) { - slots[count]=(uint32_t)p.stc; - } else { - slots[count]=(uint32_t)c; + if(ep.delta!=0) { + int32_t delta=ep.delta; + if(delta<0) { + excWord|=UCASE_EXC_DELTA_IS_NEGATIVE; + delta=-delta; } + slots[count]=(uint32_t)delta; slotBits|=slots[count]; ++count; - excWord|=U_MASK(UCASE_EXC_TITLE); + excWord|=U_MASK(UCASE_EXC_DELTA); + } else { + if(p.slc>=0) { + slots[count]=(uint32_t)p.slc; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_LOWER); + } + if( p.scf>=0 && + (p.slc>=0 ? 
+ p.scf!=p.slc : + p.scf!=c)) { + slots[count]=(uint32_t)p.scf; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_FOLD); + } + if(p.suc>=0) { + slots[count]=(uint32_t)p.suc; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_UPPER); + } + if(p.suc!=p.stc) { + if(p.stc>=0) { + slots[count]=(uint32_t)p.stc; + } else { + slots[count]=(uint32_t)c; + } + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_TITLE); + } } /* length of case closure */ @@ -994,33 +1041,43 @@ CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorC return excIndex; } else { /* write slots */ - int32_t excIndex=exceptions.length(); - exceptions.append((UChar)0); /* placeholder for excWord which will be stored at excIndex */ + UnicodeString excString; + excString.append((UChar)0); /* placeholder for excWord which will be stored at excIndex */ if(slotBits<=0xffff) { for(int32_t i=0; i>16)); - exceptions.append((UChar)slots[i]); + excString.append((UChar)(slots[i]>>16)); + excString.append((UChar)slots[i]); } } /* write the full case mapping strings */ - exceptions.append(p.lc); - exceptions.append(p.cf); - exceptions.append(p.uc); - exceptions.append(p.tc); + excString.append(p.lc); + excString.append(p.cf); + excString.append(p.uc); + excString.append(p.tc); /* write the closure data */ - exceptions.append(closureString); + excString.append(closureString); /* write the main exceptions word */ - exceptions.setCharAt(excIndex, (UChar)excWord); + excString.setCharAt(0, (UChar)excWord); + // Try to share data. 
+ if(count==1 && ep.delta!=0) { + int32_t excIndex=exceptions.indexOf(excString); + if(excIndex>=0) { + // Reuse the identical single-slot delta record that is already stored. + return excIndex; + } + } + int32_t excIndex=exceptions.length(); + exceptions.append(excString); return excIndex; } } @@ -1065,7 +1122,6 @@ CasePropsBuilder::build(UErrorCode &errorCode) { } makeCaseClosure(errorCode); - makeExceptions(errorCode); if(U_FAILURE(errorCode)) { return; } /* @@ -1090,6 +1146,9 @@ CasePropsBuilder::build(UErrorCode &errorCode) { return; } + makeExceptions(errorCode); + if(U_FAILURE(errorCode)) { return; } + utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "genprops/case error: utrie2_freeze() failed: %s\n",