From b3aec18a3c1c7fc4718d3256a10e631194717266 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 12 Mar 2018 00:15:40 +0000 Subject: [PATCH] ICU-13630 ucase.icu formatVersion 4: more compressible exceptions, and more room for future exceptions growth X-SVN-Rev: 41093 --- icu4c/source/common/ucase.cpp | 53 +- icu4c/source/common/ucase.h | 18 +- icu4c/source/common/ucase_props_data.h | 1191 ++++++++--------- icu4c/source/data/brkitr/rules/char.txt | 2 +- icu4c/source/data/brkitr/rules/word.txt | 2 +- icu4c/source/data/brkitr/rules/word_POSIX.txt | 2 +- icu4c/source/data/in/ucase.icu | Bin 29448 -> 28306 bytes icu4c/source/data/unidata/changes.txt | 22 + .../core/src/com/ibm/icu/impl/UCaseProps.java | 61 +- tools/unicode/c/genprops/casepropsbuilder.cpp | 213 +-- 10 files changed, 854 insertions(+), 710 deletions(-) diff --git a/icu4c/source/common/ucase.cpp b/icu4c/source/common/ucase.cpp index 95b27acb754..cbd5a6efb56 100644 --- a/icu4c/source/common/ucase.cpp +++ b/icu4c/source/common/ucase.cpp @@ -138,6 +138,11 @@ ucase_tolower(UChar32 c) { } else { const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); uint16_t excWord=*pe++; + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c); } @@ -155,6 +160,11 @@ ucase_toupper(UChar32 c) { } else { const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); uint16_t excWord=*pe++; + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? 
c+delta : c-delta; + } if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c); } @@ -172,6 +182,11 @@ ucase_totitle(UChar32 c) { } else { const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); uint16_t excWord=*pe++; + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } int32_t idx; if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) { idx=UCASE_EXC_TITLE; @@ -254,6 +269,11 @@ ucase_addCaseClosure(UChar32 c, const USetAdder *sa) { sa->add(sa->set, c); } } + if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta); + } /* get the closure string pointer & length */ if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) { @@ -590,7 +610,12 @@ ucase_isSoftDotted(UChar32 c) { U_CAPI UBool U_EXPORT2 ucase_isCaseSensitive(UChar32 c) { uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); - return (UBool)((props&UCASE_SENSITIVE)!=0); + if(!UCASE_HAS_EXCEPTION(props)) { + return (UBool)((props&UCASE_SENSITIVE)!=0); + } else { + const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); + return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0); + } } /* string casing ------------------------------------------------------------ */ @@ -1140,6 +1165,11 @@ ucase_toFullLower(UChar32 c, } } + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? 
c+delta : c-delta; + } if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result); } @@ -1229,6 +1259,11 @@ toUpperOrTitle(UChar32 c, } } + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { idx=UCASE_EXC_TITLE; } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { @@ -1334,6 +1369,14 @@ ucase_fold(UChar32 c, uint32_t options) { } } } + if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) { + return c; + } + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { idx=UCASE_EXC_FOLD; } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { @@ -1421,6 +1464,14 @@ ucase_toFullFolding(UChar32 c, } } + if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) { + return ~c; + } + if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) { + int32_t delta; + GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta); + return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? 
c+delta : c-delta; + } if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { idx=UCASE_EXC_FOLD; } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { diff --git a/icu4c/source/common/ucase.h b/icu4c/source/common/ucase.h index a7a8c9f00d1..b0a453b87e8 100644 --- a/icu4c/source/common/ucase.h +++ b/icu4c/source/common/ucase.h @@ -354,8 +354,8 @@ enum { #define UCASE_IS_UPPER_OR_TITLE(props) ((props)&2) #define UCASE_IGNORABLE 4 -#define UCASE_SENSITIVE 8 -#define UCASE_EXCEPTION 0x10 +#define UCASE_EXCEPTION 8 +#define UCASE_SENSITIVE 0x10 #define UCASE_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) @@ -379,9 +379,9 @@ enum { # define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT)) #endif -/* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */ -#define UCASE_EXC_SHIFT 5 -#define UCASE_EXC_MASK 0xffe0 +/* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */ +#define UCASE_EXC_SHIFT 4 +#define UCASE_EXC_MASK 0xfff0 #define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1) /* definitions for 16-bit main exceptions word ------------------------------ */ @@ -392,7 +392,7 @@ enum { UCASE_EXC_FOLD, UCASE_EXC_UPPER, UCASE_EXC_TITLE, - UCASE_EXC_4, /* reserved */ + UCASE_EXC_DELTA, UCASE_EXC_5, /* reserved */ UCASE_EXC_CLOSURE, UCASE_EXC_FULL_MAPPINGS, @@ -402,7 +402,11 @@ enum { /* each slot is 2 uint16_t instead of 1 */ #define UCASE_EXC_DOUBLE_SLOTS 0x100 -/* reserved: exception bits 11..9 */ +enum { + UCASE_EXC_NO_SIMPLE_CASE_FOLDING=0x200, + UCASE_EXC_DELTA_IS_NEGATIVE=0x400, + UCASE_EXC_SENSITIVE=0x800 +}; /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<d?k*Gp4V8eX0T;kEfD0JW7)QdQm>?Q7pe^Y03CA($IPMyC zMiUndIw~q?02f5XFvzALK?ip+qah98+6_|Z>W3j_wWCI>sHmR z+ucEn*rt8DGCB&_S;NO3tAYz~SwINn!t)ucJe{#Wd%(MF#w@G?`{3Ax;~0B9c(-=F zYrjsbT>B{pE5npBWsFj-oT;3vT&T=cu2AZg2Bk@vqckgzDNB?U$_vWN%3I3&%IC@! 
z)wy7VgAE=wvuhs9>UsTntxhs?kWk036JLE2M_i^`k4|X5wKFocT z`#AR~(DClmIIVS0gMNnlQuo#Fn-Fsw`rYo2)CVDxegWuW_fv?wnrD2L<8$uyoNjP$ z1l{ER5V+aB)%~OUSFTq*K~Gmt)U%gofaf3&jU-zeD!e=uo?~+Ix91z{8S6R5eGDpB zc_tw`hW_8^7ke&?=ejmdMbuv)-R8L)_#oF$cQ5oj=~?Po>|PDI6@3Hx8=iMPA9yx< zwt9Z@{NiC=&D-4@^6u*`@?NM6L_gd+6nG4Jg?FrXg7+-%RPS`}CElyNH+mbr70PVy z{lNL&C%j9&tG(;JuX#5?&ab`a{n*={cT|2glD)TifADVi7HAjZ&b?6aYZ0wbt5ABQ zQ`;N#d_HfvzwLB5$?sa^ zAFG|J)$ls!XcuS~<6ZqH?HcVyXd1QI+Qnwf9IY9;McOmKBKNb}8g6@0dtG}An)kKO zwJo6EYaN=)7xwvl6-puc?zVddcF(}>8TikbfnsyTEAcJnpIOh1OHhwa9%L|vq_ekgeUx*xIER=PxsaN{=}s#eAoDHH1(PD zTGm^AvurW<+IY){xQBcTe2aZcU|*irM@Fo%c^v9q?|aqvB)-{u({~V7y!U*c!1@JN zzb(EWx!(@c`v67t(>H)YzwVd0DwOrUo?PndA7Il~C=pX~6e zj!S3y&vocU{tNvV`7eY1niMwSaTB+c_->1P%=X_8ZeB|Ng(y; z_Q+IcvYc1W<&9#`l%8Lk_ozg!4gQV(cg~*8feW-h1}+O+ zo7LADnVa<2z#YF!-rTC=8(&nvKQKSA=-)Vs&Hk1^OJGG#72h|X>0Hk!(eF?bEoIOjG+At{+lbXoc zoSV`(g(yb2Ph@iA8|Em-;9?~{uD#Vl z7c2LLZ*6$5;a0SFEzgEVx^ZwuL^DWViIBsolvoBTbw$dl`GyCu;}`)l=gmleH}msS(u5mB`6^kmiMWdcNE& zJe(R<%^~u6BYQ-LPlOsQMR+ESM#JZigf^i^LOlY6)!3_S2R9@rR^FMxr9sBFb7@E? 
z(#_-;{(RBktJ5;yCesSQh7@mR|vyxy(m)kK}l(3UM?-siCSgTO`w!BbrYI%9W3Qs z>%a4m_mh+Uo+A+FEs@TjYRf}#`Omw|r}9}9lH?^N?am!^&8#s|9+h7?aqT1$wHkRz z3u%Q+nA>5Guwsd$`BHOBlGcmjtuqoCEtFx>L!L+a%9%&BYonE;5TAWhRkA$f$kWkO z(dm5c`Z6x;Y_!Z|kC!}AIet|-W~4GQ@x5vVk%wsV`vPH;bu7UFqC0WKxn#w1W&K@UbQqE{%eOi}N zAI==pevf~DkkO-cY(ZT1WooKjnjvuT6A$pN>W)|mx{<7scflq z_PnX=nRHZZXO7yug2WlgkkmDuCdSjSMW5*Vbq_MV- zj2*}0H$*nbwVLdO$G0BeRNkNzp3rmxx37z=Yuq8?DXzStwW9Py##&A+{X=9W+N#K^ zFaID{Uh+shL~JULL>(>&4v8`@F<-JQJt@-SEso(=;q-?)y(w<7R*pn2YbbZ16@XR- zxe|y~f+RUc$q6s=6h6{3dz}K)MQ|?uJ7xi2{lssY^lOjw5*kALK~nWe!fW+w*v~D;csM2dK$sM;LX&+KV@zrGp#`BYh<%TI%RL+Z?m^- zb?o;1e+x_%fpv42ehn1ARQDJ)WK{L2x=~G|>~D~zqc)A&KB`BhcsEFtZ`pp>0}QDg zQYj;>`-9F>YQFQKf?s{Xas?wTUJ8x-Ib7H+oZ6hr`jl0%T<(AL4*dNPvz&;i` zpDIHt9c7idQrUCUi43{<3a`v#&1|*m$UDYPXI8eD_m^Ftj;DQ^9Dk~-MsJ98vDx&r z`cuwqM>ZdMg?5Ez8Y?}Yne5W$=5Gi2G~1)Lr<`*!UW)F4zrwSgaC@Z8E$NWa)uU&Q zo;`X9B9@NcGDJOVi@BD0Wcb0}kfa?|srW1KeC{N!%erz%%9rhZJznD#M+ zW6Q_Zjcp#=Hr6+;bX;s))40}g9pg$)u9{Fkp=Cn*gu>IxPpdnv`Lwpvd=pD2#wIpR zY@OIKvE=ls)9X)fIlcY#!ZXUxs5_(ijJ7j;HKjGNnx>l8nvR;1NmVD;pWJeC`^kl; zl%G;}O7kggr})N~j*pFR8s9p;V|>Y}Rj1aU+Hz|9sfAVLRdrR(Rc%$i>eA|1byIa~ zbw_o{1Z&Jddw?>(45!ZA%+cH@RbS-IR_gb!S`k?CXqOo6OVI z%+^GdHTuLX&NW5u90YaVrHPT2EV8Go;ryGz+$3w%$^Tx!_BVwIe}`yUtg)rz(v8;X zpPQE)t$4be8FH6uxvRk3qf&WV*VM8`y6##Ff0bXeDyQah9gU36ue#+ei>f<`yaXEt)N10vrC|w`d7TZKAyAY^t1Dnn@8)lq709 zccfYmcQ%*2XH-5EIVK^A@$TFhtte}>RxQ%WT9#*KF26Ihs!A@^j*dn__IwVJk0Ru+ zJoacSm#j-PzjL*6*fSGflc-25i)$xQxJjujZIsx{Q~lZuKB-KZaytF(^Fnhd57JZJ zd@IX%L1lKkc2_ftczec7yc~g~c8y8I52hhiwUHpgHqDq8@JOSEjB!fvgT#AQ3|bCa6S zmLa`eO_&Y(w$pwlB+HgqTb{F|RK+?o(&lr>$XsKDf3Dt6KkL_(+&TMaapLJWltf*k zoGH80m!-^?DL$RenK##Q(r-%QwBx^{`4 ziJIh!xn3VDlf}`wi}SbZ#Hi#LCCyYPan$K#nJXPl7bo@hxKy(x{nFFq`LpLt%{yb# z=T2vF#;2|!wNBr38KquCCB`IvBTX}v`bb>qGRrbYCNJqDDZPi1kJQ>^KcTVe$# z3TZ&`O8i#?iuz1v)7e&Vp&RH?^xo*B=7zKi^nKCG(0?omX~RK(0sS1k z|J+dbJs?NsvU5fr-h%(WzKbn4=*$Nm)S_sL3mOYwH->9Zz~dFn(Nmz`gFd4auveAu zz_oxoR#^_8rLb*E#U7kjqcRd_b0_5Q;JsFv2VAM#2VAPm2Cl;YdqAA;oyx0_pHgmzJXm$H 
z%cH}!gF&wVJreXv(BYt0fsRlM*wxW%z?}fiwb3#y0(v=USI{d!dxKsHS`2y>XrFjQ znbsegYY}-C=ykB1$tm3tligRk4{?t{ztSDjYNA2ymgp;?pwMB$ zH-NWC-vr(f-3Xi&eG7PJ^le~Mv<)~rx(Rq!^c~>c(RYD=i@pcEC;Cs|z0vo9_eDPd z&WU~qyg&L8@PX*Zzz3tB0Ov+O11=W?*x)9XLPw1#m(1OW?xjSHMT2 zUjrYF{tNh6bPMqD=vLsO=r-UJ(Qkl@qu&CbjD81vD*8R}>F5u@CD9*&&qRL$wnRIC zOQYL?&qjX%E|2~Xa7FajP{{wvtdL)09&M|>RrhGy^liXz^lyOQ>fZvt)4v0LuYV65 ztVUE1u3}f2x*^n z9%dpVJIRyS@`B;62+0M> zXG#T-{HCNp3Yd}#DQHS=NS#c{1F5qqc_DQnDd?!jC9@uv%z9ih>v74f$0f5Km&|%x zGVAg4D?;^s&9M|iDlw%#koucaUq}N?X)j3onNmMU`1;^7FNT+F=EOUl*&5`T8}Bk7X(+!y^12kWvmLXu-JG3P zoNqHmL)H!8Zw7yJe0Ew9+Z^wdlVtLy(G$wqkvS@Z6MMN>vr-vcQRHG3tTH$fbPWpy zv6BgY);Sc!E+&ZGPY}DGAa*@L>}-PA0R^%931T-Ctik)UVxEg#kN4*ZoQ8r}se{-N z1+gOvVuuvO4k?ITQ80pz9Z`^WME5D7AiewV)9=IEVU9irc)xx>@B#e+;Dh>uz`6Qd z;5>aE@FD#n;KTaEz-GM}IA5O+T%a!iF4PwSAJHEHKB_+od`y1~__+Q!aFMY$Ex@JvQsCe9zXO-)%Ye`7&jOe0%YiHO z6~LAHO5iGe6>zn_8u*<49PoMldEgp-4REc#7Wjhx0&tza4%n)<0@v&7fiLPW0$zX$xM{!ifh`uo5S^bdd^>K_6>(mw)ztbYvrME?Z%ss1VO zGyOB*=lbWs&H83wyWS4`LjMBzrT!)GEB!0r*ZSAMf9d}MZqc{sbdOG9Q`kt}rtr7v z6zt?jVqZTJRM@A1KM{QzjU~6!q`@DwDVx>eS9LC zj`PBQMeAqJ&$O#yY!)JDKf~Ca;Akgq;T%~P8_Nt@Lm68Oomfp@;2c?>GhV{_Lo3{C&=KcSZc$$6x)I7-Tt~4R!O{B0 z*eBp<1!T%can7YYu6%5eZiMnV*O6bP3#`fUcdEmNn?jmqHIJ(Y^?Fo+IoEW>@ zxZZGa?h2#cDB#?c#tnwTxvPvD4V80O8#ftl&Rt{t#qe`&?tcLn2-2JY literal 29448 zcmeHQ378XAw!Z1AB%Oq%q3~&9#(*O-Dk`F=X>n<6aKUlhW}HN4L}^ggQ9v{;Dy=9q z>bRgnf+EOHu`gm`%OV1zxFBo7z6mPGt|oxI|J2Rxs-%k&VSE-Zr!T7 zb(6H%n8lhkVszxOjzxF3mB2YPnL`1(-MtuF@K45`%V(^Y$zZI3rCcq$^ty+!>2)UM z4>C2cyU^5JxmCB@Y>2)b6jglrx)11a9sQjKhUY;h; zmD@{qOBM2`@-lga9F@1qTjbsHUipZ8T$auG=Jry1skwBPxxn1me6IOIb4&A;=Bv!t zn{P7z1^vC|`?bEr+#UKp=6>b@=9dxkCg>gWHhBzW(yQoandcyGfR^!N4Hua|*ZQx_ zYtYBcTYPf~LL;fEwbs0Dv)p4Tu9d$r-~E>REjOBP zMCFGpT@l?2R0=Ay3{2*FHQDp1w;{c284H}E>3f^!SU$Eavdl7n0r^|dTF^$zPRm}) z5zBFl*=o1ewVq*ZYW;(?mGxq4FX>uPTkFlhJ3xQ8cCbETeboA-wZC4->wDJr z(s=7M;9To`>r(62*0t8n)?JWuYd=``TYt_uDz_R{y??Vx`PTgU`Mq%G_L9!b_vAOr z$Grrib_Mz;w0Wx<$u+Iatu@~pc|s2lG4U_tfh2&3`8UMZBwD${&&+gl1&^g#5C~m}&WQp_`w-1lZX8MgB_77R}$9 
zzZ06h`A71PqnB(}TYa0ycBZYp)C_dGot}ZyGjMta{(sECrIjn*Rkm5$=hy1byS1&@ zc5`LSCHS;|hwbmS2W;E$iM+k^aHXaCEZMCs>b9tRbKOC8KdUR(E3P-7UPZlQ^;*~O zSHGhEvHGnG`V~|Z94lzupkITR8!TvWtU>FB{Tfy@Y+iSN!}F99rN1&;*=lQGztjGj zeUK@=eU?F&?;x*h9sL{?j-MSD*GZ2#EuWr&|6eoEqfYi`lPo>cp76d&IZ&xDvkkPp zq)D&ZhS-9Y`s(xQ)={>xhL|Y^UiTqxjxB6kXj=^X@~l20Vx__3O7l9~X4`Chv$x%L zIaa(MZ3kif6|3KIo7qkgb`6QUL~3ZKZvb8PCY8GO(mLA(nsk}nYtXiruC=$-@PziErmv7?paT1Tca%7~Iqr2l;CR^4)8TjYbyUrl zs(B{=8OJmEFF0Oyyji1fYGm!CcN}B>BYA6Ay}dL6)u%f?bj<%B9L3M}rH-YJFKbnC zuVDqK?K!3TU1`QOL=##r3H+v-J4-~IT7Qv5t%jduax}{+{;85~O4Zv7Qk#oKSc0YM zN=>OU&qaA;X4{|I;-(As2lG;OPAOH^$MV0;(K+y+$oSca@yINQYa^oxJGbcbkdSoV zUQjAWWTVV^mNt)ZS)T{5Md!0|ENynnTMuUCCw@yD!&u9E`z(;itODq zYjW=FJxH?mFyYE`PP29&+`ZCE^Pu~JG$v+qi~AyByU!lA7dnPJoOMd;tgqv%TN#tg zo-UWKOV?zQ!*wI|&aNM{Y3pB40gU=oO zNA5o{<-Us-cpC&7*oNAICRQFS2ik^%3xyl-Y~{f?xbk5p=4}{gXkrtGmcJct7;orR z$X@=oi3ODMw}`iCD8^u7Va>xfM6+pVVsXVbq;#7-bR5&>AI@2 znVDHrklyH7msy=8k)CT#4%74In&mcj8!N7C6^%#|r0Ypja+0lkmbldH#cX3EQG;8k zo}3(+sI7Yl4e#|_>72L+X~;k{4XMG>7|*0U+k4gGH7ZF|eL7IoY92hO2&H?Cp8<7^ymY}@Z` z!q?((6YsguiIpJ{9~2D)1~(oB)Wk#yW9Wa??G|C5KfpHln(wGMUi9R1atB>AeN2=`)~+JIegdi28+i!}X}L^TyTcfv#}Y^LCD$%VRxgUz z&xmK#QG^K(aUR(#S3jat8?6`x|LmKolI|f!o~_Rmoz2&%FV=*UjTV`V@q(vbj9(Ux z)%GGX`^rupyzw~`=|mkNXYchI*?o)_Q7_jxV-7w#ngfx|XeGYt_{lrdaVq1WeA)75 zs+F$G9$R(%>9Hh>c+qF_sZM8%<1;B{G`&8pOPP<<9ND~5V^4L?dA96hB76Es^z%y{ zOJ<3m6_25(u4*l^aeYbhF3g-MRa-SLpYc>LlE~ATzp6*gnyRC&DTZcEP))vMrwX!0 zsdGhIvdY@qj55;uNI!&kYA^1-RBbMjUO{kFJNHA>G>(Wei5Vq(?uRjoSEwn6K8|dj zd>qEiWa#*Jb|8LYKuZbG7uh47H54bY1ShJal>Na@Q69D?(Squ5h*T zw+gk2wQ^nQzcO@X>`K>F{;NV)#jbK)?Y}y7b?j=F*Y6E^8H<6$;nKbX@fC&(o!11e ziC*Jv9c&$M?YlO7t@FCTbu;rhi~X(e-q5{?d;Ncp{N3d%^~HRJ{*XW6_um(}&vk$4{jvKC{}K8} z;vfEhM*it~p!9**1BD$z9TFY<4@MqzJyiNo?4iPrG=`cEx-)9J8`5*|yGh)8yszp* znp7W>N`?uTDp`PUa}dDTtqu2+3SCbsQy_q1bYmjAVeiu3k=}{kg?&o`b6Z3#1n;2mOdGIGVx?#pfnH(Bm#w{rKOS5L}}qurB6kkN<3BAue2Z4 zi1bVJD||Y&T2+Z(75?^3y?eM+j1Zycy~vR|H{?f0`n9e43rlVq{r8fS=b9RR&EZ+H 
zkHO3WPjhQn>qS+OL-aylQ=h$3RZEk~->meksZUMylDJNM{VLJc0E!^a*tjgbjh3o> z_V2G#fA`Dgmv}nxOg39>_3>xo&&K=5%i_<)pQl;#56Jc#X6?pAb!~ELvmZ-sMl0Rd zMk~oiJMl(4@wwPD<&O`r$?+F#vTvYG4iBGXfg0b+&;d%Deyq&#(+bBN1 z!1oD_HNGhRsTTj0{VTDSlijx@ywta>T=ISH`$DsSX8&yPF&u~p91zSYJzNN9CzwQG%k4SzQX6mQw4*aL*t z#MX!i{r;en6k2}QS9f)sYkg~jYolwO>wN2i>!RzNQC~C|jYgd@Uo04l#+>VY>x1i~ z>zx~X8-g368=M<`8-p988=aecn}VC7o1B|{n}eI9o1I&HTd*>L#1S>{LYJ-Q>rOT+ z?b!I4oIGaAEuM3}j~45JaV$nYQ-(}B$|`bYve!<>Gt|zPdsRQy>aBVmapxF&*Vf=x zqg)eD`!zBCOjnHF6gB0@W~a9R<*ezb$wyqFQ|Ysfk@WPOR%aJBv-Xw{Pqi`1wN3aK zdp=&$__xtBFKot0p%Z;*TWp(ayMKFVyNSiNyLR|@gi`+QuBN%nWT)$py5@-813j(c zsO?=xF}3#<*{RQTf2He;{;7OwP5oZrD?_Spb&?)wUwxKqGkf^lCOKuN*=Y$_qxnv|o-LL0IehpFWM4gB zeWSj175E)bSG2mu^Xj8F)*(K1O%Xc>-lyKB>5;n3vuCT3`ZtWVlXz~*SoOb8Nd23} z%#uE~Y+Sa{rux^;i^uS*kQ$$hRH8X^%q6a>uYOH+o4wJgHQS!tnqE0GSKHBu=-jI7 z-XdN+E&6+cTV(AGU97v+lB1Pezg9X&Pn1!rWjg#_^zYHKSF>&TJzDns;-1Uyk=;Tw z$>XUfiQ35>nbyLS%_Z&`SsRKN6PNgSPi~BQls;O$7U@JS-LrZww=>kM3NF)*jz&TD zTn?U(BE+vj#%Mj4s7rlr=c?y0W+uKvx+1I0ubn{dCZyu5QG72?^-HSp$z-Z7XR5z( zUT6-*L3*m2Yh@A7dy(C!J=M%S-k7mEUW`EW=Q_Xo zm;{%8m1?)&$?lbTe^Yy{er{rf)aQDLwLrv*Tm+eC9mcuI%-575yIoG34f?jzcqSx^mR?(&v*@XcerBZ2a^-lHEe`TpX zXZ$QqJpGoEu1lAz%bx1XGUlr;KASByZ*9j(zb*0KoCNAUx1`q-b@+c7o?55se~-$& z({rTDB4=tvx_0TF>6)q)YkPh4OgcyB&d=YdiijVX6934vggibNsZ53LugZdvt<-|9+e)G{*5%vROTanrK?^RIjZv#KBA}h zP}L)|wyGc3==H=qQrDz%*)j;dSbN2qnkr|KC^A*gWusPPueld*5Z+hyw>LQTX4`l2 z&3LA{yw$}0XzYz=XVhoojsCUeixzaj`qjd8naOlRS60Nj;Z1d6lOom|{j;XxJ8nr@ zi&=ZkQq1nwEEZNFy(1N|osceSV!B}uGu^P4Er8Vc+}?=3TWh`9h;xeB;-t?7sBx+3 zMpSHJ%44^%%S^?CZr=Ap&qVJTlS zL8IeJPZ665k8dz@bI>0_|Dlw}R!E1zg~7Fz=D{Zd+kR*&(9gkDS%7{G=Z-GPHD}isiR{`I0 zuLi#DUIQHJUJD%NUIz@iqri|m1|05Q4}8bH0r;+aBXER!6YxFvX5dKo7T_rNR^a>Y zZNSm)?Z7ea9l){foxpMK?||doyMPniyMYtk-vcMPe*miPAAytIdw^5idx2BkKLMw? 
z_W`H7_XB6R4*+Mn4+3Yo4*_Sp4+H18j{rY#{|ub#{ss7<`&VGt9S26-3E;=>qH=r%nbwtxf?xqfP}rt4;&TKW(>Kx#U>IcAq>RjL; z^+RB}8U_wlBfyu`kAN?$6~I^2kAeSI=K)_;=L26;7XV*Z7XsfE&&czmjZ{Wp84IHPg0ghMK0w<{JfD_dyaFQAWs_J^+WOV~@inUQ94bq8>cx)b<;`W<<%fXn$K5&(Yi5Co ztpI%sS_xVOS`AtQS_@hSih^RG^`H%)ji61S&7duyt)Ok7?Vuf?ouKbPyFj}^--CVt z{Rr9v+6(##v=6i&bO3Y^bO>}9bOiJ>=oiqhpg1T2Iu7~`bfQ3JPvf1`v3U`D@Ej8x zj(2bet*4dm5q6n$bCO8z!}6tjl0@EDnnkm_mDtB|aa>O(4H{UM2ZWvmR6s8`0G zgCy#evF9O)dSz??BvG%7y#PtnD`PK067|a1KuDtA05%L#V^sf>HkKfyCXg=Bq!6TY zAzi3R!y)|<(nXr|4y5xSHP@tfAvJ|`u_ldxbUvg@H0eD^%^+P0sf@h_Nz^N2uR{{` z%Geu_M7=UL1d^y%#@>V^>Xos#Ac=Zq>}^P*UKtw-Nz@y_CP3W^q+nF#4YNL@5( z5~PP9b=4#lQb$PLG-)!V5=h-OX$quHka}p+R7jm6_0*(kkRFEgFGyuXotgA&GirY&0ZMuZ)d>B+@9DI4Q)?DNpnk1@4GI>)iLaMbWB?sTNGT8I+)N;(Wn|)2I8P>8< z#x{_7JW2VQo{&G;l#h0$0<-N_r)yakV$~RxBs#Oj$TK!cA|GieKS%QFhuKMBb`sj` z9K)F{!}(6cXvq2%_*cQdnw*_wh@F@u%1JVL)949lb`BYG@O}0~>{;nkQhV&n>C+E= zy6Ki@Vo(18lqa)7&~=dS1U&}219TVYde9B9jl}9a3iLi`G-wQHENC2PJZJ)FB4`pw z1x*G`0Zj!>15F3b0L=u=0?h`^0et|P3;GZg21P(0fhs^BgXV$egBE}mf<6H)0(}Zv z3|b2MvY-g>Jrf&*y)wQ12D$O?&f&Z|VeFkj_k$h;bq93?jQ|YvJMzkqQICE1Fl!r12-refE$&Kz)i{~;AUkraEr19xK-H-+@@>; zZdbMgcPKl6JC&Wl@09O=yOdqP-O6s@_saLcACw<}KPo>0_b7XSdzHPwpOl|~`;>hO zJx85cXVy+@owYZ9XMBQhhfnhD&~tlt@V9}wLvC?7`I86tAzo<7k9&58FRcjl?Li-W zcW@7=53?Mtsxu#@8hy-t&Bf++puT1I5*scZj zFyDf|qLZ{QOlWU||Hk?ph7orr%~8&EVN#Oo&SZ`20;5Uex`WFDNBb(q-bQQ}W(C*6 z#29`5z}S9pv^!$hCyh9V&JX`~-%-m^+IcZH6cM!NVr&>V+HK+Q-@(zhA&lLESbBpn zHrvA(t?!I|3Xa|yjD3*g!qQxgBkL?rg@@L8#+E?GSNx?KN7hB2>+E!_d*EbjAjP`rx`U)(G?52UnD_Gc>LTxHC1b zFE|ApU1!Gr3mn5aG3@3Bc}yBt<|+5&Y20)8_YftGd*1VsN7lFjo|ipljeEiKipQdH zFM9s%v1;5v5B7)}_nK#zhxSVhJEx$B_DT%9rjUpBNenxt;U3x}G3=J!@zDN=VW;%2 zhxSIeH#{Rev@gOn_q^w!Jqp8aWrBzHCk#84i5}XUFziw$d1zn4utQNjv?pQMolN%7 seuQCXGQ~rC5r$pKR1fV#7B?j2#5`-wzY8+W-In diff --git a/icu4c/source/data/unidata/changes.txt b/icu4c/source/data/unidata/changes.txt index 9f5bb61b32c..5c3038ff78a 100644 --- a/icu4c/source/data/unidata/changes.txt +++ b/icu4c/source/data/unidata/changes.txt @@ -240,6 +240,21 
@@ set(ICU4C_SRC_DIR /usr/local/google/home/mscherer/svn.icu/uni/src/icu4c) genuca/genuca --hanOrder radical-stroke $ICU_SRC/icu4c - rebuild ICU (make install) & tools +* Fix case props + genprops error: casepropsbuilder: too many exceptions words + genprops error: failure finalizing the data - U_BUFFER_OVERFLOW_ERROR +- With the addition of Georgian Mtavruli capital letters, + there are now too many simple case mappings with big mapping deltas + that yield uncompressible exceptions. +- Changing the data structure (now formatVersion 4), + adding one bit for no-simple-case-folding (for Cherokee), and + one optional slot for a big delta (for most faraway mappings), + together with another bit for whether that is negative. + This makes most Cherokee & Georgian etc. case mappings compressible, + reducing the number of exceptions words. +- Further changes to gain one more bit for the exceptions index, + for future growth. For details, see casepropsbuilder.cpp. + * update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar) - grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters @@ -249,6 +264,13 @@ set(ICU4C_SRC_DIR /usr/local/google/home/mscherer/svn.icu/uni/src/icu4c) * run & fix ICU4C tests - Andy handles RBBI & spoof check test failures +TODO: +- Errors in char.txt, word.txt, word_POSIX.txt like + createRuleBasedBreakIterator: ICU Error "U_BRK_RULE_EMPTY_SET" at line 46, column 16 + because \p{Grapheme_Cluster_Break = EBG} and \p{Word_Break = EBG} are empty. + -> Temporary(!) workaround: Add an arbitrary code point to these sets to make them + not empty, just to get ICU building. 
+ * collation: CLDR collation root, UCA DUCET - UCA DUCET goes into Mark's Unicode tools, see diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java index 56787a16aca..ad8125fc647 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UCaseProps.java @@ -193,6 +193,10 @@ public final class UCaseProps { } else { int excOffset=getExceptionsOffset(props); int excWord=exceptions.charAt(excOffset++); + if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) { + int delta=getSlotValue(excWord, EXC_DELTA, excOffset); + return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(hasSlot(excWord, EXC_LOWER)) { c=getSlotValue(excWord, EXC_LOWER, excOffset); } @@ -209,6 +213,10 @@ public final class UCaseProps { } else { int excOffset=getExceptionsOffset(props); int excWord=exceptions.charAt(excOffset++); + if(hasSlot(excWord, EXC_DELTA) && getTypeFromProps(props)==LOWER) { + int delta=getSlotValue(excWord, EXC_DELTA, excOffset); + return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(hasSlot(excWord, EXC_UPPER)) { c=getSlotValue(excWord, EXC_UPPER, excOffset); } @@ -225,6 +233,10 @@ public final class UCaseProps { } else { int excOffset=getExceptionsOffset(props); int excWord=exceptions.charAt(excOffset++); + if(hasSlot(excWord, EXC_DELTA) && getTypeFromProps(props)==LOWER) { + int delta=getSlotValue(excWord, EXC_DELTA, excOffset); + return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } int index; if(hasSlot(excWord, EXC_TITLE)) { index=EXC_TITLE; @@ -305,6 +317,10 @@ public final class UCaseProps { set.add(c); } } + if(hasSlot(excWord, EXC_DELTA)) { + int delta=getSlotValue(excWord, EXC_DELTA, excOffset); + set.add((excWord&EXC_DELTA_IS_NEGATIVE)==0 ? 
c+delta : c-delta); + } /* get the closure string pointer & length */ if(hasSlot(excWord, EXC_CLOSURE)) { @@ -479,7 +495,12 @@ public final class UCaseProps { } public final boolean isCaseSensitive(int c) { - return (trie.get(c)&SENSITIVE)!=0; + int props=trie.get(c); + if(!propsHasException(props)) { + return (props&SENSITIVE)!=0; + } else { + return (exceptions.charAt(getExceptionsOffset(props))&EXC_SENSITIVE)!=0; + } } // string casing ------------------------------------------------------- *** @@ -1109,6 +1130,10 @@ public final class UCaseProps { } } + if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) { + int delta=getSlotValue(excWord, EXC_DELTA, excOffset); + return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(hasSlot(excWord, EXC_LOWER)) { result=getSlotValue(excWord, EXC_LOWER, excOffset2); } @@ -1201,6 +1226,10 @@ public final class UCaseProps { } } + if(hasSlot(excWord, EXC_DELTA) && getTypeFromProps(props)==LOWER) { + int delta=getSlotValue(excWord, EXC_DELTA, excOffset); + return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) { index=EXC_TITLE; } else if(hasSlot(excWord, EXC_UPPER)) { @@ -1314,6 +1343,13 @@ public final class UCaseProps { } } } + if((excWord&EXC_NO_SIMPLE_CASE_FOLDING)!=0) { + return c; + } + if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) { + int delta=getSlotValue(excWord, EXC_DELTA, excOffset); + return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta; + } if(hasSlot(excWord, EXC_FOLD)) { index=EXC_FOLD; } else if(hasSlot(excWord, EXC_LOWER)) { @@ -1408,6 +1444,13 @@ public final class UCaseProps { } } + if((excWord&EXC_NO_SIMPLE_CASE_FOLDING)!=0) { + return ~c; + } + if(hasSlot(excWord, EXC_DELTA) && isUpperOrTitleFromProps(props)) { + int delta=getSlotValue(excWord, EXC_DELTA, excOffset); + return (excWord&EXC_DELTA_IS_NEGATIVE)==0 ? 
c+delta : c-delta; + } if(hasSlot(excWord, EXC_FOLD)) { index=EXC_FOLD; } else if(hasSlot(excWord, EXC_LOWER)) { @@ -1534,8 +1577,8 @@ public final class UCaseProps { } static final int IGNORABLE=4; - private static final int SENSITIVE= 8; - private static final int EXCEPTION= 0x10; + private static final int EXCEPTION= 8; + private static final int SENSITIVE= 0x10; private static final int DOT_MASK= 0x60; //private static final int NO_DOT= 0; /* normal characters with cc=0 */ @@ -1553,9 +1596,9 @@ public final class UCaseProps { return (short)props>>DELTA_SHIFT; } - /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */ - private static final int EXC_SHIFT= 5; - //private static final int EXC_MASK= 0xffe0; + /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */ + private static final int EXC_SHIFT= 4; + //private static final int EXC_MASK= 0xfff0; //private static final int MAX_EXCEPTIONS=((EXC_MASK>>EXC_SHIFT)+1); /* definitions for 16-bit main exceptions word ------------------------------ */ @@ -1565,7 +1608,7 @@ public final class UCaseProps { private static final int EXC_FOLD=1; private static final int EXC_UPPER=2; private static final int EXC_TITLE=3; - //private static final int EXC_4=4; /* reserved */ + private static final int EXC_DELTA=4; //private static final int EXC_5=5; /* reserved */ private static final int EXC_CLOSURE=6; private static final int EXC_FULL_MAPPINGS=7; @@ -1574,7 +1617,9 @@ public final class UCaseProps { /* each slot is 2 uint16_t instead of 1 */ private static final int EXC_DOUBLE_SLOTS= 0x100; - /* reserved: exception bits 11..9 */ + private static final int EXC_NO_SIMPLE_CASE_FOLDING=0x200; + private static final int EXC_DELTA_IS_NEGATIVE=0x400; + private static final int EXC_SENSITIVE=0x800; /* EXC_DOT_MASK=DOT_MASK<=0) { /* uppercase mapping as delta if the character is lowercase */ @@ -405,6 +421,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet 
&newValues, if(type==UCASE_LOWER) { delta=props.suc-start; } else { + noDelta=TRUE; value|=UCASE_EXCEPTION; } } @@ -414,6 +431,7 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues, if(type>=UCASE_UPPER) { delta=props.slc-start; } else { + noDelta=TRUE; value|=UCASE_EXCEPTION; } } @@ -421,40 +439,52 @@ CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues, hasMapping=TRUE; } if(props.suc!=props.stc) { + noDelta=TRUE; value|=UCASE_EXCEPTION; } + + // Simple case folding falls back to simple lowercasing. + // If they differ, then store them separately. + UChar32 scf=props.scf; + if(scf>=0 && scf!=props.slc) { + hasMapping=noDelta=TRUE; + value|=UCASE_EXCEPTION; + } + + // If there is no case folding but there is a lowercase mapping, + // then set a bit for that. + // For example: Cherokee uppercase syllables since Unicode 8. + // (Full case folding falls back to simple case folding, + // not to full lowercasing, so we need not also handle it specially + // for such cases.) + UBool hasNoSimpleCaseFolding=FALSE; + if(scf<0 && props.slc>=0) { + hasNoSimpleCaseFolding=TRUE; + value|=UCASE_EXCEPTION; + } + + if(noDelta) { + delta=0; + } else if(delta=0 && props.scf!=props.slc) || - (!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) || + if( (!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) || newValues.contains(PPUCD_TURKIC_CASE_FOLDING) ) { hasMapping=TRUE; value|=UCASE_EXCEPTION; } - // Simple case folding falls back to simple lowercasing. - // If there is no case folding but there is a lowercase mapping, - // then add a case folding mapping to the code point. - // For example: Cherokee uppercase syllables since Unicode 8. - // (Full case folding falls back to simple case folding, - // not to full lowercasing, so we need not also handle it specially - // for such cases.) 
- UChar32 scf=props.scf; - if(scf<0 && props.slc>=0) { - scf=start; - hasMapping=TRUE; - value|=UCASE_EXCEPTION; - } - - if(deltaprops.scf=scf; + newExcProps->delta=delta; newExcProps->hasConditionalCaseMappings=newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS); newExcProps->hasTurkicCaseFolding=newValues.contains(PPUCD_TURKIC_CASE_FOLDING); + newExcProps->hasNoSimpleCaseFolding=hasNoSimpleCaseFolding; value|=(uint32_t)excPropsCount<=0) { - slots[count]=(uint32_t)p.slc; - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_LOWER); - } - if( p.scf>=0 && - (p.slc>=0 ? - p.scf!=p.slc : - p.scf!=c)) { - slots[count]=(uint32_t)p.scf; - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_FOLD); - } - if(p.suc>=0) { - slots[count]=(uint32_t)p.suc; - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_UPPER); - } - if(p.suc!=p.stc) { - if(p.stc>=0) { - slots[count]=(uint32_t)p.stc; - } else { - slots[count]=(uint32_t)c; + if(ep.delta!=0) { + int32_t delta=ep.delta; + if(delta<0) { + excWord|=UCASE_EXC_DELTA_IS_NEGATIVE; + delta=-delta; } + slots[count]=(uint32_t)delta; slotBits|=slots[count]; ++count; - excWord|=U_MASK(UCASE_EXC_TITLE); + excWord|=U_MASK(UCASE_EXC_DELTA); + } else { + if(p.slc>=0) { + slots[count]=(uint32_t)p.slc; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_LOWER); + } + if( p.scf>=0 && + (p.slc>=0 ? 
+ p.scf!=p.slc : + p.scf!=c)) { + slots[count]=(uint32_t)p.scf; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_FOLD); + } + if(p.suc>=0) { + slots[count]=(uint32_t)p.suc; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_UPPER); + } + if(p.suc!=p.stc) { + if(p.stc>=0) { + slots[count]=(uint32_t)p.stc; + } else { + slots[count]=(uint32_t)c; + } + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_TITLE); + } } /* length of case closure */ @@ -994,33 +1041,43 @@ CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorC return excIndex; } else { /* write slots */ - int32_t excIndex=exceptions.length(); - exceptions.append((UChar)0); /* placeholder for excWord which will be stored at excIndex */ + UnicodeString excString; + excString.append((UChar)0); /* placeholder for excWord which will be stored at excIndex */ if(slotBits<=0xffff) { for(int32_t i=0; i>16)); - exceptions.append((UChar)slots[i]); + excString.append((UChar)(slots[i]>>16)); + excString.append((UChar)slots[i]); } } /* write the full case mapping strings */ - exceptions.append(p.lc); - exceptions.append(p.cf); - exceptions.append(p.uc); - exceptions.append(p.tc); + excString.append(p.lc); + excString.append(p.cf); + excString.append(p.uc); + excString.append(p.tc); /* write the closure data */ - exceptions.append(closureString); + excString.append(closureString); /* write the main exceptions word */ - exceptions.setCharAt(excIndex, (UChar)excWord); + excString.setCharAt(0, (UChar)excWord); + // Try to share data. 
+ if(count==1 && ep.delta!=0) { + int32_t excIndex=exceptions.indexOf(excString); + if(excIndex>=0) { + printf("share delta: U+%04lx %ld\n", (long)c, (long)ep.delta); + return excIndex; + } + } + int32_t excIndex=exceptions.length(); + exceptions.append(excString); return excIndex; } } @@ -1065,7 +1122,6 @@ CasePropsBuilder::build(UErrorCode &errorCode) { } makeCaseClosure(errorCode); - makeExceptions(errorCode); if(U_FAILURE(errorCode)) { return; } /* @@ -1090,6 +1146,9 @@ CasePropsBuilder::build(UErrorCode &errorCode) { return; } + makeExceptions(errorCode); + if(U_FAILURE(errorCode)) { return; } + utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "genprops/case error: utrie2_freeze() failed: %s\n",