From 70621f89239ba0a38d1c7694b6318f57c88070b5 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Tue, 25 Jun 2002 18:53:10 +0000 Subject: [PATCH] ICU-45 new builder for RBBI rules, remove obsolete RBBI files X-SVN-Rev: 8941 --- .gitattributes | 14 - icu4c/source/common/rbbi_tbl.cpp | 246 --- icu4c/source/common/rbbi_tbl.h | 235 --- icu4c/source/data/brkitr/charBE.brk | Bin 7552 -> 0 bytes icu4c/source/data/brkitr/charLE.brk | Bin 7552 -> 0 bytes icu4c/source/data/brkitr/lineBE.brk | Bin 10796 -> 0 bytes icu4c/source/data/brkitr/lineLE.brk | Bin 10796 -> 0 bytes icu4c/source/data/brkitr/line_thBE.brk | Bin 11892 -> 0 bytes icu4c/source/data/brkitr/line_thLE.brk | Bin 11892 -> 0 bytes icu4c/source/data/brkitr/sentBE.brk | Bin 10928 -> 0 bytes icu4c/source/data/brkitr/sentLE.brk | Bin 10928 -> 0 bytes icu4c/source/data/brkitr/titleBE.brk | Bin 8116 -> 0 bytes icu4c/source/data/brkitr/titleLE.brk | Bin 8116 -> 0 bytes icu4c/source/data/brkitr/wordBE.brk | Bin 13576 -> 0 bytes icu4c/source/data/brkitr/wordLE.brk | Bin 13576 -> 0 bytes icu4c/source/data/brkitr/word_thBE.brk | Bin 15072 -> 0 bytes icu4c/source/data/brkitr/word_thLE.brk | Bin 15072 -> 0 bytes icu4c/source/i18n/rbbi_bld.cpp | 2093 ------------------------ icu4c/source/i18n/rbbi_bld.h | 358 ---- icu4c/source/i18n/unicode/parseerr.h | 88 - 20 files changed, 3034 deletions(-) delete mode 100644 icu4c/source/common/rbbi_tbl.cpp delete mode 100644 icu4c/source/common/rbbi_tbl.h delete mode 100644 icu4c/source/data/brkitr/charBE.brk delete mode 100644 icu4c/source/data/brkitr/charLE.brk delete mode 100644 icu4c/source/data/brkitr/lineBE.brk delete mode 100644 icu4c/source/data/brkitr/lineLE.brk delete mode 100644 icu4c/source/data/brkitr/line_thBE.brk delete mode 100644 icu4c/source/data/brkitr/line_thLE.brk delete mode 100644 icu4c/source/data/brkitr/sentBE.brk delete mode 100644 icu4c/source/data/brkitr/sentLE.brk delete mode 100644 icu4c/source/data/brkitr/titleBE.brk delete mode 100644 icu4c/source/data/brkitr/titleLE.brk delete mode 100644 icu4c/source/data/brkitr/wordBE.brk delete mode 100644 icu4c/source/data/brkitr/wordLE.brk delete mode 100644 icu4c/source/data/brkitr/word_thBE.brk delete mode 100644 icu4c/source/data/brkitr/word_thLE.brk delete mode 100644 icu4c/source/i18n/rbbi_bld.cpp delete mode 100644 icu4c/source/i18n/rbbi_bld.h delete mode 100644 icu4c/source/i18n/unicode/parseerr.h diff --git a/.gitattributes b/.gitattributes index 571209abdd0..1e913a86b83 100644 --- a/.gitattributes +++ b/.gitattributes @@ -48,21 +48,7 @@ README text !eol *.spp -text *.tri2 -text -icu4c/source/data/brkitr/charBE.brk -text -icu4c/source/data/brkitr/charLE.brk -text -icu4c/source/data/brkitr/lineBE.brk -text -icu4c/source/data/brkitr/lineLE.brk -text -icu4c/source/data/brkitr/line_thBE.brk -text -icu4c/source/data/brkitr/line_thLE.brk -text -icu4c/source/data/brkitr/sentBE.brk -text -icu4c/source/data/brkitr/sentLE.brk -text icu4c/source/data/brkitr/thaidict.brk -text -icu4c/source/data/brkitr/titleBE.brk -text -icu4c/source/data/brkitr/titleLE.brk -text -icu4c/source/data/brkitr/wordBE.brk -text -icu4c/source/data/brkitr/wordLE.brk -text -icu4c/source/data/brkitr/word_thBE.brk -text -icu4c/source/data/brkitr/word_thLE.brk -text icu4c/source/data/unidata/UCARules.txt -text icu4c/source/samples/ucnv/data02.bin -text icu4c/source/test/testdata/importtest.bin -text diff --git a/icu4c/source/common/rbbi_tbl.cpp b/icu4c/source/common/rbbi_tbl.cpp deleted file mode 100644 index 7073b73f081..00000000000 --- a/icu4c/source/common/rbbi_tbl.cpp +++ /dev/null @@ -1,246 +0,0 @@ -/* -********************************************************************** -* Copyright (C) 1999 IBM Corp. All rights reserved. -********************************************************************** -* Date Name Description -* 11/11/99 rgillam Complete port from Java. -********************************************************************** -*/ - -#include "ucmp8.h" -#include "cmemory.h" -#include "rbbi_tbl.h" -#include "unicode/unistr.h" -#ifdef RBBI_DEBUG -#include -#endif - -U_NAMESPACE_BEGIN - -//======================================================================= -// constructor -//======================================================================= - -RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables(UDataMemory* memory) -: refCount(0), - ownTables(FALSE) -{ - if(memory != 0) { - fMemory = memory; - const void* image = udata_getMemory(memory); - - if(image != 0) { - - const int32_t* im = (const int32_t*)(image); - const int8_t* base = (const int8_t*)(image); - - // the memory image begins with an index that gives the offsets into the - // image for each of the fields in the BreakIteratorTables object-- - // use those to initialize the tables object (it will end up pointing - // into the memory image for everything) - numCategories = (int32_t)im[0]; - description = UnicodeString(TRUE, (UChar*)((int32_t)im[1] + base), -1); - charCategoryTable = ucmp8_openAlias((uint16_t*)((int32_t)im[2] + base), - (int8_t*)((int32_t)im[3] + base), 0); - stateTable = (int16_t*)((int32_t)im[4] + base); - backwardsStateTable = (int16_t*)((int32_t)im[5] + base); - endStates = (int8_t*)((int32_t)im[6] + base); - lookaheadStates = (int8_t*)((int32_t)im[7] + base); - } else { - udata_close(fMemory); - } - } else { - fMemory = 0; - } -} - -RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables() -: refCount(0), - ownTables(TRUE), - fMemory(0) -{ - // everything else is null-initialized. This constructor depends on - // a RuleBasedBreakIteratorBuilder filling in all the members -} - -//======================================================================= -// boilerplate -//======================================================================= - -/** - * Destructor - */ -RuleBasedBreakIteratorTables::~RuleBasedBreakIteratorTables() { - if (ownTables) { - delete [] stateTable; - delete [] backwardsStateTable; - delete [] endStates; - delete [] lookaheadStates; - ucmp8_close(charCategoryTable); - } - else { - uprv_free(charCategoryTable); - if(fMemory != 0) { - udata_close(fMemory); - } - } -} - -/** - * Equality operator. Returns TRUE if both tables objects are of the - * same class, have the same behavior, and iterate over the same text. - */ -UBool -RuleBasedBreakIteratorTables::operator==(const RuleBasedBreakIteratorTables& that) const { - return this->description == that.description; -} - -/** - * Compute a hash code for these tables - * @return A hash code - */ -int32_t -RuleBasedBreakIteratorTables::hashCode() const { - return description.hashCode(); -} - -//======================================================================= -// implementation -//======================================================================= -/** - * Looks up a character's category (i.e., its category for breaking purposes, - * not its Unicode category) - * The ignored parameter is used by derived implementations. - */ -int32_t -RuleBasedBreakIteratorTables::lookupCategory(UChar c, BreakIterator* /*ignored*/) const { - return ucmp8_get(charCategoryTable, c); -} - -/** - * Given a current state and a character category, looks up the - * next state to transition to in the state table. - */ -int32_t -RuleBasedBreakIteratorTables::lookupState(int32_t state, int32_t category) const { - return stateTable[state * numCategories + category]; -} - -/** - * Given a current state and a character category, looks up the - * next state to transition to in the backwards state table. - */ -int32_t -RuleBasedBreakIteratorTables::lookupBackwardState(int32_t state, int32_t category) const { - return backwardsStateTable[state * numCategories + category]; -} - -/** - * Returns true if the specified state is an accepting state. - */ -UBool -RuleBasedBreakIteratorTables::isEndState(int32_t state) const { - return endStates[state]; -} - -/** - * Returns true if the specified state is a lookahead state. - */ -UBool -RuleBasedBreakIteratorTables::isLookaheadState(int32_t state) const { - return lookaheadStates[state]; -} - - -#ifdef RBBI_DEBUG -// -// debugDumpTables -// -void RuleBasedBreakIteratorTables::debugDumpTables() const { - printf("Character Classes:\n"); - int currentCharClass = 257; - int startCurrentRange = 0; - int initialStringLength = 0; - char buf[80]; - - UnicodeString *charClassRanges = new UnicodeString[numCategories]; - - for (int i = 0; i < 0xffff; i++) { - if ( ucmp8_get(charCategoryTable, i) != currentCharClass) { - if (currentCharClass != 257) { - // Complete the output of the previous range. - if (i != startCurrentRange+1) { - sprintf(buf, "-%x", i-1); - charClassRanges[currentCharClass].append(buf); - } - if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) { - charClassRanges[currentCharClass].append("\n "); - } - } - - // Output the start of the new range. - currentCharClass = ucmp8_get(charCategoryTable, i); - startCurrentRange = i; - initialStringLength = charClassRanges[currentCharClass].length(); - if (charClassRanges[currentCharClass].length() > 0) - charClassRanges[currentCharClass].append(", "); - sprintf(buf, "%x", i); - charClassRanges[currentCharClass].append(buf); - } - } - - for (int i=0; if&q5PkZxL`$+`AUyJ~wOz;IQYv6ZS)ASx+Z z^uT}9{G1+(CP4BN`VZ2#%Ux=hTv1Fy*@tEba%X2|-@Kh!+LnyypO0}BsGy2Fhu?e; zCB4bx-=@Q}^O=9Mx9xwrzrX*9|MhqlOvi)Sa55f@{5wy=;W!9GzdtxR7n2ML{uQ7Vzd*S z6Dg!^!qsO2Z421lI#$^p;fb`{5kE5G&myo%kqz{Y0f(mLD&9(*gs>YD*w6KhxXoM zzF%3fkzngJuP$U7jqz^*W^=6%QK4+{NvavO8s^!gpc7F)U*4hM|9X`GlWtNEFn*X1_0t%}_x zw;}V|Smqgiz#M|xBQ*++M~HFk{09abjk^ueH(iMy4lK?(>JKAk-hyEs75u^qPn!yqXrt$8rGsFno$d_$b%QHV?Ao4&FBs~e7deK z*LR&B9le2#T)&Q6G;U(^oz^eHeQ$qXfify^1*t2#VqInAD2l<~iOIXNr8V_K0Q?1o zwPf3fY`G;mR=@Y03edqjbAnR8mqaBK-7$sx2+T5|4j%KqG)L^JETfNxdB_ z5|dnNw$+KX?9FyAN^;~WD!&Mc&sYJd44aH0FQe^K`B?#(pDb*Dq1oH4QNUa(U#lcf zf|V$VYq9bsu~`&rwKd7cmTwWI7ZSH1X=S36pD~4~cC1;XycaRmB@nqTy#D3}b#mDY z(^0O@G|V8x#EmVs&GeV2C;8PouaIK`C8n{qm%`;+|JJ?Ml~RP|U~EOZb&>VlIVQC7 z$R6j{IAR+cr82%0Fx41q zD2GT4Sq@eKl%g+i0jRtX?~r8 z)ZaXH_On>lVxu;VsmN=K?X`X41EO0xiP;lhdnwUYJqmNy3s%D=ZDn+OT`RQB%%^SX zQB=Oh+BaBx-HR3k`!+jNMCvMNQB5>PO)DH-$nJib-yE7X1vxJlH)eL7jhyxCZpF2V V{pNmkkIa?keudrqD#DCY`hT&^i}nBj diff --git a/icu4c/source/data/brkitr/charLE.brk b/icu4c/source/data/brkitr/charLE.brk deleted file mode 100644 index 993a2dfd59b4b321ce347ae8b81740e4f8bf7813..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7552 zcmeHMU2hvj6g}~0Hnx*SDh~*d_Ms9@L?ZE+wop+&@!?1WR8^{?V3KWOFm`3TMIavb z-@wn|5djkX1pYxe^SLv->$O|f@dLf9*|~G?opaBo_9rqi(3Kfe-M*d0%v9)|tV zML3>aMU&}a_|b3}o(Pm_coI#c$@A#q&}2~a4yRijzg9V#9B%@T-U9x98~BF7=m^LHE{;4yl^DelsLM*Ag37}FbZC4Az#gC4DSaEgODD5p9ql2Hon99Kk| z5^2ocml6%T2RPeR{%`P1+Wm;H81b`>P&@sIUl{R1M@)$^;dRDb1C`-7djC)v67P^4 zL^b%6-oJ$RkfVbwS$7gAP;Qa4K^6N%XVSIo)0XA0tZB~s62bKTnAqQ9raOO6ciwm( zGT-;?*if>q2M4&Bis*6YOrD;o=R-z(7JMC?r=q&D=Tmv_v&%Dy_XeYnf=9fcrQ&;x zykPB5I3l78>C1iW zc^&(N+y>0+W4X`pW9Cpi)GRojLgD!Nj~zA!cMsy+T-NR$6ZMl5lbpeQ&+7MZKUGI^ zwn?^h-^H_{B!;=LAvNAni7TkaRn(-VhI(8_BW|D>H?bLSB8UTQ#an2_Ek?KFw!!dc zbhs8{?uG4md%kzE2Rrf3;;7dVuRE}w4&a7=#Wzyd+&80qzSwmzwGJy=K*KKnH_D50 zCDS*;XKX2hV1UJ8sa>mDX8J*DxgzY4#1~e$Dy}}R!_s%StLx$FP!TG`@hZjTlB+Ku zm_l}!xp3teoiybDCMcaw2OBA=E+aq_6<;<9OS}#zgTOy!DoL^&Z9%_tc36|#>Kz#gSl%FmcW)D;^RDerYGO$$V>Ywy2Xph+&z!gZ9ZGYvBc zadDN!x4HiEbemtl^BP$ZC^3y!UJ93Q{VSif>69icC#MwSHcd9`+}YA9B5Q;GyCO)) z!u3>uxa_0QUa_=FlKP6IpoK*){WzEq`%*}|v6ebt!y~r6T`J?Xfa!zrrgDfV$a1ij zKw0($UIMBpBv`Z5)+|N6S@RzhlA3txiaD;*TS1$@Gi`D-QRTgY8bArzW83>C;mUom$o(YBevCz)y%??^@82Vla@B-d-HY}o14$r(zED1KgP5E z1~qjS{0L8eEuPRaQqw{6dSWp6v!0ba6RuDn+9Lf3B1Kw4>XWvHZFbwFO*dh8>j(HS zAmWSo?{J0367`g@u=|K-A{e*59tK++3~)Q*+b7bEi(7IuTr~)XUXMvA(iaDXs<= z?$%Z+ki7VF5bC@n&md7yY{=`YCkpGFg= zY{5}1BKMn6K|WPvwMb}qoQ6v%JB@yMT-M_`xFN^M?3i=T2lHnyCFEJbxFuV^s} zu1l{)xGnv<;6ok?!Bj9E=__y@cVS&>KM!)j?nf$0k``eJIWuG8`6}8f&U2`v@imyi z7$(s>BTZsYk&@4Gsi(apW;Iwgu;x)l>%UDQ5GJizk23bA%w5S?dnNKsp1+Vjh~2!w zZXUBv=vUz>JdbI_HMFVYF4&l~Hr|2xXa}ysgkFCYp2qPko+t1;jprFW&!YG5yjy{Q znc(DvJCU96g(ET|E6$5K@uGN1Tog;T0MysdFlJhc>V1Dzj%Fk zOPojCkHkH^K15XBt98t%org)xiCw2^7fMpT!grpm*2~e(QO++T5?2B3*d4SjAyNgo z+>xAB&Uis}#-aXdhP!MW*U4u9iuxA(a8M!jfFmwL+3~j5a^^!*|U%?xtBO z9Km(03Sv7YD?C|`G0$u8tgLa$AZ6+}u1IEstYiH&u9%ipt}1a6Ynq=Ct^KL|k-vA) zL!2F&nVgf@^rXFk#|pe9;~{?Po;)tG()l8W=Ds8u6VG(!DHG|KOqt2W=1F+!Ida^< zp=^!g-DXzISGp@i zI1UWS&oE-yP9Fz0-QVPXKW_7ba8Sp12&?8$;uu!;597UYIAU@H$4A1Wa5Q`ZJkQJH ziPyZB&&rX$IF93aZpQIlCxfRmnj3Wry16KE=xo*ewpzy_J^P9}?bsNMIj-;f(uMEj z$YG%%X9dSaDy@7!Hu6S)TPkjZPstJjV&H6UmRQyIDQ+&cR6f!YHX*qqiEkPqBM8jp~V*P2sT+uIthubQYJ6sEo@q&$+!d(#S5nWuXB?k#t>Gy4?4 z-~|IJ@#w4O81d|sy3bE%QgtPXRm7NmN)zCqtuIme7)<+}+E zvanhTHX&dy_BCOWoVxORdNb z0N<5<$VD879?{N8TefHPu zYnlC~GW=Pao$0r#i_G_$-7Z{iY02PU%BUhNO3}-?{PfD!KSN+X_SrOaHDM+x~MjqN8N z`5^|XF$0>>8g7ittjUi;17xPS(b!zmOvBt5lhC-uejoyN z7TM)-q8fL)zF^ZNp1BH~<*pb?3 Qyvk&aRSUXu6=>Z54`er_5C8xG diff --git a/icu4c/source/data/brkitr/lineLE.brk b/icu4c/source/data/brkitr/lineLE.brk deleted file mode 100644 index aa06ecb0079db0ca5b895d06b5953734719a9bd7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10796 zcmd^F-D@0G6hE_*>}EDeYrktNIFZ#Rwi~WcADSZl2qHyVLhF-l4BPCsNt{lt?VfNm8&OPV%yXTyl zolPc!ACK$>Y8g0t@#RZ$h##}7Yj3SDUtih?CugU^=T4kBaV$Jn-KeZrOB>6p)zV6M z_I7Q#TB+5-^QFZl8ili~>ub|tsag(KH;OfES`CQ-=6_^WfI{1DS(gh{07eg)ZjL(^Y+_e?}=I&noVl z*_MM!!1AjSwFEaMt`7m^VGJh2sgU(mxQba=liJT97w!t{4_HIe5-cJ!$_pJc@G7oV zoM%x-<9R5eu7K$5senE0lrYDoo^nadYOrMannM}cf1C0^KFMY+j@TPAc4f=E(@EASMaN3_RPT(f~$aD9@k$-rEkfh#bf?XSSoIG)Dy7@jkDp2YJM zqJIZBvLYl}fb7_}_`QRmPe5-omwI z^i)MIwH zO}kQf2UoEwi0!1T@MJw2p7ZdmtZ|AUMd~P86fgpIG-$QaPoc$>ta4R}3s}>9k7$mk z;zxLIBZ7$z&1}v|-*l(FhQ~6zDfdJC)HQij`bx)(7}|MBGA5qs$WtWJd$MIV7ndjb zQ}>ahmLH1NI9_db#hk+EQHuaPJ|gZj;6)kmBOivMA*mUTh9MheAs6LfBpQJr!rNe! zhtX&hZ-%ioT3KV1j-zB8c0@bOiUy;d(N1`b2*-!uaC8`+ z$Y!&7Jn=K1%@50w{_s36oAonZ;CmT7y^-9AH>R795|7SiEoiCrJkm2(+-YEIFzWe1 z5J-r?%Msw%n4FDyK2m9h{m97cy=|#|qx+OBAs`0c*2@xBJ)dH7sioqPmhcjidy@E; zJ7mxO{H&(-8o8IcTxB< zREipTCIQVIlzJV&RDq+SiMu$ZZY9w8YEeU9M90Pt0-m)rQp%Qnj8AI_B)M_R*5D_C z!{?F?gl!ezHo(8z0HZAbGDFF2DAUP9!WK@_E4MOxL8@n=SK)qC~8r)T1AHzuk+IhNK|T!or7={2~@y%zLmKs4kX zC+-l%ii(4&;^zn-=EM*+Bjl59&Z(pHHh0%kOr)Q;FX&T=$3Hbksiy$sMswKk#zsFo@l+ej=VfTKJ~a~B)nSj7L8_Pbb^681F=ii5 zyc_sI7EVjKO$Zpp{Y)%LPJQ`1y_0)c(i6|kmT=T~Ne9vfvFX6ez)hTBY;)3Evb{@1 z_q`Cd%z1?{uPV9&H0emz8vQQzSP+pUsLg4DGY!=q>iB`&2J)CVz!sFcqw_8 zHyYyB2mijrAqcY%J)+#`$wMEQO6d=ke*52wwJL%~tKVu@?d#jycl-VZy|O+2EmjSh zaF6l2eJpdIR0cn5k(pkrI*5F?(d__oTPFQ~D5HvSD8&!w`s1Jvz44NIAA zgD=}j8`8Kq`51>7sKyLvM%=iuXJ$<}q)Fm{WNB;+1EXOZ_DSBj8Jk$3J9=lZvxcvG zAWwYB1EYzrc;JZMbIWPW#$L(BZj=$WQQk<)-Wq8@;wX*Ei1NtZXlA@AZpwxka6U$G arOQBhN^9Jmy588zVU1M_x^k6Q)Bgv@h@=ky diff --git a/icu4c/source/data/brkitr/line_thBE.brk b/icu4c/source/data/brkitr/line_thBE.brk deleted file mode 100644 index 8814f534ad5ab854b219080a7b1275f05d860bf6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11892 zcmeHNOK%*<5w0DQv)tuy`H*Z`_8}ufhDpn$CxCMh4MUb0$1nm5gmkkk6O_0X*OEwr zyGkO!hXH~hx14nG#g7DrokM{91OxdKL>wRif*`lt64-*_^;gwX)AL*|xx1tdqz66S zU0wavSJlBQB+oUb|37uZf|w`h1Es>#}_YNeBOU!yVKm; z-srS;wm0tjuikIBwwvv?|K`T6E$;cNJA1p!{>JvEzth=j?zNZw%XjbkR~e=4Uv0LV zd%tLIE~|yOJ%!$RqR9@Chq{1z;UUq|=cr$x?qhbHHmOCosDt|sZPNzr(E~_drt6Ig zEj5-K0q!e}S8)Gn<8|Dx&<%PCE8BD*x_i*QiZbfKOpS)|aUI-m zBPu)avuUD2*Kw`Tn~<)+vgtSD{wlpI{cL=aV`Im7aD$eBQN9m1BJVcg%Qj*o9@RE$ z%}9Pzm~GG<_LRn`Ms@Q11%EM`-=>?`sa=`<*suC?-)baSx`CZ<%AE1)E$FIOtC-`y zGE}7VK;&q!&1sqEY+z<>+Dc%pLB{Ld-3c& z%!zM}pG9*WSmCwn;LribmuUeja(InrilnqC_`A`5{!(^!Gr{^=zX^UD{4V%_mVyt1Pl7)Lp9Y@=e+>Q- z{4@Ah@bBP1!GDAQ1>a!y+is=X=>Di6rY6#NWCz|e@s~`r=i20{o=A={PSh2gn~y&I z8uv#_!DU!@7<_{JBgDdFay}ZI-RW~=vzp~C^JIYa6}l2-jp^@5^fV`!$vU#-KJtUJ zT_b4bZ%JP4$;>9(-CdSscbs!J$!{8%_RN-_ z?}E%N-7DkyP2?1Nxgfih=cXs|20bSnG{P6;oTOPYM?1fW73O@Xr|UQk=GkXXna21z zdLP&}emqLD<8j$1%LQ4RJ;hnOoQ3b=YSFL6S3cwFnL)_!0kco;b=DhwQlxj2qm=hI zDNAxm^JG@~T)U9K;Ycpv>39h;n}?Ug^W2jPdPyygU0x@tu?Sp#N-FDKEQ+vRklc9( z&s%;*7M;!aF>{^x6csVaui&Vd5?1iWBbW3X^qp{w#=>zL7s&)ogf6*Zm8#(+O@=k9 zg>|ZjQ#2KN(ALWnZ5>TRJ{>lwk&^1Hwlg#n&PE8c(43>WSbv`8<{4yoRYHxD_@yKQE(s*8`O@+dmk2ceo|zH{f%!jDQ)q2diaP4Sk5s<&uj>USd>O?h5e( zJv7Lpz9*PlQ}J<`H%H}oxF)D@m!m3SbQ9;-a!m9%qCHCnIXUdPZm`LOnZbbas|QGY z36)Y~WESu^Wu@C48?#xEn&LrPIXB%)5ZhO!LM*0}wC^}()N_$e_G}~jtiwjKjwjYy z`rK;h{LG9y3fN8=8I1CA)LB$EV)yIv&3lAG#Q#dN;grSDf^O zj!Ifxlzs}t(xG>lhS-W47)V4VWzKq(5}(|_616?Bsg*N2()O0a4Rt0;2*r#Kg~%hR zd}~TMtLnK{-o0ga`_s<>j9w6*kl~t&zG}{L%K@m&R#5o+F1~i6sTD`*UI`Xe`7TKIMDJ}EdDe%mz;g+R~&QH4VpPq(O)U70dGpL zsgmNA$W?6C%P3N1^hU!xX%xL2rF_O%$m4sm#EqV0))(pV-eg24IJB8E8WN+%KZvy^ z!S}m8$cvMkA2@C&M|~)k2YoF{zbPF5=y3Ztl#m?)WYFjy!qkyeNB>g98IhtEy&T6M zm;9lbQZNI2cIu4z$c*2l3zSbzG{^70Oii4q@PC)I2&p31lx;u%-%?~cu6iFFN_5hkGFY{~bcQ@*{R$Kne{H*`PsZ*yO^Dj19wT(u#RbOjVZ~GTs zZPpvLX4Ah^U0$V@KfkuIKIc~(EB;z*wYJfm^UvSD?O!33rhlc@tZlqjTba`qq&|Sv zG@h;fc$V;-0eJ5uz~>c!ukqZ(a~hUl1?sR2ExfKl1FEnAcTxH@T!UP>0(0d6ucyoB z@cK;odAwdOFTxp|*@Rb7cLQ~A<4n?r?<>&$AwIf_$fQ$48?3dN+kA5Ksm*k~I`XM0 zymLq~%+PBg(_4&p?z?oPb>vrNjqp3F`enjaXwjaV&Gz4=Ax!=O5tf8MZ6_sn~ z>1nux($lc0+O?=X4=-`s#-;e!eX%1REJ6h_ve#h|ZBVpV(3b{ZDTXV>T2bUTnOPNH zVNZvk2qRE}8GqK7@dn()m0IV~kD%aB`t84Egrrqm`5KRjS1+Qj_G%u-sI3`lq-jK? zXpqe@9@qSuuxeo~AX}RMB_m8DdtoBh$#xwxjl81x)_LaRD!1-Lqjzvje0xrfwa^O9 zT|y2mv2!B?fa{oM9cn|GY z;C`UA-`7rB zI26elygQynnaRVm?0Is%DOZdZ#V)C1Ys?}p-FGzWPAt-}9_QS-f%!xC$h5urCXQSK z+C^zhk~1hOQbK^79HG>C$c1?r3Ws1{xQ|DM!(s434+>!c_J{kS7~(@K9D&hr6dzq` zHqNrzC>=w|7?i`Z(MV~HhvP6I@{sDua5B=G3a6rWWF;m!5FS9u!SEm)ij8E=7>GPP z^#9R+1mN&)uGwDL3J=2r+HUwjcnFTTNB=SG5#b23cu<&xR38nG!m;pJu~;0(6R+dN z@nLR77tiyG#nPCUFL`{9^@}YB4va zqU|yzTV;2+T2!b>QROgt*8VjdtrkVJV~HW#-H!A68??;y2b7%MN8)p+G&Mpd0red# z?e@@+$pUAHI<9hJxRW5Vua#1mSPpO;?8NqVqjt?ZKy8Hqb?%{BI8!I7Dk=2n`@ zKG_vm7Nj8<`-#!aISI{#fP8kmj7jwPq#&@}`G`pnczE=v%tZPd^l zjCm~?P^~QlkekS1q3^r+wG&TmZG4 zZSSZL#B#qMi`-8NyWiW|`VAyxTL9@dx?3={E!Ca>P(&HwP*Wd{qsJvZG#v%$;ImVY z$RjiQB%PwXccMx9>`Ttmo`{&EH_m*BE0T*Hyg$&{H*n ze;ES4tCoj-c%6VU{_#J|fBu(Xi2qhBbiocDZV5F_<{^y3%Kedc#h&(LTLeChv~Ht7Ue7`ESfihr(~YSGFmilcAwEA zxfGq}MoW+AT3aN$M$5%A7Lz^_1MQ>6qD)G|l1LaWv1sCzhDAKnut*-~MLSN%-e761 ov=vHfQS8NHkxaO;r=N)MEh1I${VBRPCJHN&l#uiapr)hz4*=L$vj6}9 diff --git a/icu4c/source/data/brkitr/sentBE.brk b/icu4c/source/data/brkitr/sentBE.brk deleted file mode 100644 index 7c61174f64485121b281926df841204cd0886089..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10928 zcmeHNSx*~R6h0VZ#$zB&LiVMT3JF3Qnm$5QMF~$;lPXHz(xj0EZa|_2*``vZKHQ(v z$NrwGQh$njmb=d81rOmNGtAs`&pqe+&N4IDFrfZfhgE<%sKJXjKl}uG+dV${qj&i6 zV32I=K1{yd-roK?`Mx{o^t$cA;c>Tpl)O0YA9g$ae)6ilcR*6Id)zzOO4{B1Gijg=SN4$8$#(PZ#w->@AuCBmSM2t5=v_T^}hfjJl65p_#0sJCLXBYe1fZeIE5bW zAK(a%;XC;GUx3H|4dFdJ#nlgR2p^$~D?RAo4A*hrfmf*8K^u`fR^EkQSx(~$+7awA z!VbJdHu5KqQ`EkMHVlxf%j?hKF}&5QeK^6jJ!B{DCM?4e#K{ITzs2YYMR59v(8f3i zHp{t*1 z&LrCRh#dGMi{qFI0lt9^#x#`?hHRDp46GjEEW8@e8p$J!B#r5EGo7Kz7RKC=hSR0Nmdvbqvz&FU3( z6|SmPSXI~Hn!3(5*U{q!+`zN!H{qt0jV?Xjf?H^F8*ZyR+AD4GdH%Tq7pDSuRJfmK zWOwx)au@HIdvH(Phx;lrTb(vDv17fy(4SnOzwGqkL*VIkI3d&ViF7=bIxlmU6t55`16+ldH*OGeI)ZD`pP zz&8~KK6@eo2ndfvGv1zODYeZ#=a+j{+FZ&rDTlM&GO-|hMpvf5P8M!sR8g)Wxh``1 zOBmSsskbp=2u#Mf%BTYomnk>m5eU4@ca5Yq9)>tfU4;o8gK88{EMn$*#M0z$q>s38 zlU-q!Y1V=lrGs`Ei|%}4^+F*rqRUH!bUZFnmZ+koTMF5qminD-(W5yV(8crS8-Kd6 zxK*Y!1VIX#2XMpT;$lhHL*u}6)Wq@|!(1#1C*+q|sgqeO9W_7M5J@_5&)4y%iX)z$ zcVe?uFIPTtFIRpv{LC4eBb-)bi_l zUPmDTq@Q$th?B`CJ(rJ@=&p~3aYMe7PKIlWrwb-iFBMxzBb1GnI_QPf&Rn2a_<5xV zB=D*d(3$!L%1=3QfjSA}ao_JV!A;fP3`(Ft>R>+gL|+xg-`*6gH(DBlWc`J#%-x;9 zK8sMLrm9PPyzgu0Y$aXba#qJI@Veoqbslrcy1R_0v)82bXCu9Z{Av8u9KFPz1_ZEk z!J~5#UfQ1j?5IzC187Hly2o@FoO+FKE%&(apANok95N&O4|T*8bQHz(Cq%RHJmWQ~ zLRiA{co-5H+S=OX`0<)MqMx7C>89&(vILfw~l%_$I+E*7-)%c@w z4u4ae|4irahsi2YRP~|Hr#p*W{VxUTNG3w_fG}gm02TTXlDBk$bVGUkQMeRU&i{9d zgB|n#yVduoIK+|_-B#&G=($+biGCkP{&KzUc5F^H;$ul{VSi>V`-RRR2TI1_Ju)=42B?!kQjLhig ztsY+?vib$A*fPhyh-YWNz&MnX(HBKv@FUDVW_x@k$>J|#PDW2yE*@te2fSq8Vw{$z nr@RbKXixgIJz!2N81OP4ejP+lRVvcEy3y+rl(U?vu`K-$0g-8i diff --git a/icu4c/source/data/brkitr/sentLE.brk b/icu4c/source/data/brkitr/sentLE.brk deleted file mode 100644 index c0c1668d80b36d6e99d828dd55985eb235a54c60..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10928 zcmeHN-ER{|5T7`XeLg3lKqvtUJ*r4apalAupb9~rszQb68;~LkyAUaLBHO7{sSo?- z^s#?WRjGf9J0JUbcRqi_7xKU!w>vvK^ZU)r?j25I1^;ZW0ksM|d-dbbEMm8ReDX){ z@WaPJvbDdRe7C#1`)%?=chKo|TZ6;nZtEy{cG^GecKZF~MeF@Xl9K)7-pNkV>b8^P z!N*Rozmq&YI!ay34d+ciKCKLB?eqHt_zdig$$fCcxiY0PFWqkM|=Qp%16f z!|NR!!7+RfZvg&%0K@+d;4!YggG2ZLU0mrw2WPbY2KL|u>h|D3%N;B4!>=r-aRu!N z-ZR1;JV!S2CyrCpK8F?zkgLnd9*P6dMMGI@cJQEQuyA&4y@|6KCX2U;T^{7=kN>SQq9seqUxHekZgQ57^jW) zE>0KO2KrEp62o?q-Cc>aja8(ov^bY`qthV$4&1|xwv#7TCKhAqM|97#@EPA=T>jid zdw|HH|8`>dLXXK29>Nx5n#u@6w#t77R`+ohp2qDn=5ik%W-wiD=FCuDMh-JBsuAtP zU3?~}cG=9*nH;cPSPyfCC~tWZs80SK6*I`$bZ4!IgODAE$@Ve7suaM2T7ZhGz@l1& zs;a`0T7sIYL0#1$f=ETsPz{Jx3{BO9Wwi_|Y6Vu+DqK>R$UIz&{S7A-9!8LUaz9hY({<^wuY+zljTiI}=#~bPf+T2t(;nrB&wD0Hn=L(#21=OwZ zI@dG%m9QxqZ>!sIN8O1`t5P!)J65Yp4VpCK8V!q4(>9t%OVPr}7Dwb`?K6U`GtB7D zh1872s@aU1j9Wv*(T>JAL(2*q7GlkGWt=jKF}Si!8IULa!xZ$u~FWB|aIPe@dvHZp`7mLCP`DIqc zji~vFS72F@hv4`#t0R3fmKg=*9As^=te|}g_{Tl)8P-q{Fa?vq&IMpTvIWbtPRy3e z7TCSDg28q4Ml+{TXDma8){>OQAD;S=^)Hd%=4i-yFRXU%0@5G@H6L0us4G2F%dek# z9fbste$)9OPA8l6Ts}^sr#>3SHTjuzI$TpcT`-w?sn|jqp=h+!!CqMH+y#n--&cA- z0pbRyb$5(sv)82bYa`u4{xp7Sj$UF<0|MB& z;88ycFKy4icGMj|0NN4X?lIj4r&{4h%e^lAw}USmhs?%9Yrzy3ehy4XS}9W z2us*ch9Qxmt*u>-AFsJC$3nKiBsI~Pi*nJF6YcRu3wnml@OEsz*%TeC`7(`1^ODRF z$NSIEqKbdQz}BQslXB5(K}&J1Mz1$%@=P0M)ETjy@soXkqioAbX%TWx}m)OC|rsv=kL3t zgPrj2yXEiE=nxB5^uF>}U~;+zz^}wW4f_V(i+IbGV;ZOygIUo8zZhs+74SV_D#mYFEDxb ns}?7;Cwp^`f;UHuLqmO{Uefs^=j>8IVe^Rf@i%{ypR;ga9y)7LqD+WF0QCa}-b^&_vy#5Pd4s~;=n_I&39$NRI2z4mI zb2x@Ws6Z8JsCfxx)IEe9{H($z>Q`Y;$|WtY!z(Rsz#A=Z!)q?@!YSOw&t13=AA@U% z@W5a^kod%~fu4$C0~&^0CVBn^6y~c7pP{hu-TMRu?~C^puiw1ic>Uu)_TSH+pvQ-W zAL#KHc?t%Ptum1bRolSIRS!f)&y3guLr%llL7ya&yR&<+80!V0gn5Lroplk{!p16a z=dCso(+Y=;f%2w<^y$&bSq!UOkX+k3u)Hk{#$(xb}IA0;$ zMSLA)uIV`#{^?E~TP!W(>16_EHq(NvRGHxIJ17bq+XuOrksVaTd!brG)sceckxFCJ zh)u@CbxqDSzw(a~OKk?faUWJN&5n|v+=(P!xD_UmO+sM}vnqA{{4PXUSl!&pc48zV z)f}l>V`;(7&8@BYCgCE{kxA0G`x>?6G~%iFwa|UW7@}kp!}-w~S%g)$xBm>N&cv|! znSVq&Awn8jX*>)Rp@`}Xb2mlJk7;OJvHJ{#wzMlav8cfzd21B9Ra_9gG)hrs)nqld zj+(jMWHdl?0wuu&o9gh8GWfe8G+{x#gM1Ir-pw?#Mx} P;mL=*#YkpK+i?2_YoM{8 diff --git a/icu4c/source/data/brkitr/titleLE.brk b/icu4c/source/data/brkitr/titleLE.brk deleted file mode 100644 index 909cb50ae0e3c5615a80c0b3056d5c7645535533..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8116 zcmeHMOK;Oa5T4j+>HGbF3pi3FplT{WNT@{|Xv6{0UfN?DAhe;a2uYQsNN{2Q5E3WE z5q<%j5eE)j_!rFVde>`vz0R}hp|MfX7KCg0?^^W=FYQSF5<0NKYiag zIXZ56w~DvDyX))gtKQREtJ0{ITPO8ex$146HBV}lX4Bg)9~_h973+=DHLqMd^y;nS zN~5{vJ+4;09YSe(JC$ao@vd^XCK<$-#UYQ^zJu2=!0+n-*&C?CYZb5iB*Ouep$Qdu z0Vi+-HK;=aHLswGx<{~w=LT$}egjIrTv2inUMqPE-YR()-f(#z&fpH7_uv71f@@w0 z9%_t-K0YyQp{Hcnf|e#%x%@I$fO_srz5qY+pPd4Hb-v;KyYmO{f1M}p2lo^`KIVR+ z$G?0D3L1~8GLZ>Y+rr9a4@5@Kwb(;VPQuwkpE#1evn5!G^ny@=JObIwx`JzAVimYK zqfNwg!eL^dyy;*78I}QuIp8uEGLU6i7-B;(%!c6#!xKi>2#m5(7-M5F&c>KWIQ}>KYlOySrvuWLepdcj-p(5~R zj|gH75`~IO@f*A}mG2y+(Y~aiSj3>c7AM`L5{NHpOd}N3H+mxZ4@QmkONPOaro1rGj_wASrO{9^_(1_D~V;g=`H~M+%yUDveA-Ha{k+ zYjdvnrGJ>%*QW4m&tVDE?&$NAJCQ^SH^RiSi7AX>My0BsKZPg@qnlmXO^lC7HHWHJ zSW2*Sb89EQakx-)XcG7BzeX-Ojd)3ZCG?*$g6K1f;rwW|EP|@L+kYBVZ(`8=)L$aK z5Frh%G#(m?P=s}wxu2rq$21hK*nOHpS-KUhSk&NvyfX^bDlUj#YNbfCY_gmiN5$N2 zG76wQfs){b`-3IQ+CNKN)TIB7nk+}720r(fL-MFi$@Nm(;mEt&VGOyS&*Be=C3#Kh zm8yq#G8-_r5qc87)XOmqO!~=@UPQ%dj|}|nei5sD%XA&wGiXzDKI5;V6hY6BP$bSd zW8g2E;NL?VFt*-gjVVL%_6d)V0`N7q$8}~EEaAHyL2NU9CFoymqD3MySW433Md~&) z5B1+L5e%K)4M3?X3rBQSjIRFr@xkcOU!_miU(hfbQpleK-M~936m2y|kL|lmZRk7b z#Q{vOEqPyK75LYP)gCW2oluYIOYhB;xUTf|F*xw^BYt0}Z|vgA;+$OQ>$iU3r%LWa M-sJ1UP}-W?f1UWTo&W#< diff --git a/icu4c/source/data/brkitr/wordBE.brk b/icu4c/source/data/brkitr/wordBE.brk deleted file mode 100644 index 149110d4213ead4ea5ed2e13fcae7a05e07a7a83..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13576 zcmds7TW?!M5FR^@8{4_KrR5Giv}x%rA)xZmB2=JH2#Qd71F8~;X;ac9asr6Aec*w= zfP_FG@yH|pB?2V)8GsOGE<1ZUXODACoRqSz&(6+%GvCbY?y=(@Fh4yFM*(V3hgYt@ z_7>Q7bz}3h{_3sU+u50`XR~K7UAlB3dv$%g*I!@WUfozU^wrY0c>UY*P(*z@`!edkUi!<< ztYOx?YBulJU8dLIgF>d^O5cD5w3!E=gUA%cJSgK0%d>)27M{!S2K%KwbQj}V!K#1C z^kYIy z8QE>%?%&49yyd%a3K6H_yXKv2DSOt&vW7U?Gi{j8&SYn;_EW2+-SaleluOIa(O#om z&tL_l{^wcNytEpA_AH!3tlvdf;4Gg(iSB(|!@l(=b&mZBitK3(&ZF%)wqIl)%65fg z2tCfbbs-CiHlE$YRST;`QC(r5aovlS$~mfEs_QwJK^CgbmSxkue~!6z=gG+LP8v%y zHN5lIOdaZG0;LJlfQFfbNz;U;IRyAJQ!r&(&@ydkn+|l$G)$Wrm@%_3Yvy3ibfIew z!(np-j)*sS03JZwqj1z5BM*gF4UU-y;X(5dJY*h*hY>5UN8k}BKMIdJS?BZpWAK=H z93D4Mz!Qvp5)R~L4us`ka=C{r$7$!~UdM;7c^>`W9%qo5r$G1YQ@CqR4;G2wS%p(4264oqgDsdwT*e6VQ(`N>fD#Lgc?-dbWr z;duUbyV;bHMnXYzBZc^paeTEP3fBHpezGGHNt{O1$lBe6JU^D3ID8B0FCPF(6w|vR z2{}9AqeA)9;UWN~;ytq@W6z(P=0$XjRr6;9HpTO5mz!*nUm8Y(jVr~{5PrNPS$?*V zfi)%%5)(mWp37o~*wOYO2-`XSLK%Dj_?<0k*3`RYsdym{+e9QUm^|rt>3JQ*HJ3z0 z4lme$q7APSI=j|9EKk%W`R(13?&>)2XoVC>NK0Hk{g7GY(fK7VvPOy5AA6RacV-m? zT`ThYgwe(iV#!a?D%&~4^Wx)>0^&OZPZ4|ETyaDlJKueTrKXj}M)pl>AR5#S%QTL| zj;L~!246)-K3!rgD+C9QpU=F&a9RS0E1k6{%i4l9iK274ktA$68`s5{iLua<)UVpejtQ_FBJD5&3ZrUKm!z=W)fcxGu}5Os@fz z`P2EbKIPZ#F&6wm@luRPj>-u4qz{?)kZ8rX4k8Dft=gKs$UdvF~-@pHeQ>tV?jGqjK&F^ha!b$s3 zUhUH;uT`A(^!-U*9+Q$hJ~dma)2bi^dvIg%r(O&7nAyKJJu@RYa84C2gsuSF0|W`E+!EF>I*-_2$u52r(>NLT+y8E{9=b2vZdp~I zua{7|)fQqH(FiP?Pewh${Fg@>eh#3i-ERFp80+H6eK# z5o_s0M4B3ZD~wdBL4XpGe+}X6c-Mq++%wt!aBoB7M=%QCN6Lefw0E0lR5@smGw5AB zBzv6YXUF6LGKp8O`E^x&vBw8vp^NW2;A60G1ZJTL6F52ybF8&rqKk%+hJH7g#JB!_ zKt80Y#twr{5rHZa-NHp$z^D#cjc1{U5L2zphsSojXeIj0re_89v%7;&b4yc=9R{5u z0u_pNn6?V<10ODqK@O2rJ0(A(h#s|-t&2w}ve@H8&4H5tNDRg##=3YUP}(1X}MpPHoek>fXYLQP=P)nC_?3pHY$ObHf@?jb^!4<4?OS} zkPrwY9(m-yM1TZ80}#TT%bdCF&N}PHNn4JyJ7>=2JC~VVJI==7o42;I)0fX=FI~8B;av9C`c|jAzPz=%vA(>Pz4>Wxb-mN;WmlFzzC~Pi zd850zkS(vTWE)$zI^Etv_Qu*;c9l?i+0{<3)BUWovY-)UJcPp>9{)7(IEcrK0AIcg z@ZouYEj+%$V-9-oDRl951J+;zUW01@zs*Dc_a%5W(%ynqw&}o@SZ&v~&u_o7owi$r z73f&J*SF`vl-O;-GTdam*MChTZDM}QY+-)IysVX%!XGI8Gps{06$`<0f?Yj6>+pzI#M9Df-;VLn8q?u2uRBKb>{70h!T>y~U+COQ)$zr*s|m;-fp zY!S}S!_QaX-Ff)&JNRoJZvS1x)&9*vSO+0dM8f%caUQ-|{2tH$E=nBAsB5)vp#0m# zf2?K=Ym-&G#i$n~D!UHH;5wYbW5Hsn*E+8MUH0)d=ipA$TW;D_^_EiQ`?fZ! z;0HuqLiFQ7giXw+hxy*XRr4`Upv6hFzujESUdp7dHMFCiw_plRXJ@SRb1S9Y^8xZy z%LE=&v#<3u&ZPe@0P$LvT6palIE#9}i!Q+##**wl!Zqw-7K9;vj`76G>j;hf7ovPF zv44o@b0rT#kG1(OQC622GwyrQQ!5-6m->1ZrZEcjre~w+-apH+b?3>79E1ro0WH&lNizvkW(uawG|ZS8m^HJ|Hf=a$ z4#8n_n9%U`h#HpvFT4SF^buJB(wqd`C@z_%m#@&#oU1Sct^7F z*+VAQnGhr=G7&wu#ZGac?L#K)=i(RI;1j^lY!O*k56fKfL@_K0ODI@8$$05`9R!+N zA}psD>_3r)XGxu*wGJC68j|tt-IDI=sCQu{CVzT`#T$npB)$h<@6 zsA56qit&BH=;J3b=M}WdR)_Yy_&BA6_|Cv8iXM||j)-IMooCo;T4^NmZ(0YDplnd3 zaUOO?oue}NEHc7$g)y%tIC1g$nin%vO9FAHbCssUC#AHrMd9o59JZHUNR+8iz^W)e z3ZQmMM(HJ$f3<5TigH8Bm;zw7|+LY~R>BH^VECB<_mH&vWL5X;u5YU2!hZW%ZQF zHK8&*)t8MaT(`%V@fV8cVvFRgEa0x>VWeFFJ!H;qd-AQ8yU;E#f%WC;p*4tyOCR9245&QS)`jcydQCd4)ANb{WL_5WnDDio)58sRYJ14gXJ-GSe>6enELG#kU zzNCs=8s`5Yt=NL3*=ReEzw?h4eK@paxlTA7*6ys^t^3RLx!&D3rkU zJ)$Y3dL3YT?`UdzT58~$DxAo=g4r%iFaWh%(i#-bTkI`o7eC5rlo9tDf2Ul>A4=M( zs_Ob`R7~r@QQDoz)D#DHiWC=T@&VU-+24Qy-kfD!6I zAi5wVN7VAOWAXr*@*nEA`nHaTenC`|3xuwB|Zv0ESsV-3~xK>S@*nEA`l^4$LGX_JjRrh#kR#fVm&{ih=@vM>*6icvN+D-+Ir&O5&e|8SQgL3l=er8 z4yS|_Wb|-$0}p&Gn^zbg$i@}GN3rn<@Fq%&4{ufgMUrFNvQQwlS^wxOjwvKdN3 zKD{$Tiz2Vp(ltW;Ao5YDA4GZu=`If~*dTXOR29oS-H)>UrM|Q-mhd}8us=8wsU{l# E1FODqCjbBd diff --git a/icu4c/source/data/brkitr/word_thBE.brk b/icu4c/source/data/brkitr/word_thBE.brk deleted file mode 100644 index 912509e85dc67e8a18ebb31107a774fcba245b7e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15072 zcmeHOTW?%f6<*`=nRe&k@bjbH$4^dr*YDiuy?68G&3Ae~-9PCc?Qfs#9_(-L^*;Rccz3^l zeB8Uc{pbl(y*md-hwHuV{hi*y$&>!k@p|ury}jOjwsPFN-#_jj{ieUOZauj1BB)iO znU9F(@q8E05B^HDIg1DU&D)SZq#fF&M|6Vzfc9ydj_7l6e@G9y4O;82cQ?>)c5kEq zQTHe4@6i^$56xrx6xbtR_n-;<--*`#F~~`uz%pa{@B!2*oiasP>E|20VEp^+*=b9{){AEPP<$g{6ip$B+v z(p_*j;aTavqwmnKh0fL|oU0$rV2jpDRzagUL>{|Xm2LRsSavz9?z<-M=T7<@vS%Xu zv61bF4)@TQ+nMVd{xjh}RZAOF_fU&J{`ueNzv5{8%4(!`O;#za4cp)xV0q(Dw6^h; zvCTS1hW`Qhzcc(t^c2zQe%r>5I|5F2#K!M|`h!8aePZhTk*n-o=y1jJ@u?11@&Tki zH>EkoRkBg7!b(vb@sqG+BHO`Eka!WrJ&A-*!EgRDN2J+i~qkUjBpa5>t)^xmRR z=rSH2*NrW!wU0geTy#9hcMBD0j>wVteCaV3Uz ziNnTxOEL+gT6OEoUODHq^bo5bR+}@sB{tLRm)EpD>)MUo+gh`HmbVad_NK5g#%&eV z>s{pkfPN{}%v$7Al)f>y=(j)uUs(GhyzYtYn({jHo_w1+s4VZQW2x{1(Do*Zo$W1X zy^k1%-vWvZ?-17HmNf8A-bf~BB58tZCX+OoOwm*_P1DH?;g!tNY%)i4NsC&^Jk2L< zY9}4)Bwgwz=jdFrKnuwtEhbB}q}J#>ormr+Ehj5%A@^#~N^*fNBp2yoa)~YhtFM>n zB}czZFFV@Gv*RoDO7bebn!HA@30899e9rXsI=!C!|229A-Y8h5H%iz1Y}a+%)_c@T z-Xyzc-^8A|j5o~iHF^!JS-6aU}gYh=!$r3Pc$;=qW(&j2-TNM?nkXZy|o&&q` z6e4{nP~i%oE7vGAW0|a~kMnOu#iIfHFsf6<2D4l^FebB-$lM(6f(;SdBFt|};ldS; zm}j;GWXH-0#|>J07QyRinz!z*%cagDa|afEhGVmB<1k|z4BckI{fz*8n3fu}%L3&1 z$#r7QAXm zS&$rT7QjFA)E`g9zp*b{OD%Ia-haE@YU#{I!a+-8hT742x;G#W*8H>nS&yA);ymNV z+U^qa{!woB;agCDeFLzL((YZ6)VUauqr&;~;bH)%;w=lMbMHSRmPK?PtL85XqKfls zmYa3qUmK!9lS&Cy;vYAX@h=)WG0JQ~V`7NPr7UKOfwvDsM9=xpmB9yq&upn?RJ~cD z@*VpyO-Z(b?q zKlH40eg7I4YlD*Y$0)S3E?yPEeMSF1Vd(ftn(_i(Wg`P#79Ud%NNxrZvG;Uy#ZmTh zg*NFh6OLM5X8o4T+tZd^nrS)?Gvmr}8qzC|19yoDZ6TOosd=Yyl6=Y-%esb_@(;n# z7K7Nm(-BHz(Limlxo|3I46p)6SVqyP7Fo+tb%{u<0o#@sRpuDv zk~0IG1yvE(R?*h4Zai7MwO2lRt#X``It9lWJ0UK{VRdYV(!pp63@_NpX;j4-P9P)Q ziO9}}nW>7;;_`F#6w4>ZxADd7S$S+Mvg`Jk5dNTeDvcVeLWJwxSB=-N0(QH$~wa7xG)XYMV`dvB$d6``mgO*J#U7+1}TG zi?gl@$1?w{P$nNurr{LZuzu}WjPhKIv!i?@;*UihQb>7xp*Y60X5m0AxVhT1R&(`} zlwmZ$H-g3+a7LQeCg^A8=UF;vbUFdI7Ag7zK7V>9M7cfeeBhVgsW!}|dP(=?I&v5J zcTOn}G^F;bDGMnK?t9b_Z*VJ?pb$3R3S@8o@mjBEtuS53tY>R9bc5Ar z+l9q3|J`Kb(la$<&IOq2xO)bTEuTBLvcex|FPvGE!z@WiP8B)^SAdNGfdekLL?vC! zmPec|Wfngb(>Ohzwf$kS#_C(nA7-_>eD;NC)$^%~Ox3xr@)+Nq5Xo-%kHuk-XGxo1 z+?&wkD;T-YBXR$l_At4i;-C>Fu&x#gM@Z$nWBLU$jo0>ASNZm9y>0?uY+b~ky>#$2 z@mwN&HP+A<=pv{In#R{&JD^wUnHF?Y`YG6+1jP@5*2c4GL+fBAc4Tfyr@%ueWwvKT zn=|IdUuaCDg~fBN%7VsuT8BqGMY`b0$lC$vI;? z)GVUYk#qs+;L(~D;Z@EOe2E&b1w0dmRu1=8YB|N0;@rFiYbh^bE4+i2sOlap%Pt$! z`78tTtUms(tfXbpCD1?=p2U?Q=MrB^4>UtMU^|k}yl+Kk{*o7uf#O?d5^FP!6F7;7 zPsJ3@#60$Dhq{=xv16K$FW{c4+w|6{epc;`#F5xjX3qehDBfS*nnhQOstP)9o3(7S zk=VH(L)-X^!6D6}W41X<+p1JByrI4Acm=I(eLwIyd%mT@NaASTC%!FsvXIK6bysQJ zya&1Q9`yIH?m-cl&Sn|%m@c!O#qvBI_afnU_ju%={X+uzb$(74pmgI^mv@q^fam`K Dkz}68 diff --git a/icu4c/source/data/brkitr/word_thLE.brk b/icu4c/source/data/brkitr/word_thLE.brk deleted file mode 100644 index 70eb161c58a604c3e5ad4adbe9cf6554d8c201a7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15072 zcmeHOTW?#(5gv+`C{ZLO>t^T9#PY3;6s2ejSV3VYE?N|gjly}0q8h5j+OlOypp*sy z`mjHuF9iw|EzrmQll}mqDbW6mHZSZqGrMQ^T$Vg1qLR4i(&y~#%s1D)kfIL_{r&n9 zahsqI9{l_xA#uBVc=UOH@9~q9?v1-QyYJq*b?fc!&kjy{{ezv8y~Bf@{q6^!9`7CW zj*q+db{;)ps(bgaf3)7+IoRzUo;>OGkJr2J@9%dXu$ANPgWhqk|C`?Ky7Qp@C3LH} zK5XEc#q~X+zx|MCdJ}wHcetmAv`c&Rh)!@nqyyTaK79`Ek7$c(oi$qTY~a4xxr6(U zJ3qz!erKEBgXS@P3Tz+PeP}YC{;@`be`cjmV3{#J_z=`-?ezPnKR7MM?IE6?wRh)q zjZ6f0f=C~Wy*pnNTRK8~JAyD@ATEpYT>2Nd{|@ueNEM+rA0qb0sLCPo?AfYpA=6E| z2ks_qTe_Ffcj?zchjzI>;aL4>2HUhYWEM1vBjmA%S=oV4j%81>ddJSg^F;a_vS%Xu ziIwe&4&S-*yEDBRwfIcTo5uAoaN^ z%`vW$i)s~CisFdxgl+ib^}dUhpz&_feTjs*&ymG0vieNc$gWg_*T*lwlkeDUUp|E6*dfE+5%yaL#{*)oob65_mX5JjaOlAy$m{ zaTSPbz(1YZ=)T+4y!L_PIc!pkZgg*2?q`Hlej(JVq#+dDCgulWL>mw4jsI^X8RFs#gJ=Qb4< zUhg3PE&3&KHM16Z7o|_kZTc;cj4!fPgx58Zol{+R$EbcAmE~1+EEV1Y?M|ZD-rk1R zdvZ36It3INo*}Hs%@Eb7W@4bz}$GfgvQhMJ~Hvu2iB zrbTVjrjF^*oSCC}Gfx-IMOrWmYz?nPvk2WKvqZ~mBloJAWm+*Sbje(z%jR+-f62U* z&@Y>p6FP4}Ij@*k=vDJ7y=Gn$ta?B1`|IZauQ61B-mt4Bm@o6j&^Zr~dEIXvr)#?O zrpw2?X8iO(*UYuLYqiW-+a3Q$SSQvId5CwR*eRY%&J;2KNgKOx71#97{c7KE`+OQF4d-#(U(QYg~m` z%Ynzr2%<^IbR@0Jw2J!#k`aPG7;j5FIRfS#nd!q=T3lr;tD<5RGK+xBOTbBa3Xwh( zsBi^v8?I4k#xYq{uf)F-755hG!zfP`8_aUyz?#fTB4fL`3l>Cdi7>w|g$rjmVxHL& zkn2k)95?9fIRww6ZQgovTrRXPGI!w6=h)Xo6)JyY zAGRsUR**dNWa?!e)U=dDNeM4lf65K-8oHQvd00QGuF$_dE%{W3JaMZ|;e@&D%cmcD zR=T=>jf=HG$^4@iI6R2n*-Itb zg#lyA8q95lUmxK;2z);Z)EV-~amjupj216^6;rCev_=Z8*QKFGhK;!?|8Q67k0(4=JQPzEB)vIY9qswopG488@$D>m#?DoGWa`QPA{EI}b`ycNiv{NuG=&01l)3A385v!QFO zKHVxTj{WZ@8<*~>hTRuns^iW%G`2i9x4g_BXjjgy$!3-$B&QNL2A6=H0RjhHZiz~6 zFk{aL@S?9Rb;BpRh7qpPx+4w#^5Guf{(bW zEz~*l@~P_lTZTXDpC#!}ieLYz+Wd{b_;$&MhJ_`J@IOP-Z+tqEKX5>%WszMpE_N7d zAw4cfCG6z=mrxoC`Kli<@{tsCLg&SZG?#Zoiqnu!g(+6iAV4W8--bwbqyH=pgFH{# zesOO@lSeRepGV^UHSJMyLB&C5l)$=LDC{AXXUFsbGL6^WF|YjeYqd5(i*ymcdTHaT z~X9>tNKvim5}6$MGxs(^IGXUUerDN8)yw+XK9#cztNxYZBKi4gmUPZZO+oQG8GI@Xm2}SL1$av4}8v^@2D`6IGWdqZ}Xlkq_SvT zRT?+1L2kST{W+{_Pz3hovJ81l53`-c@;DtgBEfg}xa6OE@?XBr&nW{GG@f&Q=irtS H*ztb>@Qa?u diff --git a/icu4c/source/i18n/rbbi_bld.cpp b/icu4c/source/i18n/rbbi_bld.cpp deleted file mode 100644 index 8538b40a9f9..00000000000 --- a/icu4c/source/i18n/rbbi_bld.cpp +++ /dev/null @@ -1,2093 +0,0 @@ -/* -********************************************************************** -* Copyright (C) 1999 International Business Machines Corporation * -* and others. All rights reserved. * -********************************************************************** -* Date Name Description -* 12/9/99 rgillam Ported from Java -********************************************************************** -*/ - -#include "unicode/rbbi.h" -#include "rbbi_bld.h" -#include "cmemory.h" -#include "unicode/uchar.h" - -//======================================================================= -// RuleBasedBreakIterator.Builder -//======================================================================= -/** - * The Builder class has the job of constructing a RuleBasedBreakIterator from a - * textual description. A Builder is constructed by RuleBasedBreakIterator's - * constructor, which uses it to construct the iterator itself and then throws it - * away. - *

The construction logic is separated out into its own class for two primary - * reasons: - *

  • The construction logic is quite complicated and large. Separating it - * out into its own class means the code must only be loaded into memory while a - * RuleBasedBreakIterator is being constructed, and can be purged after that. - *
  • There is a fair amount of state that must be maintained throughout the - * construction process that is not needed by the iterator after construction. - * Separating this state out into another class prevents all of the functions that - * construct the iterator from having to have really long parameter lists, - * (hopefully) contributing to readability and maintainability.
- *

It'd be really nice if this could be an independent class rather than an - * inner class, because that would shorten the source file considerably, but - * making Builder an inner class of RuleBasedBreakIterator allows it direct access - * to RuleBasedBreakIterator's private members, which saves us from having to - * provide some kind of "back door" to the Builder class that could then also be - * used by other classes. - */ - -const int32_t -RuleBasedBreakIteratorBuilder::END_STATE_FLAG = 0x8000; - -const int32_t -RuleBasedBreakIteratorBuilder::DONT_LOOP_FLAG = 0x4000; - -const int32_t -RuleBasedBreakIteratorBuilder::LOOKAHEAD_STATE_FLAG = 0x2000; - -const int32_t -RuleBasedBreakIteratorBuilder::ALL_FLAGS = END_STATE_FLAG - | DONT_LOOP_FLAG | LOOKAHEAD_STATE_FLAG; - -// constants for various characters -const UChar NULL_CHAR = 0x0000; -const UChar OPEN_PAREN = 0x28; -const UChar CLOSE_PAREN = 0x29; -const UChar OPEN_BRACKET = 0x5b; -const UChar CLOSE_BRACKET = 0x5d; -const UChar OPEN_BRACE = 0x7b; -const UChar CLOSE_BRACE = 0x7d; -const UChar SEMICOLON = 0x3b; -const UChar EQUAL_SIGN = 0x3d; -const UChar MINUS = 0x2d; -const UChar CARET = 0x5e; -const UChar AMPERSAND = 0x26; -const UChar COLON = 0x3a; -const UChar ASTERISK = 0x2a; -const UChar PLUS = 0x2b; -const UChar QUESTION = 0x3f; -const UChar PERIOD = 0x2e; -const UChar PIPE = 0x7c; -const UChar BANG = 0x21; -const UChar SLASH = 0x2f; -const UChar BACKSLASH = 0x5c; - -const UChar ASCII_LOW = 0x20; -const UChar ASCII_HI = 0x7f; - -const UnicodeString IGNORE_NAME = UnicodeString("$ignore"); - -//============================================================================ - -/** - * This class is a completely non-general quick-and-dirty class to make up - * for the fact that at the time of this writing (12/20/99) there was no - * general hash table class in the ICU. When one is created, this class should - * be removed and the code that depends on this class should be altered to use - * the regular hash-table class. This class is just here as a temporary measure - * until that happens. --rtg 12/20/99 - */ -class ExpressionList { -private: - UVector keys; - UVector sets; - UVector strings; - -public: - static const UnicodeSet setNotThere; // an empty UnicodeSet we can use as a return value - // in get() when the key isn't found - static const UnicodeString stringNotThere; - ExpressionList(); - ~ExpressionList(); - - const UnicodeSet& getSet(const UnicodeString& key) const; - void putSet(const UnicodeString& key, UnicodeSet* valueToAdopt); - - const UnicodeString& getString(const UnicodeString& key) const; - void putString(const UnicodeString& key, UnicodeString* valueToAdopt); - - const UnicodeString& getKeyAt(int32_t x) const { return *((UnicodeString*)keys[x]); } - const UnicodeSet& operator[](int32_t x) const { return *((UnicodeSet*)sets[x]); } - int32_t size() const { return keys.size(); } -}; - -const UnicodeSet -ExpressionList::setNotThere; - -const UnicodeString -ExpressionList::stringNotThere; - -ExpressionList::ExpressionList() -{ -} - -ExpressionList::~ExpressionList() -{ - for (int32_t i = 0; i < keys.size(); i++) { - delete (UnicodeString*)keys[i]; - delete (UnicodeSet*)sets[i]; - delete (UnicodeString*)strings[i]; - } -} - -const UnicodeSet& -ExpressionList::getSet(const UnicodeString& key) const -{ - for (int32_t i = 0; i < keys.size(); i++) { - if (key == *((UnicodeString*)keys[i])) { - return *((UnicodeSet*)sets[i]); - } - } - return setNotThere; -} - -void -ExpressionList::putSet(const UnicodeString& key, UnicodeSet* valueToAdopt) -{ - const UnicodeSet& theSet = getSet(key); - if (&theSet != &setNotThere) { - UnicodeSet* value = (UnicodeSet*)(&theSet); - value->clear(); - value->addAll(*valueToAdopt); - delete valueToAdopt; - } - else { - keys.addElement(new UnicodeString(key)); - sets.addElement(valueToAdopt); - strings.addElement(new UnicodeString); - } -} - -const UnicodeString& -ExpressionList::getString(const UnicodeString& key) const -{ - for (int32_t i = 0; i < keys.size(); i++) { - if (key == *((UnicodeString*)keys[i])) { - return *((UnicodeString*)strings[i]); - } - } - return stringNotThere; -} - -void -ExpressionList::putString(const UnicodeString& key, UnicodeString* valueToAdopt) -{ - const UnicodeString& theString = getString(key); - if (&theString != &stringNotThere) { - UnicodeString* value = (UnicodeString*)(&theString); - *value = *valueToAdopt; - delete valueToAdopt; - } - else { - keys.addElement(new UnicodeString(key)); - sets.addElement(new UnicodeSet); - strings.addElement(valueToAdopt); - } -} -//============================================================================ - -#define error(message, position, context) \ - setUpErrorMessage(message, position, context); \ - err = U_PARSE_ERROR; \ - return - -void -stringDeleter(void* o) { - delete (UnicodeString*)o; -} - -void -usetDeleter(void* o) { - delete (UnicodeSet*)o; -} - -void -tableRowDeleter(void* o) { - delete [] (int16_t*)o; -} - -void -vectorDeleter(void* o) { - delete (UVector*)o; -} - -void -mergeRowDeleter(void* o) { - delete [] (int32_t*)o; -} - -/** - * No special construction is required for the Builder. - */ -RuleBasedBreakIteratorBuilder::RuleBasedBreakIteratorBuilder( - RuleBasedBreakIterator& iteratorToBuild) -: iterator(iteratorToBuild), - tables(new RuleBasedBreakIteratorTables) -{ - iterator.tables = tables; - - tempRuleList.setDeleter(&stringDeleter); - categories.setDeleter(&usetDeleter); - tempStateTable.setDeleter(&tableRowDeleter); - decisionPointStack.setDeleter(&vectorDeleter); - // decisionPointList, loopingStates, and statesToBackfill (as well as the - // individual elements in decisionPointStack) don't need deleters-- - // their element type is int32_t - mergeList.setDeleter(&mergeRowDeleter); -} - -RuleBasedBreakIteratorBuilder::~RuleBasedBreakIteratorBuilder() -{ - delete expressions; -} - -/** - * This is the main function for setting up the BreakIterator's tables. It - * just vectors different parts of the job off to other functions. - */ -void -RuleBasedBreakIteratorBuilder::buildBreakIterator(const UnicodeString& description, - UErrorCode& err) -{ - if (U_FAILURE(err)) - return; - - UnicodeString tempDesc(description); - - buildRuleList(tempDesc, err); - buildCharCategories(err); - buildStateTable(err); - buildBackwardsStateTable(err); -} - -/** - * Thus function has three main purposes: - *

  • Perform general syntax checking on the description, so the rest of the - * build code can assume that it's parsing a legal description. - *
  • Split the description into separate rules - *
  • Perform variable-name substitutions (so that no one else sees variable names) - *
- */ -void -RuleBasedBreakIteratorBuilder::buildRuleList(UnicodeString& description, - UErrorCode& err) -{ - if (U_FAILURE(err)) - return; - - // invariants: - // - parentheses must be balanced: ()[]{} - // - nothing can be nested inside {} - // - nothing can be nested inside [] except more []s - // - pairs of ()[]{} must not be empty - // - ; can only occur at the outer level - // - | can only appear inside () - // - only one = or / can occur in a single rule - // - = and / cannot both occur in the same rule - // - the right-hand side of a = expression must be enclosed in [] or () - // - *. ?, and + may not occur at the beginning of a rule, nor may they follow - // =, /, (, (, |, }, ;, +, ?, or * (except that ? can follow *) - // - the rule list must contain at least one / rule (which may or may not - // actually contain a / - // - no rule may be empty - // - all printing characters in the ASCII range except letters and digits - // are reserved and must be preceded by \ - // - ! may only occur at the beginning of a rule - - // set up a vector to contain the broken-up description (each entry in the - // vector is a separate rule) and a stack for keeping track of opening - // punctuation - UStack parenStack; - - int32_t p = 0; - int32_t ruleStart = 0; - UChar c = 0x0000; - UChar lastC = 0x0000; - UChar lastOpen = 0x0000; - UBool haveEquals = FALSE; - UBool haveSlash = FALSE; - UBool sawVarName = FALSE; - UBool sawIllegalChar = FALSE; - int32_t illegalCharPos = 0; - UChar expectedClose = 0x0000; - - // if the description doesn't end with a semicolon, tack a semicolon onto the end - if (description.length() != 0 && description[description.length() - 1] != SEMICOLON) { - description += SEMICOLON; - } - - // for each character, do... - while (p < description.length()) { - c = description[p]; - switch (c) { - // if the character is opening punctuation, verify that no nesting - // rules are broken, and push the character onto the stack - case OPEN_BRACE: - case OPEN_BRACKET: - case OPEN_PAREN: - if (lastOpen == OPEN_BRACE) { - error("Can't nest brackets inside {}", p, description); - } - if (lastOpen == OPEN_BRACKET && c != OPEN_BRACKET) { - error("Can't nest anything in [] but []", p, description); - } - - // if we see { anywhere except on the left-hand side of =, - // we must be seeing a variable name that was never defined - if (c == OPEN_BRACE && (haveEquals || haveSlash)) { - error("Unknown variable name", p, description); - } - - lastOpen = c; - parenStack.push((void*)c); - if (c == OPEN_BRACE) { - sawVarName = TRUE; - } - break; - - // if the character is closing punctuation, verify that it matches the - // last opening punctuation we saw, and that the brackets contain - // something, then pop the stack - case CLOSE_BRACE: - case CLOSE_BRACKET: - case CLOSE_PAREN: - expectedClose = NULL_CHAR; - switch (lastOpen) { - case OPEN_BRACE: - expectedClose = CLOSE_BRACE; - break; - case OPEN_BRACKET: - expectedClose = CLOSE_BRACKET; - break; - case OPEN_PAREN: - expectedClose = CLOSE_PAREN; - break; - } - if (c != expectedClose) { - error("Unbalanced parentheses", p, description); - } - if (lastC == lastOpen) { - error("Parens don't contain anything", p, description); - } - parenStack.pop(); - if (!parenStack.empty()) { - lastOpen = (UChar)(int32_t)parenStack.peek(); - } - else { - lastOpen = NULL_CHAR; - } - break; - - // if the character is an asterisk, make sure it occurs in a place - // where an asterisk can legally go - case ASTERISK: - case PLUS: - case QUESTION: - switch (lastC) { - case EQUAL_SIGN: case SLASH: case OPEN_PAREN: case PIPE: - case ASTERISK: case PLUS: case QUESTION: case SEMICOLON: - case NULL_CHAR: - error("Misplaced *, +, or ?", p, description); - - default: - break; - } - break; - - // if the character is an equals sign, make sure we haven't seen another - // equals sign or a slash yet - case EQUAL_SIGN: - if (haveEquals || haveSlash) { - error("More than one = or / in rule", p, description); - } - haveEquals = TRUE; - sawIllegalChar = FALSE; - break; - - // if the character is a slash, make sure we haven't seen another slash - // or an equals sign yet - case SLASH: - if (haveEquals || haveSlash) { - error("More than one = or / in rule", p, description); - } - if (sawVarName) { - error("Unknown variable name", p, description); - } - haveSlash = TRUE; - break; - - // if the character is an exclamation point, make sure it occurs only - // at the beginning of a rule - case BANG: - if (lastC != SEMICOLON && lastC != NULL_CHAR) { - error("! can only occur at the beginning of a rule", p, description); - } - break; - - // if the character is a backslash, skip the character that follows it - // (it'll get treated as a literal character) - case BACKSLASH: - ++p; - break; - - // we don't have to do anything special on a period - case PERIOD: - break; - - // if the character is a syntax character that can only occur - // inside [], make sure that it does in fact only occur inside [] - // (or in a variable name) - case CARET: - case MINUS: - case COLON: - case AMPERSAND: - if (lastOpen != OPEN_BRACKET && lastOpen != OPEN_BRACE && !sawIllegalChar) { - sawIllegalChar = TRUE; - illegalCharPos = p; - } - break; - - // if the character is a semicolon, do the following... - case SEMICOLON: - // if we saw any illegal characters along the way, throw - // an error - if (sawIllegalChar) { - error("Illegal character", illegalCharPos, description); - } - - // make sure the rule contains something and that there are no - // unbalanced parentheses or brackets - if (lastC == SEMICOLON || lastC == NULL_CHAR) { - error("Empty rule", p, description); - } - if (!parenStack.empty()) { - error("Unbalanced parenheses", p, description); - } - - if (parenStack.empty()) { - // if the rule contained an = sign, call processSubstitution() - // to replace the substitution name with the substitution text - // wherever it appears in the description - if (haveEquals) { - processSubstitution(description, ruleStart, p + 1, p + 1, err); - } - else { - // otherwise, check to make sure the rule doesn't reference - // any undefined substitutions - if (sawVarName) { - error("Unknown variable name", p, description); - } - - // then add it to tempRuleList - UnicodeString* newRule = new UnicodeString(); - description.extractBetween(ruleStart, p, *newRule); - tempRuleList.addElement(newRule); - } - - // and reset everything to process the next rule - ruleStart = p + 1; - haveEquals = haveSlash = sawVarName = sawIllegalChar = FALSE; - } - break; - - // if the character is a vertical bar, check to make sure that it - // occurs inside a () expression and that the character that precedes - // it isn't also a vertical bar - case PIPE: - if (lastC == PIPE) { - error("Empty alternative", p, description); - } - if (parenStack.empty() || lastOpen != OPEN_PAREN) { - error("Misplaced |", p, description); - } - break; - - // if the character is anything else (escaped characters are - // skipped and don't make it here), it's an error - default: - if (c >= ASCII_LOW && c < ASCII_HI && !u_isalpha(c) - && !u_isdigit(c) && !sawIllegalChar) { - sawIllegalChar = TRUE; - illegalCharPos = p; - } - break; - } - lastC = c; - ++p; - } - if (tempRuleList.size() == 0) { - error("No valid rules in description", p, description); - } -} - -/** - * This function performs variable-name substitutions. First it does syntax - * checking on the variable-name definition. If it's syntactically valid, it - * then goes through the remainder of the description and does a simple - * find-and-replace of the variable name with its text. (The variable text - * must be enclosed in either [] or () for this to work.) - */ -void -RuleBasedBreakIteratorBuilder::processSubstitution(UnicodeString& description, - int32_t ruleStart, - int32_t ruleEnd, - int32_t startPos, - UErrorCode& err) -{ - if (U_FAILURE(err)) - return; - - // isolate out the text on either side of the equals sign - UnicodeString substitutionRule; - UnicodeString replace; - UnicodeString replaceWith; - - description.extractBetween(ruleStart, ruleEnd, substitutionRule); - int32_t equalPos = substitutionRule.indexOf(EQUAL_SIGN); - substitutionRule.extractBetween(0, equalPos, replace); - substitutionRule.extractBetween(equalPos + 1, substitutionRule.length() - 1, replaceWith); - - // check to see whether the substitution name is something we've declared - // to be "special". For RuleBasedBreakIterator itself, this is "$ignore". - // This function takes care of any extra processing that has to be done - // with "special" substitution names. - handleSpecialSubstitution(replace, replaceWith, startPos, description, err); - - // perform various other syntax checks on the rule - if (replaceWith.length() == 0) { - error("Nothing on right-hand side of =", startPos, description); - } - if (replace.length() == 0) { - error("Nothing on left-hand side of =", startPos, description); - } - if (!(replaceWith[0] == OPEN_BRACKET - && replaceWith[replaceWith.length() - 1] == CLOSE_BRACKET) - && !(replaceWith[0] == OPEN_PAREN - && replaceWith[replaceWith.length() - 1] == CLOSE_PAREN)) { - error("Illegal right-hand side for =", startPos, description); - } - - // now go through the rest of the description (which hasn't been broken up - // into separate rules yet) and replace every occurrence of the - // substitution name with the substitution body - if (replace[0] != OPEN_BRACE) { - replace.insert(0, OPEN_BRACE); - replace += CLOSE_BRACE; - } - - description.removeBetween(ruleStart, ruleEnd); - - int32_t lastPos = startPos; - int32_t pos = description.indexOf(replace, lastPos); - while (pos != -1) { - description.replaceBetween(pos, pos + replace.length(), replaceWith); - lastPos = pos + replace.length(); - pos = description.indexOf(replace, lastPos); - } -} - -/** - * This function defines a protocol for handling substitution names that - * are "special," i.e., that have some property beyond just being - * substitutions. At the RuleBasedBreakIterator level, we have one - * special substitution name, "$ignore". Subclasses can override this - * function to add more. Any special processing that has to go on beyond - * that which is done by the normal substitution-processing code is done - * here. - */ -void -RuleBasedBreakIteratorBuilder::handleSpecialSubstitution(const UnicodeString& replace, - const UnicodeString& replaceWith, - int32_t startPos, - const UnicodeString& description, - UErrorCode& err) -{ - if (U_FAILURE(err)) - return; - - // if we get a definition for a substitution called "$ignore", it defines - // the ignore characters for the iterator. Check to make sure the expression - // is a [] expression, and if it is, parse it and store the characters off - // to the side. - if (replace == IGNORE_NAME) { - if (replaceWith.charAt(0) == OPEN_PAREN) { - error("Ignore group can't be enclosed in (", startPos, description); - } - ignoreChars = UnicodeSet(replaceWith, err); - } -} - -/** - * This function provides a hook for subclasses to mess with the character - * category table. - */ -void -RuleBasedBreakIteratorBuilder::mungeExpressionList() -{ - // base class doesn't do anything-- this is here - // for subclasses -} - -/** - * This function builds the character category table. On entry, - * tempRuleList is a vector of break rules that has had variable names substituted. - * On exit, the charCategoryTable data member has been initialized to hold the - * character category table, and tempRuleList's rules have been munged to contain - * character category numbers everywhere a literal character or a [] expression - * originally occurred. - */ -void -RuleBasedBreakIteratorBuilder::buildCharCategories(UErrorCode& err) -{ - if (U_FAILURE(err)) - return; - - int32_t bracketLevel = 0; - int32_t p = 0; - int32_t lineNum = 0; - - // build hash table of every literal character or [] expression in the rule list - // and derive a UnicodeSet object representing the characters each refers to - while (lineNum < tempRuleList.size()) { - UnicodeString* line = (UnicodeString*)(tempRuleList[lineNum]); - p = 0; - while (p < line->length()) { - UChar c = (*line)[p]; - switch (c) { - // skip over all syntax characters except [ - case OPEN_PAREN: case CLOSE_PAREN: case ASTERISK: case PERIOD: case SLASH: - case PIPE: case SEMICOLON: case QUESTION: case BANG: case PLUS: - break; - - // for [, find the matching ] (taking nested [] pairs into account) - // and add the whole expression to the expression list - case OPEN_BRACKET: - { - int32_t q = p + 1; - ++bracketLevel; - while (q < line->length() && bracketLevel != 0) { - c = (*line)[q]; - if (c == OPEN_BRACKET) { - ++bracketLevel; - } - else if (c == CLOSE_BRACKET) { - --bracketLevel; - } - ++q; - } - - UnicodeString temp; - line->extractBetween(p, q, temp); - if (&expressions->getSet(temp) == &ExpressionList::setNotThere) { - expressions->putSet(temp, new UnicodeSet(temp, err)); - } - p = q - 1; - } - break; - - // for \ sequences, just move to the next character and treat - // it as a single character - case BACKSLASH: - ++p; - c = (*line)[p]; - // DON'T break; fall through into "default" clause - - // for an isolated single character, add it to the expression list - default: - { - UnicodeString temp; - - line->extractBetween(p, p + 1, temp); - expressions->putSet(temp, new UnicodeSet(temp, err)); - } - break; - } - ++p; - } - ++lineNum; - } - - // create the temporary category table (which is a vector of UnicodeSet objects) - if (ignoreChars.isEmpty()) { - categories.addElement(new UnicodeSet(ignoreChars)); - } - else { - categories.addElement(new UnicodeSet()); - } - ignoreChars.clear(); - - // this is a hook to allow subclasses to add categories on their own - mungeExpressionList(); - - // Derive the character categories. Go through the existing character categories - // looking for overlap. Any time there's overlap, we create a new character - // category for the characters that overlapped and remove them from their original - // category. At the end, any characters that are left in the expression haven't - // been mentioned in any category, so another new category is created for them. - // For example, if the first expression is [abc], then a, b, and c will be placed - // into a single character category. If the next expression is [bcd], we will first - // remove b and c from their existing category (leaving a behind), create a new - // category for b and c, and then create another new category for d (which hadn't - // been mentioned in the previous expression). - // At no time should a character ever occur in more than one character category. - - // for each expression in the expressions list, do... - for (int32_t i = 0; i < expressions->size(); i++) { - // initialize the working char set to the chars in the current expression - UnicodeSet e = UnicodeSet((*expressions)[i]); - - // for each category in the category list, do... - for (int32_t j = categories.size() - 1; !e.isEmpty() && j > 0; j--) { - - // if there's overlap between the current working set of chars - // and the current category... - UnicodeSet* that = (UnicodeSet*)(categories[j]); - UnicodeSet temp = UnicodeSet(e); - temp.retainAll(*that); - if (!temp.isEmpty()) { - // if the current category is not a subset of the current - // working set of characters, then remove the overlapping - // characters from the current category and create a new - // category for them - if (temp != *that) { - that->removeAll(temp); - categories.addElement(new UnicodeSet(temp)); - } - - // and always remove the overlapping characters from the current - // working set of characters - e.removeAll(temp); - } - } - - // if there are still characters left in the working char set, - // add a new category containing them - if (!e.isEmpty()) { - categories.addElement(new UnicodeSet(e)); - } - } - - // we have the ignore characters stored in position 0. Make an extra pass through - // the character category list and remove anything from the ignore list that shows - // up in some other category - UnicodeSet allChars; - for (int32_t i = 1; i < categories.size(); i++) - allChars.addAll(*(UnicodeSet*)(categories[i])); - UnicodeSet* ignoreChars = (UnicodeSet*)(categories[0]); - ignoreChars->removeAll(allChars); - - // now that we've derived the character categories, go back through the expression - // list and replace each UnicodeSet object with a String that represents the - // character categories that expression refers to. The String is encoded: each - // character is a character category number (plus 0x100 to avoid confusing them - // with syntax characters in the rule grammar) - for (int32_t i = 0; i < expressions->size(); i++) { - const UnicodeSet& cs = (*expressions)[i]; - UnicodeString* cats = new UnicodeString; - - // for each category... - for (int32_t j = 1; j < categories.size(); j++) { - - // if the current expression contains characters in that category... - if (cs.containsAll(*(UnicodeSet*)(categories[j]))) { - - // then add the encoded category number to the String for this - // expression - *cats += (UChar)(0x100 + j); - if (cs == *(UnicodeSet*)(categories[j])) { - break; - } - } - } - - // once we've finished building the encoded String for this expression, - // replace the UnicodeSet object with it - expressions->putString(expressions->getKeyAt(i), cats); - } - - // and finally, we turn the temporary category table into a permanent category - // table, which is a CompactByteArray. (we skip category 0, which by definition - // refers to all characters not mentioned specifically in the rules) - tables->charCategoryTable = ucmp8_open((int8_t)0); - - // for each category... - for (int32_t i = 0; i < categories.size(); i++) { - UnicodeSet& chars = *(UnicodeSet*)(categories[i]); - const UnicodeString& pairs = chars.getPairs(); - - // go through the character ranges in the category one by one... - for (int32_t j = 0; j < pairs.length(); j += 2) { - // and set the corresponding elements in the CompactArray accordingly - if (i != 0) { - ucmp8_setRange(tables->charCategoryTable, pairs[j], pairs[j + 1], - (int8_t)i); - } - - // (category 0 is special-- it's the hiding place for the ignore - // characters, whose real category number in the CompactArray is - // -1 [this is because category 0 contains all characters not - // specifically mentioned anywhere in the rules] ) - else { - ucmp8_setRange(tables->charCategoryTable, pairs[j], pairs[j + 1], - RuleBasedBreakIterator::IGNORE); - } - } - } - - // once we've populated the CompactArray, compact it - ucmp8_compact(tables->charCategoryTable, 32); - - // initialize numCategories - numCategories = categories.size(); - tables->numCategories = numCategories; -} - -/** - * This is the function that builds the forward state table. Most of the real - * work is done in parseRule(), which is called once for each rule in the - * description. - */ -void -RuleBasedBreakIteratorBuilder::buildStateTable(UErrorCode& err) -{ - if (U_FAILURE(err)) - return; - - // initialize our temporary state table, and fill it with two states: - // state 0 is a dummy state that allows state 1 to be the starting state - // and 0 to represent "stop". State 1 is added here to seed things - // before we start parsing - tempStateTable.addElement(new int16_t[tables->numCategories + 1]); - tempStateTable.addElement(new int16_t[tables->numCategories + 1]); - - // call parseRule() for every rule in the rule list (except those which - // start with !, which are actually backwards-iteration rules) - for (int32_t i = 0; i < tempRuleList.size(); i++) { - UnicodeString* rule = (UnicodeString*)tempRuleList[i]; - if ((*rule)[0] != BANG) { - parseRule(*rule, TRUE); - } - } - - // finally, use finishBuildingStateTable() to minimize the number of - // states in the table and perform some other cleanup work - finishBuildingStateTable(TRUE); -} - -/** - * This is where most of the work really happens. This routine parses a single - * rule in the rule description, adding and modifying states in the state - * table according to the new expression. The state table is kept deterministic - * throughout the whole operation, although some ugly postprocessing is needed - * to handle the *? token. - */ -void -RuleBasedBreakIteratorBuilder::parseRule(const UnicodeString& rule, - UBool forward) -{ - // algorithm notes: - // - The basic idea here is to read successive character-category groups - // from the input string. For each group, you create a state and point - // the appropriate entries in the previous state to it. This produces a - // straight line from the start state to the end state. The {}, *, and (|) - // idioms produce branches in this straight line. These branches (states - // that can transition to more than one other state) are called "decision - // points." A list of decision points is kept. This contains a list of - // all states that can transition to the next state to be created. For a - // straight line progression, the only thing in the decision-point list is - // the current state. But if there's a branch, the decision-point list - // will contain all of the beginning points of the branch when the next - // state to be created represents the end point of the branch. A stack is - // used to save decision point lists in the presence of nested parentheses - // and the like. For example, when a { is encountered, the current decision - // point list is saved on the stack and restored when the corresponding } - // is encountered. This way, after the } is read, the decision point list - // will contain both the state right before the } _and_ the state before - // the whole {} expression. Both of these states can transition to the next - // state after the {} expression. - // - one complication arises when we have to stamp a transition value into - // an array cell that already contains one. The updateStateTable() and - // mergeStates() functions handle this case. Their basic approach is to - // create a new state that combines the two states that conflict and point - // at it when necessary. This happens recursively, so if the merged states - // also conflict, they're resolved in the same way, and so on. There are - // a number of tests aimed at preventing infinite recursion. - // - another complication arises with repeating characters. It's somewhat - // ambiguous whether the user wants a greedy or non-greedy match in these cases. - // (e.g., whether "[a-z]*abc" means the SHORTEST sequence of letters ending in - // "abc" or the LONGEST sequence of letters ending in "abc". We've adopted - // the *? to mean "shortest" and * by itself to mean "longest". (You get the - // same result with both if there's no overlap between the repeating character - // group and the group immediately following it.) Handling the *? token is - // rather complicated and involves keeping track of whether a state needs to - // be merged (as described above) or merely overwritten when you update one of - // its cells, and copying the contents of a state that loops with a *? token - // into some of the states that follow it after the rest of the table-building - // process is complete ("backfilling"). - // implementation notes: - // - This function assumes syntax checking has been performed on the input string - // prior to its being passed in here. It assumes that parentheses are - // balanced, all literal characters are enclosed in [] and turned into category - // numbers, that there are no illegal characters or character sequences, and so - // on. Violation of these invariants will lead to undefined behavior. - // - It'd probably be better to use linked lists rather than UVector and UStack - // to maintain the decision point list and stack. I went for simplicity in - // this initial implementation. If performance is critical enough, we can go - // back and fix this later. - // -There are a number of important limitations on the *? token. It does not work - // right when followed by a repeating character sequence (e.g., ".*?(abc)*") - // (although it does work right when followed by a single repeating character). - // It will not always work right when nested in parentheses or braces (although - // sometimes it will). It also will not work right if the group of repeating - // characters and the group of characters that follows overlap partially - // (e.g., "[a-g]*?[e-j]"). None of these capabilites was deemed necessary for - // describing breaking rules we know about, so we left them out for - // expeditiousness. - // - Rules such as "[a-z]*?abc;" will be treated the same as "[a-z]*?aa*bc;"-- - // that is, if the string ends in "aaaabc", the break will go before the first - // "a" rather than the last one. Both of these are limitations in the design - // of RuleBasedBreakIterator and not limitations of the rule parser. - - int32_t p = 0; - int32_t currentState = 1; // don't use state number 0; 0 means "stop" - int32_t lastState = currentState; - UnicodeString pendingChars; - UnicodeString temp; - - int16_t* state; - UBool sawEarlyBreak = FALSE; - - // if we're adding rules to the backward state table, mark the initial state - // as a looping state - if (!forward) { - loopingStates.addElement((void*)1); - } - - // put the current state on the decision point list before we start - decisionPointList.addElement((void*)currentState); // we want currentState to - // be 1 here... - currentState = tempStateTable.size() - 1; // but after that, we want it to be - // 1 less than the state number of the next state - while (p < rule.length()) { - UChar c = rule[p]; - clearLoopingStates = FALSE; - - // this section handles literal characters, escaped characters (which are - // effectively literal characters too), the . token, and [] expressions - if (c == OPEN_BRACKET - || c == BACKSLASH - || u_isalpha(c) - || u_isdigit(c) - || c < ASCII_LOW - || c == PERIOD - || c >= ASCII_HI) { - - // if we're not on a period, isolate the expression and look up - // the corresponding category list - if (c != PERIOD) { - int32_t q = p; - - // if we're on a backslash, the expression is the character - // after the backslash - if (c == BACKSLASH) { - q = p + 2; - ++p; - } - - // if we're on an opening bracket, scan to the closing bracket - // to isolate the expression - else if (c == OPEN_BRACKET) { - int32_t bracketLevel = 1; - while (bracketLevel > 0) { - ++q; - c = rule[q]; - if (c == OPEN_BRACKET) { - ++bracketLevel; - } - else if (c == CLOSE_BRACKET) { - --bracketLevel; - } - else if (c == BACKSLASH) { - ++q; - } - } - ++q; - } - - // otherwise, the expression is just the character itself - else { - q = p + 1; - } - - // look up the category list for the expression and store it - // in pendingChars - rule.extractBetween(p, q, temp); - pendingChars = expressions->getString(temp); - - // advance the current position past the expression - p = q - 1; - } - - // if the character we're on is a period, we end up down here - else { - int32_t rowNum = (int32_t)decisionPointList.lastElement(); - state = (int16_t*)tempStateTable[rowNum]; - - // if the period is followed by an asterisk, then just set the current - // state to loop back on itself - if (p + 1 < rule.length() && rule[p + 1] == ASTERISK && state[0] != 0) { - decisionPointList.addElement((void*)state[0]); - pendingChars.remove(); - ++p; - if (p + 1 < rule.length() && rule[p + 1] == QUESTION) { -//System.out.println("Saw *?"); - setLoopingStates(&decisionPointList, decisionPointList); - ++p; - } -//System.out.println("Saw .*"); - } - - // otherwise, fabricate a category list ("pendingChars") with - // every category in it - else { - pendingChars.remove(); - for (int32_t i = 0; i < numCategories; i++) - pendingChars += (UChar)(i + 0x100); - } - } - - // we'll end up in here for all expressions except for .*, which is - // special-cased above - if (pendingChars.length() != 0) { - - // if the expression is followed by an asterisk, then push a copy - // of the current decision point list onto the stack - if (p + 1 < rule.length() && ( - rule[p + 1] == ASTERISK || - rule[p + 1] == QUESTION - )) { - UVector* clone = new UVector; - for (int32_t i = 0; i < decisionPointList.size(); i++) { - clone->addElement(decisionPointList[i]); - // (there's no ownership issue here because the vector - // elements are all integers) - } - decisionPointStack.push(clone); - } - - // create a new state, add it to the list of states to backfill - // if we have looping states to worry about, set its "don't make - // me an accepting state" flag if we've seen a slash, and add - // it to the end of the state table - int32_t newState = tempStateTable.size(); - if (loopingStates.size() != 0) { - statesToBackfill.addElement((void*)newState); - } - state = new int16_t[numCategories + 1]; - if (sawEarlyBreak) { - state[numCategories] = DONT_LOOP_FLAG; - } - tempStateTable.addElement(state); - - // update everybody in the decision point list to point to - // the new state (this also performs all the reconciliation - // needed to make the table deterministic), then clear the - // decision point list - updateStateTable(decisionPointList, pendingChars, (int16_t)newState); - decisionPointList.removeAllElements(); - - // add all states created since the last literal character we've - // seen to the decision point list - lastState = currentState; - do { - ++currentState; - decisionPointList.addElement((void*)currentState); - } while (currentState + 1 < tempStateTable.size()); - } - } - - // a * denotes a repeating character or group (* after () is handled separately - // below). In addition to restoring the decision point list, modify the - // current state to point to itself on the appropriate character categories. - if (c == PLUS || c == ASTERISK || c == QUESTION) { - // when there's a *, update the current state to loop back on itself - // on the character categories that caused us to enter this state - if (c == ASTERISK || c == PLUS) { - for (int32_t i = lastState + 1; i < tempStateTable.size(); i++) { - UVector temp2; - temp2.addElement((void*)i); - updateStateTable(temp2, pendingChars, (int16_t)(lastState + 1)); - } - } - - // pop the top element off the decision point stack and merge - // it with the current decision point list (this causes the divergent - // paths through the state table to come together again on the next - // new state) - if (c == ASTERISK || c == QUESTION) { - UVector* temp2 = (UVector*)decisionPointStack.pop(); - for (int32_t i = 0; i < temp2->size(); i++) - decisionPointList.addElement((*temp2)[i]); - delete temp2; - - // a ? after a * modifies the behavior of * in cases where there is overlap - // between the set of characters that repeat and the characters which follow. - // Without the ?, all states following the repeating state, up to a state which - // is reached by a character that doesn't overlap, will loop back into the - // repeating state. With the ?, the mark states following the *? DON'T loop - // back into the repeating state. Thus, "[a-z]*xyz" will match the longest - // sequence of letters that ends in "xyz," while "[a-z]*? will match the - // _shortest_ sequence of letters that ends in "xyz". - // We use extra bookkeeping to achieve this effect, since everything else works - // according to the "longest possible match" principle. The basic principle - // is that transitions out of a looping state are written in over the looping - // value instead of being reconciled, and that we copy the contents of the - // looping state into empty cells of all non-terminal states that follow the - // looping state. -//System.out.println("c = " + c + ", p = " + p + ", rule.length() = " + rule.length()); - if (c == ASTERISK && p + 1 < rule.length() && rule[p + 1] == QUESTION) { -//System.out.println("Saw *?"); - setLoopingStates(&decisionPointList, decisionPointList); - ++p; - } - } - } - - // a ( marks the beginning of a sequence of characters. Parentheses can either - // contain several alternative character sequences (i.e., "(ab|cd|ef)"), or - // they can contain a sequence of characters that can repeat (i.e., "(abc)*"). Thus, - // A () group can have multiple entry and exit points. To keep track of this, - // we reserve TWO spots on the decision-point stack. The top of the stack is - // the list of exit points, which becomes the current decision point list when - // the ) is reached. The next entry down is the decision point list at the - // beginning of the (), which becomes the current decision point list at every - // entry point. - // In addition to keeping track of the exit points and the active decision - // points before the ( (i.e., the places from which the () can be entered), - // we need to keep track of the entry points in case the expression loops - // (i.e., is followed by *). We do that by creating a dummy state in the - // state table and adding it to the decision point list (BEFORE it's duplicated - // on the stack). Nobody points to this state, so it'll get optimized out - // at the end. It exists only to hold the entry points in case the () - // expression loops. - if (c == OPEN_PAREN) { - - // add a new state to the state table to hold the entry points into - // the () expression - tempStateTable.addElement(new int16_t[numCategories + 1]); - - // we have to adjust lastState and currentState to account for the - // new dummy state - lastState = currentState; - ++currentState; - - // add the current state to the decision point list (add it at the - // BEGINNING so we can find it later) - decisionPointList.insertElementAt((void*)currentState, 0); - - // finally, push a copy of the current decision point list onto the - // stack (this keeps track of the active decision point list before - // the () expression), followed by an empty decision point list - // (this will hold the exit points) - UVector* clone = new UVector; - for (int32_t i = 0; i < decisionPointList.size(); i++) { - clone->addElement(decisionPointList[i]); - } - decisionPointStack.push(clone); - decisionPointStack.push(new UVector()); - } - - // a | separates alternative character sequences in a () expression. When - // a | is encountered, we add the current decision point list to the exit-point - // list, and restore the decision point list to its state prior to the (. - if (c == PIPE) { - - // pick out the top two decision point lists on the stack - UVector* oneDown = (UVector*)decisionPointStack.pop(); - UVector* twoDown = (UVector*)decisionPointStack.peek(); - decisionPointStack.push(oneDown); - - // append the current decision point list to the list below it - // on the stack (the list of exit points), and restore the - // current decision point list to its state before the () expression - for (int32_t i = 0; i < decisionPointList.size(); i++) - oneDown->addElement(decisionPointList[i]); - decisionPointList.removeAllElements(); - for (int32_t i = 0; i < twoDown->size(); i++) - decisionPointList.addElement((*twoDown)[i]); - } - - // a ) marks the end of a sequence of characters. We do one of two things - // depending on whether the sequence repeats (i.e., whether the ) is followed - // by *): If the sequence doesn't repeat, then the exit-point list is merged - // with the current decision point list and the decision point list from before - // the () is thrown away. If the sequence does repeat, then we fish out the - // state we were in before the ( and copy all of its forward transitions - // (i.e., every transition added by the () expression) into every state in the - // exit-point list and the current decision point list. The current decision - // point list is then merged with both the exit-point list AND the saved version - // of the decision point list from before the (). Then we throw out the *. - if (c == CLOSE_PAREN) { - - // pull the exit point list off the stack, merge it with the current - // decision point list, and make the merged version the current - // decision point list - UVector* exitPoints = (UVector*)decisionPointStack.pop(); - for (int32_t i = 0; i < exitPoints->size(); i++) - decisionPointList.addElement((*exitPoints)[i]); - delete exitPoints; - - // if the ) isn't followed by a *, then all we have to do is throw - // away the other list on the decision point stack, and we're done - if (p + 1 >= rule.length() || ( - rule[p + 1] != ASTERISK && - rule[p + 1] != PLUS && - rule[p + 1] != QUESTION) - ) { - delete (UVector*)decisionPointStack.pop(); - } - - // but if the sequence repeats, we have a lot more work to do... - else { - - // now exitPoints and decisionPointList have to point to equivalent - // vectors, but not the SAME vector - exitPoints = new UVector; - for (int32_t i = 0; i < decisionPointList.size(); i++) - exitPoints->addElement(decisionPointList[i]); - - // pop the original decision point list off the stack - UVector* temp2 = (UVector*)decisionPointStack.pop(); - - // we squirreled away the row number of our entry point list - // at the beginning of the original decision point list. Fish - // that state number out and retrieve the entry point list - int32_t tempStateNum = (int32_t)temp2->firstElement(); - int16_t* tempState = (int16_t*)tempStateTable.elementAt(tempStateNum); - - // merge the original decision point list with the current - // decision point list - if (rule.charAt(p + 1) == QUESTION || rule.charAt(p + 1) == ASTERISK) { - for (int32_t i = 0; i < temp2->size(); i++) - decisionPointList.addElement((*temp2)[i]); - delete temp2; - } - - // finally, copy every forward reference from the entry point - // list into every state in the new decision point list - if (rule[p + 1] == PLUS || rule[p + 1] == ASTERISK) { - for (int32_t i = 0; i < numCategories; i++) { - if (tempState[i] > tempStateNum) { - updateStateTable(*exitPoints, - UnicodeString((UChar)(i + 0x100)), - tempState[i]); - } - } - } - - // update lastState and currentState, and throw away the * - lastState = currentState; - currentState = tempStateTable.size() - 1; - ++p; - delete exitPoints; - } - } - - // a / marks the position where the break is to go if the character sequence - // matches this rule. We update the flag word of every state on the decision - // point list to mark them as ending states, and take note of the fact that - // we've seen the slash - if (c == SLASH) { - sawEarlyBreak = TRUE; - for (int32_t i = 0; i < decisionPointList.size(); i++) { - state = (int16_t*)tempStateTable.elementAt((int32_t)decisionPointList[i]); - state[numCategories] |= LOOKAHEAD_STATE_FLAG; - } - } - - // if we get here without executing any of the above clauses, we have a - // syntax error. However, for now we just ignore the offending character - // and move on -/* -debugPrintln("====Parsed \"" + rule.substring(0, p + 1) + "\"..."); -System.out.println(" currentState = " + currentState); -debugPrintVectorOfVectors(" decisionPointStack:", " ", decisionPointStack); -debugPrintVector(" ", decisionPointList); -debugPrintVector(" loopingStates = ", loopingStates); -debugPrintVector(" statesToBackfill = ", statesToBackfill); -System.out.println(" sawEarlyBreak = " + sawEarlyBreak); -debugPrintTempStateTable(); -*/ - - // clearLoopingStates is a signal back from updateStateTable() that we've - // transitioned to a state that won't loop back to the current looping - // state. (In other words, we've gotten to a point where we can no longer - // go back into a *? we saw earlier.) Clear out the list of looping states - // and backfill any states that need to be backfilled. - if (clearLoopingStates) { - setLoopingStates(0, decisionPointList); - } - - // advance to the next character, now that we've processed the current - // character - ++p; - } - - // this takes care of backfilling any states that still need to be backfilled - setLoopingStates(0, decisionPointList); - - // when we reach the end of the string, we do a postprocessing step to mark the - // end states. The decision point list contains every state that can transition - // to the end state-- that is, every state that is the last state in a sequence - // that matches the rule. All of these states are considered "mark states" - // or "accepting states"-- that is, states that cause the position returned from - // next() to be updated. A mark state represents a possible break position. - // This allows us to look ahead and remember how far the rule matched - // before following the new branch (see next() for more information). - // The temporary state table has an extra "flag column" at the end where this - // information is stored. We mark the end states by setting a flag in their - // flag column. - // Now if we saw the / in the rule, then everything after it is lookahead - // material and the break really goes where the slash is. In this case, - // we mark these states as BOTH accepting states and lookahead states. This - // signals that these states cause the break position to be updated to the - // position of the slash rather than the current break position. - for (int32_t i = 0; i < decisionPointList.size(); i++) { - int32_t rowNum = (int32_t)decisionPointList[i]; - state = (int16_t*)tempStateTable[rowNum]; - state[numCategories] |= END_STATE_FLAG; - if (sawEarlyBreak) { - state[numCategories] |= LOOKAHEAD_STATE_FLAG; - } - } -/* -debugPrintln("====Parsed \"" + rule + ";"); -System.out.println(); -System.out.println(" currentState = " + currentState); -debugPrintVectorOfVectors(" decisionPointStack:", " ", decisionPointStack); -debugPrintVector(" ", decisionPointList); -debugPrintVector(" loopingStates = ", loopingStates); -debugPrintVector(" statesToBackfill = ", statesToBackfill); -System.out.println(" sawEarlyBreak = " + sawEarlyBreak); -debugPrintTempStateTable(); -*/ -} - -/** - * Update entries in the state table, and merge states when necessary to keep - * the table deterministic. - * @param rows The list of rows that need updating (the decision point list) - * @param pendingChars A character category list, encoded in a String. This is the - * list of the columns that need updating. - * @param newValue Update the cells specfied above to contain this value - */ -void -RuleBasedBreakIteratorBuilder::updateStateTable(const UVector& rows, - const UnicodeString& pendingChars, - int16_t newValue) -{ - // create a dummy state that has the specified row number (newValue) in - // the cells that need to be updated (those specified by pendingChars) - // and 0 in the other cells - int16_t* newValues = new int16_t[numCategories + 1]; - for (int32_t i = 0; i < pendingChars.length(); i++) - newValues[(int32_t)(pendingChars[i]) - 0x100] = newValue; - - // go through the list of rows to update, and update them by calling - // mergeStates() to merge them the the dummy state we created - for (int32_t i = 0; i < rows.size(); i++) { - mergeStates((int32_t)rows[i], newValues, rows); - } -} - -/** - * The real work of making the state table deterministic happens here. This function - * merges a state in the state table (specified by rowNum) with a state that is - * passed in (newValues). The basic process is to copy the nonzero cells in newStates - * into the state in the state table (we'll call that oldValues). If there's a - * collision (i.e., if the same cell has a nonzero value in both states, and it's - * not the SAME value), then we have to reconcile the collision. We do this by - * creating a new state, adding it to the end of the state table, and using this - * function recursively to merge the original two states into a single, combined - * state. This process may happen recursively (i.e., each successive level may - * involve collisions). To prevent infinite recursion, we keep a log of merge - * operations. Any time we're merging two states we've merged before, we can just - * supply the row number for the result of that merge operation rather than creating - * a new state just like it. - * @param rowNum The row number in the state table of the state to be updated - * @param newValues The state to merge it with. - * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable() - * (itself a copy of the decision point list from parseRule()). Newly-created - * states get added to the decision point list if their "parents" were on it. - */ -void -RuleBasedBreakIteratorBuilder::mergeStates(int32_t rowNum, - int16_t* newValues, - const UVector& rowsBeingUpdated) -{ - int16_t* oldValues = (int16_t*)(tempStateTable[rowNum]); -/* -System.out.print("***Merging " + rowNum + ":"); -for (int32_t i = 0; i < oldValues.length; i++) System.out.print("\t" + oldValues[i]); -System.out.println(); -System.out.print(" with \t"); -for (int32_t i = 0; i < newValues.length; i++) System.out.print("\t" + newValues[i]); -System.out.println(); -*/ - - UBool isLoopingState = loopingStates.contains((void*)rowNum); - - // for each of the cells in the rows we're reconciling, do... - for (int32_t i = 0; i < numCategories; i++) { - - // if they contain the same value, we don't have to do anything - if (oldValues[i] == newValues[i]) { - continue; - } - - // if oldValues is a looping state and the state the current cell points to - // is too, then we can just stomp over the current value of that cell (and - // set the clear-looping-states flag if necessary) - else if (isLoopingState && loopingStates.contains((void*)oldValues[i])) { - if (newValues[i] != 0) { - if (oldValues[i] == 0) { - clearLoopingStates = TRUE; - } - oldValues[i] = newValues[i]; - } - } - - // if the current cell in oldValues is 0, copy in the corresponding value - // from newValues - else if (oldValues[i] == 0) { - oldValues[i] = newValues[i]; - } - - // the last column of each row is the flag column. Take care to merge the - // flag words correctly - else if (i == numCategories) { - oldValues[i] = (int16_t)((newValues[i] & ALL_FLAGS) | oldValues[i]); - } - - // if both newValues and oldValues have a nonzero value in the current - // cell, and it isn't the same value both places... - else if (oldValues[i] != 0 && newValues[i] != 0) { - - // look up this pair of cell values in the merge list. If it's - // found, update the cell in oldValues to point to the merged state - int32_t combinedRowNum = searchMergeList(oldValues[i], newValues[i]); - if (combinedRowNum != 0) { - oldValues[i] = (int16_t)combinedRowNum; - } - - // otherwise, we have to reconcile them... - else { - // copy our row numbers into variables to make things easier - int32_t oldRowNum = oldValues[i]; - int32_t newRowNum = newValues[i]; - combinedRowNum = tempStateTable.size(); - - // add this pair of row numbers to the merge list (create it first - // if we haven't created the merge list yet) - int32_t* entry = new int32_t[3]; - entry[0] = oldRowNum; - entry[1] = newRowNum; - entry[2] = combinedRowNum; - mergeList.addElement(entry); - -//System.out.println("***At " + rowNum + ", merging " + oldRowNum + " and " + newRowNum + " into " + combinedRowNum); - - // create a new row to represent the merged state, and copy the - // contents of oldRow into it, then add it to the end of the - // state table and update the original row (oldValues) to point - // to the new, merged, state - int16_t* newRow = new int16_t[numCategories + 1]; - int16_t* oldRow = (int16_t*)(tempStateTable[oldRowNum]); - uprv_memcpy(newRow, oldRow, (numCategories + 1) * sizeof int16_t); - tempStateTable.addElement(newRow); - oldValues[i] = (int16_t)combinedRowNum; - - -//System.out.println("lastOldRowNum = " + lastOldRowNum); -//System.out.println("lastCombinedRowNum = " + lastCombinedRowNum); -//System.out.println("decisionPointList.contains(lastOldRowNum) = " + decisionPointList.contains(new Integer(lastOldRowNum))); -//System.out.println("decisionPointList.contains(lastCombinedRowNum) = " + decisionPointList.contains(new Integer(lastCombinedRowNum))); - - // if the decision point list contains either of the parent rows, - // update it to include the new row as well - if ((decisionPointList.contains((void*)oldRowNum) - || decisionPointList.contains((void*)newRowNum)) - && !decisionPointList.contains((void*)combinedRowNum) - ) { - decisionPointList.addElement((void*)combinedRowNum); - } - - // do the same thing with the list of rows being updated - if ((rowsBeingUpdated.contains((void*)oldRowNum) - || rowsBeingUpdated.contains((void*)newRowNum)) - && !rowsBeingUpdated.contains((void*)combinedRowNum) - ) { - decisionPointList.addElement((void*)combinedRowNum); - } - // now (groan) do the same thing for all the entries on the - // decision point stack - for (int32_t k = 0; k < decisionPointStack.size(); k++) { - UVector* dpl = (UVector*)decisionPointStack[k]; - if ((dpl->contains((void*)oldRowNum) - || dpl->contains((void*)newRowNum)) - && !dpl->contains((void*)combinedRowNum) - ) { - dpl->addElement((void*)combinedRowNum); - } - } - - // FINALLY (puff puff puff), call mergeStates() recursively to copy - // the row referred to by newValues into the new row and resolve any - // conflicts that come up at that level - mergeStates(combinedRowNum, (int16_t*)(tempStateTable.elementAt( - newValues[i])), rowsBeingUpdated); - } - } - } -} - -/** - * The merge list is a list of pairs of rows that have been merged somewhere in - * the process of building this state table, along with the row number of the - * row containing the merged state. This function looks up a pair of row numbers - * and returns the row number of the row they combine into. (It returns 0 if - * this pair of rows isn't in the merge list.) - */ -int32_t -RuleBasedBreakIteratorBuilder::searchMergeList(int32_t a, int32_t b) -{ - int32_t* entry; - for (int32_t i = 0; i < mergeList.size(); i++) { - entry = (int32_t*)(mergeList[i]); - - // we have a hit if the two row numbers match the two row numbers - // in the beginning of the entry (the two that combine), in either - // order - if ((entry[0] == a && entry[1] == b) || (entry[0] == b && entry[1] == a)) { - return entry[2]; - } - - // we also have a hit if one of the two row numbers matches the marged - // row number and the other one matches one of the original row numbers - if ((entry[2] == a && (entry[0] == b || entry[1] == b))) { - return entry[2]; - } - if ((entry[2] == b && (entry[0] == a || entry[1] == a))) { - return entry[2]; - } - } - return 0; -} - -/** - * This function is used to update the list of current loooping states (i.e., - * states that are controlled by a *? construct). It backfills values from - * the looping states into unpopulated cells of the states that are currently - * marked for backfilling, and then updates the list of looping states to be - * the new list - * @param newLoopingStates The list of new looping states - * @param endStates The list of states to treat as end states (states that - * can exit the loop). - */ -void -RuleBasedBreakIteratorBuilder::setLoopingStates(const UVector* newLoopingStates, - const UVector& endStates) -{ - // if the current list of looping states isn't empty, we have to backfill - // values from the looping states into the states that are waiting to be - // backfilled - if (!loopingStates.isEmpty()) { - int32_t loopingState = (int32_t)loopingStates.lastElement(); - int32_t rowNum; - - // don't backfill into an end state OR any state reachable from an end state - // (since the search for reachable states is recursive, it's split out into - // a separate function, eliminateBackfillStates(), below) - for (int32_t i = 0; i < endStates.size(); i++) { - eliminateBackfillStates((int32_t)endStates[i]); - } - - // we DON'T actually backfill the states that need to be backfilled here. - // Instead, we MARK them for backfilling. The reason for this is that if - // there are multiple rules in the state-table description, the looping - // states may have some of their values changed by a succeeding rule, and - // this wouldn't be reflected in the backfilled states. We mark a state - // for backfilling by putting the row number of the state to copy from - // into the flag cell at the end of the row - for (int32_t i = 0; i < statesToBackfill.size(); i++) { - rowNum = (int32_t)statesToBackfill.elementAt(i); - int16_t* state = (int16_t*)tempStateTable[rowNum]; - state[numCategories] = - (int16_t)((state[numCategories] & ALL_FLAGS) | loopingState); - } - statesToBackfill.removeAllElements(); - loopingStates.removeAllElements(); - } - - if (newLoopingStates != 0) { - for (int32_t i = 0; i < newLoopingStates->size(); i++) { - loopingStates.addElement((*newLoopingStates)[i]); - } - } -} - -/** - * This removes "ending states" and states reachable from them from the - * list of states to backfill. - * @param The row number of the state to remove from the backfill list - */ -void -RuleBasedBreakIteratorBuilder::eliminateBackfillStates(int32_t baseState) -{ - // don't do anything unless this state is actually in the backfill list... - if (statesToBackfill.contains((void*)baseState)) { - - // if it is, take it out - statesToBackfill.removeElement((void*)baseState); - - // then go through and recursively call this function for every - // state that the base state points to - int16_t* state = (int16_t*)tempStateTable[baseState]; - for (int32_t i = 0; i < numCategories; i++) { - if (state[i] != 0) { - eliminateBackfillStates(state[i]); - } - } - } -} - -/** - * This function completes the backfilling process by actually doing the - * backfilling on the states that are marked for it - */ -void -RuleBasedBreakIteratorBuilder::backfillLoopingStates(void) -{ - int16_t* state; - int16_t* loopingState = 0; - int32_t loopingStateRowNum = 0; - int32_t fromState; - - // for each state in the state table... - for (int32_t i = 0; i < tempStateTable.size(); i++) { - state = (int16_t*)tempStateTable[i]; - - // check the state's flag word to see if it's marked for backfilling - // (it's marked for backfilling if any bits other than the two high-order - // bits are set-- if they are, then the flag word, minus the two high bits, - // is the row number to copy from) - fromState = state[numCategories] & ~ALL_FLAGS; - if (fromState > 0) { - - // load up the state to copy from (if we haven't already) - if (fromState != loopingStateRowNum) { - loopingStateRowNum = fromState; - loopingState = (int16_t*)tempStateTable[loopingStateRowNum]; - } - - // clear out the backfill part of the flag word - state[numCategories] &= ALL_FLAGS; - - // then fill all zero cells in the current state with values - // from the corresponding cells of the fromState - for (int32_t j = 0; j < numCategories + 1; j++) { - if (state[j] == 0) { - state[j] = loopingState[j]; - } - else if (state[j] == DONT_LOOP_FLAG) { - state[j] = 0; - } - } - } - } -} - -/** - * This function completes the state-table-building process by doing several - * postprocessing steps and copying everything into its final resting place - * in the iterator itself - * @param forward TRUE if we're working on the forward state table - */ -void -RuleBasedBreakIteratorBuilder::finishBuildingStateTable(UBool forward) -{ -//debugPrintTempStateTable(); - // start by backfilling the looping states - backfillLoopingStates(); -//debugPrintTempStateTable(); - - int32_t* rowNumMap = new int32_t[tempStateTable.size()]; - int32_t rowNumMapSize = tempStateTable.size(); - UStack rowsToFollow; - rowsToFollow.push((void*)1); - rowNumMap[1] = 1; - - // determine which states are no longer reachable from the start state - // (the reachable states will have their row numbers in the row number - // map, and the nonreachable states will have zero in the row number map) - while (rowsToFollow.size() != 0) { - int32_t rowNum = (int32_t)rowsToFollow.pop(); - int16_t* row = (int16_t*)(tempStateTable[rowNum]); - - for (int32_t i = 0; i < numCategories; i++) { - if (row[i] != 0) { - if (rowNumMap[row[i]] == 0) { - rowNumMap[row[i]] = row[i]; - rowsToFollow.push((void*)row[i]); - } - } - } - } -/* -System.out.println("The following rows are not reachable:"); -for (int32_t i = 1; i < rowNumMap.length; i++) -if (rowNumMap[i] == 0) System.out.print("\t" + i); -System.out.println(); -*/ - - int32_t newRowNum; - - // algorithm for minimizing the number of states in the table adapted from - // Aho & Ullman, "Principles of Compiler Design" - // The basic idea here is to organize the states into classes. When we're done, - // all states in the same class can be considered identical and all but one eliminated. - - // initially assign states to classes based on the number of populated cells they - // contain (the class number is the number of populated cells) - int32_t* stateClasses = new int32_t[tempStateTable.size()]; - int32_t nextClass = numCategories + 1; - int16_t* state1 = 0; - int16_t* state2 = 0; - for (int32_t i = 1; i < tempStateTable.size(); i++) { - if (rowNumMap[i] == 0) { - continue; - } - state1 = (int16_t*)tempStateTable[i]; - for (int32_t j = 0; j < numCategories; j++) { - if (state1[j] != 0) { - ++stateClasses[i]; - } - } - if (stateClasses[i] == 0) { - stateClasses[i] = nextClass; - } - } - ++nextClass; - - // then, for each class, elect the first member of that class as that class's - // "representative". For each member of the class, compare it to the "representative." - // If there's a column position where the state being tested transitions to a - // state in a DIFFERENT class from the class where the "representative" transitions, - // then move the state into a new class. Repeat this process until no new classes - // are created. - int32_t currentClass; - int32_t lastClass; - UBool split; - - do { -//System.out.println("Making a pass..."); - currentClass = 1; - lastClass = nextClass; - while (currentClass < nextClass) { -//System.out.print("States in class #" + currentClass +":"); - split = FALSE; - state1 = state2 = 0; - for (int32_t i = 0; i < tempStateTable.size(); i++) { - if (stateClasses[i] == currentClass) { -//System.out.print("\t" + i); - if (state1 == 0) { - state1 = (int16_t*)tempStateTable[i]; - } - else { - state2 = (int16_t*)tempStateTable[i]; - for (int32_t j = 0; j < numCategories + 1; j++) { - if ((j == numCategories && state1[j] != state2[j] && forward) - || (j != numCategories && stateClasses[state1[j]] - != stateClasses[state2[j]])) { - stateClasses[i] = nextClass; - split = TRUE; - break; - } - } - } - } - } - if (split) { - ++nextClass; - } - ++currentClass; -//System.out.println(); - } - } while (lastClass != nextClass); - - // at this point, all of the states in a class except the first one (the - //"representative") can be eliminated, so update the row-number map accordingly - int32_t* representatives = new int32_t[nextClass]; - for (int32_t i = 1; i < tempStateTable.size(); i++) { - if (representatives[stateClasses[i]] == 0) { - representatives[stateClasses[i]] = i; - } - else { - rowNumMap[i] = representatives[stateClasses[i]]; - } - } - delete [] stateClasses; - delete [] representatives; -//System.out.println("Renumbering..."); - - // renumber all remaining rows... - // first drop all that are either unreferenced or not a class representative - for (int32_t i = 1; i < rowNumMapSize; i++) { - if (rowNumMap[i] != i) { - delete [] tempStateTable[i]; - tempStateTable.setElementAt(0, i); - } - } - - // then calculate everybody's new row number and update the row - // number map appropriately (the first pass updates the row numbers - // of all the class representatives [the rows we're keeping], and the - // second pass updates the cross references for all the rows that - // are being deleted) - newRowNum = 1; - for (int32_t i = 1; i < rowNumMapSize; i++) { - if (tempStateTable[i] != 0) { - rowNumMap[i] = newRowNum++; - } - } - for (int32_t i = 1; i < rowNumMapSize; i++) { - if (tempStateTable[i] == 0) { - rowNumMap[i] = rowNumMap[rowNumMap[i]]; - } - } -//for (int32_t i = 1; i < rowNumMap.length; i++) rowNumMap[i] = i; int32_t newRowNum = rowNumMap.length; - - // allocate the permanent state table, and copy the remaining rows into it - // (adjusting all the cell values, of course) - - // this section does that for the forward state table - if (forward) { - tables->endStates = new UBool[newRowNum]; - tables->lookaheadStates = new UBool[newRowNum]; - tables->stateTable = new int16_t[newRowNum * numCategories]; - int32_t p = 0; - int32_t p2 = 0; - for (int32_t i = 0; i < tempStateTable.size(); i++) { - int16_t* row = (int16_t*)(tempStateTable[i]); - if (row == 0) { - continue; - } - for (int32_t j = 0; j < numCategories; j++) { - tables->stateTable[p] = (int16_t)(rowNumMap[row[j]]); - ++p; - } - tables->endStates[p2] = ((row[numCategories] & END_STATE_FLAG) != 0); - tables->lookaheadStates[p2] = ((row[numCategories] & LOOKAHEAD_STATE_FLAG) != 0); - ++p2; - } - } - - // and this section does it for the backward state table - else { - tables->backwardsStateTable = new int16_t[newRowNum * numCategories]; - int32_t p = 0; - for (int32_t i = 0; i < tempStateTable.size(); i++) { - int16_t* row = (int16_t*)(tempStateTable[i]); - if (row == 0) { - continue; - } - for (int32_t j = 0; j < numCategories; j++) { - tables->backwardsStateTable[p] = (int16_t)(rowNumMap[row[j]]); - ++p; - } - } - } - - delete [] rowNumMap; -} - -/** - * This function builds the backward state table from the forward state - * table and any additional rules (identified by the ! on the front) - * supplied in the description - */ -void -RuleBasedBreakIteratorBuilder::buildBackwardsStateTable(UErrorCode& err) -{ - if (U_FAILURE(err)) - return; - - // create the temporary state table and seed it with two rows (row 0 - // isn't used for anything, and we have to create row 1 (the initial - // state) before we can do anything else - tempStateTable.removeAllElements(); - tempStateTable.addElement(new int16_t[numCategories + 1]); - tempStateTable.addElement(new int16_t[numCategories + 1]); - - // although the backwards state table is built automatically from the forward - // state table, there are some situations (the default sentence-break rules, - // for example) where this doesn't yield enough stop states, causing a dramatic - // drop in performance. To help with these cases, the user may supply - // supplemental rules that are added to the backward state table. These have - // the same syntax as the normal break rules, but begin with BANG to distinguish - // them from normal break rules - for (int32_t i = 0; i < tempRuleList.size(); i++) { - UnicodeString* rule = (UnicodeString*)tempRuleList[i]; - if ((*rule)[0] == BANG) { - rule->remove(0, 1); - parseRule(*rule, FALSE); - } - } - backfillLoopingStates(); - - // Backwards iteration is qualitatively different from forwards iteration. - // This is because backwards iteration has to be made to operate from no context - // at all-- the user should be able to ask BreakIterator for the break position - // immediately on either side of some arbitrary offset in the text. The - // forward iteration table doesn't let us do that-- it assumes complete - // information on the context, which means starting from the beginning of the - // document. - // The way we do backward and random-access iteration is to back up from the - // current (or user-specified) position until we see something we're sure is - // a break position (it may not be the last break position immediately - // preceding our starting point, however). Then we roll forward from there to - // locate the actual break position we're after. - // This means that the backwards state table doesn't have to identify every - // break position, allowing the building algorithm to be much simpler. Here, - // we use a "pairs" approach, scanning the forward-iteration state table for - // pairs of character categories we ALWAYS break between, and building a state - // table from that information. No context is required-- all this state table - // looks at is a pair of adjacent characters. - - // It's possible that the user has supplied supplementary rules (see above). - // This has to be done first to keep parseRule() and friends from becoming - // EVEN MORE complicated. The automatically-generated states are appended - // onto the end of the state table, and then the two sets of rules are - // stitched together at the end. Take note of the row number of the - // first row of the auromatically-generated part. - int32_t backTableOffset = tempStateTable.size(); - if (backTableOffset > 2) { - ++backTableOffset; - } - - // the automatically-generated part of the table models a two-dimensional - // array where the two dimensions represent the two characters we're currently - // looking at. To model this as a state table, we actually need one additional - // row to represent the initial state. It gets populated with the row numbers - // of the other rows (in order). - for (int32_t i = 0; i < numCategories + 1; i++) - tempStateTable.addElement(new int16_t[numCategories + 1]); - - int16_t* state = (int16_t*)tempStateTable[backTableOffset - 1]; - for (int32_t i = 0; i < numCategories; i++) - state[i] = (int16_t)(i + backTableOffset); - - // scavenge the forward state table for pairs of character categories - // that always have a break between them. The algorithm is as follows: - // Look down each column in the state table. For each nonzero cell in - // that column, look up the row it points to. For each nonzero cell in - // that row, populate a cell in the backwards state table: the row number - // of that cell is the number of the column we were scanning (plus the - // offset that locates this sub-table), and the column number of that cell - // is the column number of the nonzero cell we just found. This cell is - // populated with its own column number (adjusted according to the actual - // location of the sub-table). This process will produce a state table - // whose behavior is the same as looking up successive pairs of characters - // in an array of Booleans to determine whether there is a break. - int32_t numRows = tempStateTable.size() / numCategories; - for (int32_t column = 0; column < numCategories; column++) { - for (int32_t row = 0; row < numRows; row++) { - int32_t nextRow = tables->lookupState(row, column); - if (nextRow != 0) { - for (int32_t nextColumn = 0; nextColumn < numCategories; nextColumn++) { - int32_t cellValue = tables->lookupState(nextRow, nextColumn); - if (cellValue != 0) { - state = (int16_t*)tempStateTable[nextColumn + backTableOffset]; - state[column] = (int16_t)(column + backTableOffset); - } - } - } - } - } - -//debugPrintTempStateTable(); - // if the user specified some backward-iteration rules with the ! token, - // we have to merge the resulting state table with the auto-generated one - // above. First copy the populated cells from row 1 over the populated - // cells in the auto-generated table. Then copy values from row 1 of the - // auto-generated table into all of the the unpopulated cells of the - // rule-based table. - if (backTableOffset > 1) { - - // for every row in the auto-generated sub-table, if a cell is - // populated that is also populated in row 1 of the rule-based - // sub-table, copy the value from row 1 over the value in the - // auto-generated sub-table - state = (int16_t*)tempStateTable[1]; - for (int32_t i = backTableOffset - 1; i < tempStateTable.size(); i++) { - int16_t* state2 = (int16_t*)tempStateTable[i]; - for (int32_t j = 0; j < numCategories; j++) { - if (state[j] != 0 && state2[j] != 0) { - state2[j] = state[j]; - } - } - } - - // now, for every row in the rule-based sub-table that is not - // an end state, fill in all unpopulated cells with the values - // of the corresponding cells in the first row of the auto- - // generated sub-table. - state = (int16_t*)tempStateTable[backTableOffset - 1]; - for (int32_t i = 1; i < backTableOffset - 1; i++) { - int16_t* state2 = (int16_t*)tempStateTable[i]; - if ((state2[numCategories] & END_STATE_FLAG) == 0) { - for (int32_t j = 0; j < numCategories; j++) { - if (state2[j] == 0) { - state2[j] = state[j]; - } - } - } - } - } - -//debugPrintTempStateTable(); - - // finally, clean everything up and copy it into the actual BreakIterator - // by calling finishBuildingStateTable() - finishBuildingStateTable(FALSE); -/* -System.out.print("C:\t"); -for (int32_t i = 0; i < numCategories; i++) -System.out.print(Integer.toString(i) + "\t"); -System.out.println(); System.out.print("================================================="); -for (int32_t i = 0; i < backwardsStateTable.length; i++) { -if (i % numCategories == 0) { -System.out.println(); -System.out.print(Integer.toString(i / numCategories) + ":\t"); -} -if (backwardsStateTable[i] == 0) System.out.print(".\t"); else System.out.print(Integer.toString(backwardsStateTable[i]) + "\t"); -} -System.out.println(); -*/ -} - -void -RuleBasedBreakIteratorBuilder::setUpErrorMessage(const UnicodeString& message, - int32_t position, - const UnicodeString& context) -{ - static UChar lbrks[] = { 0x000a, 0x000a }; - - errorMessage = context; - errorMessage.insert(position, lbrks, 2); - errorMessage.insert(0, lbrks, 1); - errorMessage.insert(0, message); -} diff --git a/icu4c/source/i18n/rbbi_bld.h b/icu4c/source/i18n/rbbi_bld.h deleted file mode 100644 index d390aebc29f..00000000000 --- a/icu4c/source/i18n/rbbi_bld.h +++ /dev/null @@ -1,358 +0,0 @@ -/* -* Copyright (C) {1999}, International Business Machines Corporation and others. All Rights Reserved. -********************************************************************** -* Date Name Description -* 12/15/99 rgillam Port from Java. -********************************************************************** -*/ - -#ifndef RBBI_BLD_H -#define RBBI_BLD_H - -#include "rbbi.h" -#include "rbbi_tbl.h" -#include "unicode/uniset.h" -#include "uvector.h" - -class ExpressionList; - -//======================================================================= -// RuleBasedBreakIterator.Builder -//======================================================================= -/** - * The Builder class has the job of constructing a RuleBasedBreakIterator from a - * textual description. A Builder is constructed by RuleBasedBreakIterator's - * constructor, which uses it to construct the iterator itself and then throws it - * away. - *

The construction logic is separated out into its own class for two primary - * reasons: - *

  • The construction logic is quite complicated and large. Separating it - * out into its own class means the code must only be loaded into memory while a - * RuleBasedBreakIterator is being constructed, and can be purged after that. - *
  • There is a fair amount of state that must be maintained throughout the - * construction process that is not needed by the iterator after construction. - * Separating this state out into another class prevents all of the functions that - * construct the iterator from having to have really long parameter lists, - * (hopefully) contributing to readability and maintainability.
- *

It'd be really nice if this could be an independent class rather than an - * inner class, because that would shorten the source file considerably, but - * making Builder an inner class of RuleBasedBreakIterator allows it direct access - * to RuleBasedBreakIterator's private members, which saves us from having to - * provide some kind of "back door" to the Builder class that could then also be - * used by other classes. - */ -class RuleBasedBreakIteratorBuilder { - -protected: - /** - * The iterator we're constructing. - */ - RuleBasedBreakIterator& iterator; - - /** - * The tables object for the iterator we're constructing. - */ - RuleBasedBreakIteratorTables* tables; - - /** - * A temporary place to hold the rules as they're being processed. - */ - UVector tempRuleList; - - /** - * A temporary holding place used for calculating the character categories. - * This object contains UnicodeSet objects. - */ - UVector categories; - - /** - * The number of categories (and thus the number of columns in the finished state tables) - */ - int32_t numCategories; - - /** - * A table used to map parts of regexp text to lists of character categories, - * rather than having to figure them out from scratch each time - */ - ExpressionList* expressions; - - /** - * A temporary holding place for the list of ignore characters - */ - UnicodeSet ignoreChars; - - /** - * A temporary holding place where the forward state table is built - */ - UVector tempStateTable; - - /** - * A list of all the states that have to be filled in with transitions to the - * next state that is created. Used when building the state table from the - * regular expressions. - */ - UVector decisionPointList; - - /** - * A UStack for holding decision point lists. This is used to handle nested - * parentheses and braces in regexps. - */ - UStack decisionPointStack; - - /** - * A list of states that loop back on themselves. Used to handle .*? - */ - UVector loopingStates; - - /** - * Looping states actually have to be backfilled later in the process - * than everything else. This is where a the list of states to backfill - * is accumulated. This is also used to handle .*? - */ - UVector statesToBackfill; - - /** - * A list mapping pairs of state numbers for states that are to be combined - * to the state number of the state representing their combination. Used - * in the process of making the state table deterministic to prevent - * infinite recursion. - */ - UVector mergeList; - - /** - * A flag that is used to indicate when the list of looping states can - * be reset. - */ - UBool clearLoopingStates; - - /** - * A place where an error message can be stored if we get a parse error. - * The error message is never displayed anywhere, so this is useful pretty - * much only in conjunction with a debugger. - */ - UnicodeString errorMessage; - - /** - * A bit mask used to indicate a bit in the table's flags column that marks a - * state as an accepting state. - */ - static const int32_t END_STATE_FLAG /*= 0x8000*/; - - /** - * A bit mask used to indicate a bit in the table's flags column that marks a - * state as one the builder shouldn't loop to any looping states - */ - static const int32_t DONT_LOOP_FLAG /*= 0x4000*/; - - /** - * A bit mask used to indicate a bit in the table's flags column that marks a - * state as a lookahead state. - */ - static const int32_t LOOKAHEAD_STATE_FLAG /*= 0x2000*/; - - /** - * A bit mask representing the union of the mask values listed above. - * Used for clearing or masking off the flag bits. - */ - static const int32_t ALL_FLAGS /*= END_STATE_FLAG | LOOKAHEAD_STATE_FLAG - | DONT_LOOP_FLAG*/; - -public: - - /** - * The Builder class contains a reference to the iterator it's supposed to build. - */ - RuleBasedBreakIteratorBuilder(RuleBasedBreakIterator& iteratorToBuild); - - /** - * Destructor. - */ - ~RuleBasedBreakIteratorBuilder(); - - /** - * This is the main function for setting up the BreakIterator's tables. It - * just vectors different parts of the job off to other functions. - */ - virtual void buildBreakIterator(const UnicodeString& description, - UErrorCode& err); - -private: - - /** - * Thus function has three main purposes: - *

  • Perform general syntax checking on the description, so the rest of the - * build code can assume that it's parsing a legal description. - *
  • Split the description into separate rules - *
  • Perform variable-name substitutions (so that no one else sees variable names) - *
- */ - virtual void buildRuleList(UnicodeString& description, - UErrorCode& err); - -protected: - - /** - * This function performs variable-name substitutions. First it does syntax - * checking on the variable-name definition. If it's syntactically valid, it - * then goes through the remainder of the description and does a simple - * find-and-replace of the variable name with its text. (The variable text - * must be enclosed in either [] or () for this to work.) - */ - virtual void processSubstitution(UnicodeString& description, - int32_t ruleStart, - int32_t ruleEnd, - int32_t startPos, - UErrorCode& err); - - /** - * This function defines a protocol for handling substitution names that - * are "special," i.e., that have some property beyond just being - * substitutions. At the RuleBasedBreakIterator level, we have one - * special substitution name, "". Subclasses can override this - * function to add more. Any special processing that has to go on beyond - * that which is done by the normal substitution-processing code is done - * here. - */ - virtual void handleSpecialSubstitution(const UnicodeString& replace, - const UnicodeString& replaceWith, - int32_t startPos, - const UnicodeString& description, - UErrorCode& err); - - /** - * This function provides a hook for subclasses to mess with the character - * category table. - */ - virtual void mungeExpressionList(); - - /** - * This function builds the character category table. On entry, - * tempRuleList is a UVector of break rules that has had variable names substituted. - * On exit, the charCategoryTable data member has been initialized to hold the - * character category table, and tempRuleList's rules have been munged to contain - * character category numbers everywhere a literal character or a [] expression - * originally occurred. - */ - virtual void buildCharCategories(UErrorCode& err); - -private: - - /** - * This is the function that builds the forward state table. Most of the real - * work is done in parseRule(), which is called once for each rule in the - * description. - */ - virtual void buildStateTable(UErrorCode& err); - - /** - * This is where most of the work really happens. This routine parses a single - * rule in the rule description, adding and modifying states in the state - * table according to the new expression. The state table is kept deterministic - * throughout the whole operation, although some ugly postprocessing is needed - * to handle the *? token. - */ - virtual void parseRule(const UnicodeString& rule, - UBool forward); - - /** - * Update entries in the state table, and merge states when necessary to keep - * the table deterministic. - * @param rows The list of rows that need updating (the decision point list) - * @param pendingChars A character category list, encoded in a String. This is the - * list of the columns that need updating. - * @param newValue Update the cells specfied above to contain this value - */ - virtual void updateStateTable(const UVector& rows, - const UnicodeString& pendingChars, - int16_t newValue); - - /** - * The real work of making the state table deterministic happens here. This function - * merges a state in the state table (specified by rowNum) with a state that is - * passed in (newValues). The basic process is to copy the nonzero cells in newStates - * into the state in the state table (we'll call that oldValues). If there's a - * collision (i.e., if the same cell has a nonzero value in both states, and it's - * not the SAME value), then we have to reconcile the collision. We do this by - * creating a new state, adding it to the end of the state table, and using this - * function recursively to merge the original two states into a single, combined - * state. This process may happen recursively (i.e., each successive level may - * involve collisions). To prevent infinite recursion, we keep a log of merge - * operations. Any time we're merging two states we've merged before, we can just - * supply the row number for the result of that merge operation rather than creating - * a new state just like it. - * @param rowNum The row number in the state table of the state to be updated - * @param newValues The state to merge it with. - * @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable() - * (itself a copy of the decision point list from parseRule()). Newly-created - * states get added to the decision point list if their "parents" were on it. - */ - virtual void mergeStates(int32_t rowNum, - int16_t* newValues, - const UVector& rowsBeingUpdated); - - /** - * The merge list is a list of pairs of rows that have been merged somewhere in - * the process of building this state table, along with the row number of the - * row containing the merged state. This function looks up a pair of row numbers - * and returns the row number of the row they combine into. (It returns 0 if - * this pair of rows isn't in the merge list.) - */ - virtual int32_t searchMergeList(int32_t a, int32_t b); - - /** - * This function is used to update the list of current loooping states (i.e., - * states that are controlled by a *? construct). It backfills values from - * the looping states into unpopulated cells of the states that are currently - * marked for backfilling, and then updates the list of looping states to be - * the new list - * @param newLoopingStates The list of new looping states - * @param endStates The list of states to treat as end states (states that - * can exit the loop). - */ - virtual void setLoopingStates(const UVector* newLoopingStates, - const UVector& endStates); - - /** - * This removes "ending states" and states reachable from them from the - * list of states to backfill. - * @param The row number of the state to remove from the backfill list - */ - virtual void eliminateBackfillStates(int32_t baseState); - - /** - * This function completes the backfilling process by actually doing the - * backfilling on the states that are marked for it - */ - virtual void backfillLoopingStates(void); - - /** - * This function completes the state-table-building process by doing several - * postprocessing steps and copying everything into its final resting place - * in the iterator itself - * @param forward True if we're working on the forward state table - */ - virtual void finishBuildingStateTable(UBool forward); - - /** - * This function builds the backward state table from the forward state - * table and any additional rules (identified by the ! on the front) - * supplied in the description - */ - virtual void buildBackwardsStateTable(UErrorCode& err); - -protected: - - /** - * Throws an IllegalArgumentException representing a syntax error in the rule - * description. The exception's message contains some debugging information. - * @param message A message describing the problem - * @param position The position in the description where the problem was - * discovered - * @param context The string containing the error - */ - virtual void setUpErrorMessage(const UnicodeString& message, - int32_t position, - const UnicodeString& context); -}; - -#endif diff --git a/icu4c/source/i18n/unicode/parseerr.h b/icu4c/source/i18n/unicode/parseerr.h deleted file mode 100644 index 71a175e2329..00000000000 --- a/icu4c/source/i18n/unicode/parseerr.h +++ /dev/null @@ -1,88 +0,0 @@ -/* -********************************************************************** -* Copyright (C) 1999-2000, International Business Machines -* Corporation and others. All Rights Reserved. -********************************************************************** -* Date Name Description -* 03/14/00 aliu Creation. -* 06/27/00 aliu Change from C++ class to C struct -********************************************************************** -*/ -#ifndef PARSEERR_H -#define PARSEERR_H - -#include "unicode/utypes.h" - - -/** - * The capacity of the context strings in UParseError. - * @draft ICU 2.0 - */ -enum { U_PARSE_CONTEXT_LEN = 16 }; - -/** - * A UParseError struct is used to returned detailed information about - * parsing errors. It is used by ICU parsing engines that parse long - * rules, patterns, or programs, where the text being parsed is long - * enough that more information than a UErrorCode is needed to - * localize the error. - * - *

The code field is an integer error code specific to each parsing - * engine, but globally unique. See the engine header file for - * possible values. The line, offset, and context fields are - * optional; parsing engines may choose not to use to use them. - * - *

Examples of engines which use UParseError (or may use it in the - * future) are RuleBasedTransliterator and RuleBasedBreakIterator. - * - * @draft ICU 2.0 - */ -typedef struct _UParseError { - - /** - * An integer indicating the type of error. If no error was - * encountered, the parse engine sets this to zero, and the - * other fields' values should be ignored. - * - *

Each parse engine should use a range of codes from - * 0xNNNN0001 to 0xNNNNFFFF, where NNNN is a 16-bit integer - * between 0x0001 and 0xFFFF unique to each parse engine. - * Parse engines should define the enum PARSE_ERROR_BASE - * to be 0xNNNN0000. - */ - /*int32_t code; */ - - /** - * The line on which the error occured. If the parse engine - * is not using this field, it should set it to zero. Otherwise - * it should be a positive integer. The default value of this field - * is -1. It will be set to 0 if the code populating this struct is not - * using line numbers. - */ - int32_t line; - - /** - * The character offset to the error. If the line field is - * being used, then this offset is from the start of the line. - * If the line field is not being used, then this offset is from - * the start of the text.The default value of this field - * is -1. It will be set to appropriate value by the code that - * populating the struct. - */ - int32_t offset; - - /** - * Textual context before the error. Null-terminated. - * May be the empty string if not implemented by parser. - */ - UChar preContext[U_PARSE_CONTEXT_LEN]; - - /** - * Textual context after the error. Null-terminated. - * May be the empty string if not implemented by parser. - */ - UChar postContext[U_PARSE_CONTEXT_LEN]; - -} UParseError; - -#endif