From dacfe88ad27b0ae6a3300cd9defdde6d27b6547c Mon Sep 17 00:00:00 2001 From: Ram Viswanadha Date: Thu, 21 Aug 2003 23:44:28 +0000 Subject: [PATCH] ICU-3064 StringPrep port take 1 X-SVN-Rev: 12908 --- .gitattributes | 6 + icu4j/build.xml | 72 +- icu4j/src/com/ibm/icu/dev/data/nfscis.spp | Bin 0 -> 20714 bytes icu4j/src/com/ibm/icu/dev/data/nfscsi.spp | Bin 0 -> 20206 bytes icu4j/src/com/ibm/icu/dev/data/nfscss.spp | Bin 0 -> 13328 bytes icu4j/src/com/ibm/icu/dev/data/nfsmxp.spp | Bin 0 -> 13880 bytes icu4j/src/com/ibm/icu/dev/data/nfsmxs.spp | Bin 0 -> 20714 bytes icu4j/src/com/ibm/icu/dev/test/TestFmwk.java | 22 +- icu4j/src/com/ibm/icu/dev/test/TestUtil.java | 14 +- .../dev/test/stringprep/IDNAReference.java | 408 +++++ .../icu/dev/test/stringprep/IDNA_rules.java | 1538 +++++++++++++++++ .../dev/test/stringprep/NFS4StringPrep.java | 173 ++ .../test/stringprep/NamePrepTransform.java | 172 ++ .../test/stringprep/PunycodeReference.java | 388 +++++ .../ibm/icu/dev/test/stringprep/TestAll.java | 42 + .../ibm/icu/dev/test/stringprep/TestData.java | 631 +++++++ .../ibm/icu/dev/test/stringprep/TestIDNA.java | 700 ++++++++ .../icu/dev/test/stringprep/TestIDNARef.java | 565 ++++++ .../dev/test/stringprep/TestStringPrep.java | 197 +++ icu4j/src/com/ibm/icu/impl/LocaleUtility.java | 11 +- .../ibm/icu/impl/StringPrepDataReader.java | 96 + icu4j/src/com/ibm/icu/impl/data/uidna.spp | Bin 0 -> 20898 bytes icu4j/src/com/ibm/icu/stringprep/IDNA.java | 918 ++++++++++ .../ibm/icu/stringprep/ParseException.java | 143 ++ .../src/com/ibm/icu/stringprep/Punycode.java | 467 +++++ .../com/ibm/icu/stringprep/StringPrep.java | 409 +++++ 26 files changed, 6936 insertions(+), 36 deletions(-) create mode 100644 icu4j/src/com/ibm/icu/dev/data/nfscis.spp create mode 100644 icu4j/src/com/ibm/icu/dev/data/nfscsi.spp create mode 100644 icu4j/src/com/ibm/icu/dev/data/nfscss.spp create mode 100644 icu4j/src/com/ibm/icu/dev/data/nfsmxp.spp create mode 100644 icu4j/src/com/ibm/icu/dev/data/nfsmxs.spp 
create mode 100644 icu4j/src/com/ibm/icu/dev/test/stringprep/IDNAReference.java create mode 100644 icu4j/src/com/ibm/icu/dev/test/stringprep/IDNA_rules.java create mode 100644 icu4j/src/com/ibm/icu/dev/test/stringprep/NFS4StringPrep.java create mode 100644 icu4j/src/com/ibm/icu/dev/test/stringprep/NamePrepTransform.java create mode 100644 icu4j/src/com/ibm/icu/dev/test/stringprep/PunycodeReference.java create mode 100644 icu4j/src/com/ibm/icu/dev/test/stringprep/TestAll.java create mode 100644 icu4j/src/com/ibm/icu/dev/test/stringprep/TestData.java create mode 100644 icu4j/src/com/ibm/icu/dev/test/stringprep/TestIDNA.java create mode 100644 icu4j/src/com/ibm/icu/dev/test/stringprep/TestIDNARef.java create mode 100644 icu4j/src/com/ibm/icu/dev/test/stringprep/TestStringPrep.java create mode 100644 icu4j/src/com/ibm/icu/impl/StringPrepDataReader.java create mode 100644 icu4j/src/com/ibm/icu/impl/data/uidna.spp create mode 100644 icu4j/src/com/ibm/icu/stringprep/IDNA.java create mode 100644 icu4j/src/com/ibm/icu/stringprep/ParseException.java create mode 100644 icu4j/src/com/ibm/icu/stringprep/Punycode.java create mode 100644 icu4j/src/com/ibm/icu/stringprep/StringPrep.java diff --git a/.gitattributes b/.gitattributes index 88d6744732d..ae0638909a6 100644 --- a/.gitattributes +++ b/.gitattributes @@ -56,6 +56,11 @@ icu4c/source/test/testdata/iscii.bin -text icu4c/source/test/testdata/uni-text.bin -text icu4j/src/com/ibm/icu/dev/data/ThaiWordFreq.xls -text icu4j/src/com/ibm/icu/dev/data/holidays_jp.ucs -text +icu4j/src/com/ibm/icu/dev/data/nfscis.spp -text +icu4j/src/com/ibm/icu/dev/data/nfscsi.spp -text +icu4j/src/com/ibm/icu/dev/data/nfscss.spp -text +icu4j/src/com/ibm/icu/dev/data/nfsmxp.spp -text +icu4j/src/com/ibm/icu/dev/data/nfsmxs.spp -text icu4j/src/com/ibm/icu/dev/data/rbbi/english.dict -text icu4j/src/com/ibm/icu/dev/data/thai6.ucs -text icu4j/src/com/ibm/icu/dev/test/perf/data/collation/TestNames_Asian.txt -text @@ -74,6 +79,7 @@ 
icu4j/src/com/ibm/icu/impl/data/ICULocaleData.jar -text icu4j/src/com/ibm/icu/impl/data/invuca.icu -text icu4j/src/com/ibm/icu/impl/data/pnames.icu -text icu4j/src/com/ibm/icu/impl/data/ucadata.icu -text +icu4j/src/com/ibm/icu/impl/data/uidna.spp -text icu4j/src/com/ibm/icu/impl/data/unames.icu -text icu4j/src/com/ibm/icu/impl/data/unorm.icu -text icu4j/src/com/ibm/icu/impl/data/uprops.icu -text diff --git a/icu4j/build.xml b/icu4j/build.xml index a400c9160a1..7cc85fa50d8 100644 --- a/icu4j/build.xml +++ b/icu4j/build.xml @@ -6,8 +6,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/build.xml,v $ -* $Date: 2003/07/03 20:48:58 $ -* $Revision: 1.75 $ +* $Date: 2003/08/21 23:44:28 $ +* $Revision: 1.76 $ * ******************************************************************************* * This is the ant build file for ICU4J. See readme.html for more information. @@ -16,34 +16,44 @@ - - - - - - - - - + + + - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - + + diff --git a/icu4j/src/com/ibm/icu/dev/data/nfscis.spp b/icu4j/src/com/ibm/icu/dev/data/nfscis.spp new file mode 100644 index 0000000000000000000000000000000000000000..3acb62cfcae096af56069de480c86a4c09888a80 GIT binary patch literal 20714 zcmeI4eRv&JoyX6abLREtzGewhmeUd>K#>4Jf>sR}G-%jH4NI)?A<~qjG;Py1ZBrXG zYS6ewjaZk6U5Q#Af(8f>Ahn^Sr9f!A7_mx(3PFklDH2LqvJy+O-*YF^nfscX)<1am z>CE$)bI$Ml-pe zh}cnaipsw?EUQ7+necHfA*UQs7AcFg7nMc&mD=yLS5ZEtKcoLydsBNyr}`n?)c>wr ztJFv>r4Fo$);1_#Q@*YIK>4xqGd--Iq%1PdSMD$_QSMQ;Dm#=%mEFoy${&>%lu_kx z%3)=KDN(CM`c=$g5q2^=jm=XQv9sB{BPLFwd@9VBl`}! 
ziQNME&)FU99=4V3U{A0iHq4%5e_?-P|AX=$vgC~W*xz_IpTl$ceB~)#z!$K2yqK5r zav4`y#4Gu=yoRsf>tvgMg@04#f!g=@P0{{aqSv28@=bmRu05cS-^B;`?ffp(ALqZB z_VhGHD(7Z~lRwX2mOAWG{yOS!^LJ%SRaL)=*+cC#;7>9e$!2I)GGnE&-uSBVE%i)& zt^OVLJoRGzRP|%(Cykrch3eJnHQKZKN7Nd1rLsz0tJJX5^fS~O)Eo74)wY?&g#J}> z7MNhj9y{lJ{bIE}-b(F|67^PRbn5NAQtig7{8sH%YSjLC{h8XwI!Ee&^vvJ(`p3OB zs9z2V)=(X?d-#*;(^1XaRk=%RR26;CsklQ>_B-QKUo~zuZd2cM+P3Y9>w9Sim}1}6 z@1>kh&zC6`-?P~3sef#pK*;*V-O>x|5e|vFVUB4+pvnq^%GNj#K}C9l0Tmq&9>8TaP0M)G5T9& zOMgIK)fwhM+hgybM9-N?j(7j1c+HiuSTUv!@d)0Y9(mUnbX)50dwh_mhv3MTBT&pd3_yDo_n-K|N>$&7cLe zflcuG4B1JxqdY`$ac00R_5I$+_BeqsD`*4lvAXW4V``U8&HhZd17_Zd-g!<{kK|16M9*ECO4pUoHg>Fj%v>Ce z>v~UDb8THMSC*`nenp&KS-KuGuelL3w{c>+wz8Jaey0AJvWNZQbpGu=JD-{Lq=v0~ zJl9O(r1x)oNI!Gwqe%bhG-CVhY|`mF868{RPBf=X6)8Sv*^bT4_K>J&OI-i9mG|m+ zax~o>Tq{?Wy5{xii`#NC zh_&)&7(b#!O|Qh(_S(o;i5fC{TWa^a86~TE$LjUv)RWnGWsaU(I?)}8-Ig}4B(@%} z;q~v>IWpO)cLxe9lz2hG2>)1 zspUk7FB3hu5?eRX){!`6yicc{yx%KzVmTJE5nWA3&n~4_tVFiFQrl1LHPIqbV*7}- zbLZR5CtkyDJN-@>pQY{H8`bsg_2}ArrQW*ZBVJ~=U3*8*E#q^td!(If<@Fk0clG0Y zlC_ihlrrfhR^Rb!m$ALJMXY}EaX8jaY1=rh*fQQe8dvlU?^pYz^C+9pCH6mdx&&%HAt?>a+7KwWVe1dC$b36Wg7IL`f!o zTpibwE6b#2kHpcmN9MF-dA$8zUryc4HF-QoPnNE(SDIQLtL3gzVx)M@)NOlhQuoLA z*!uC;c&+s9cx$vX_~x11r|m0wpOnVWagx|6o_ooBCby1{;k9)7>~i+@USG-TsmGrx zRx-KQ^W+hyHF-pyco&-T=Yr$QvB8fn@9B-VPuB0R$B^;?#tl(B+kk(Q|aP4p)3%thKGn}ZebQx)lo_V1)S$-mmx zQM=k;eGndk&5>RO3r+t{{~m0W{a!Clv}C39BG1^hgXHk!yYh{_+^L?^mPD_1l+5gUaZL7OL&QrgUyWD^&ZkAA&nncfAkK6 z4tBJA#Kv^x@wHezw-v89yLRgNiT7iVB=ePvxA(T~>>Y`lgA>E)Nmoj%Sj$+6ZR={J z@|9jAb+5Amsr;ssd%Ze-QkI@QJ-y>?$@C6Q7bmeVlNok0NwoG#9IeFCt2O(&?JpC% z2z~;D3PF+kba3s$1K~kJ!b4J$-H=@t9uAM7JqpJBjjo)4M{h z_r$TLmN*eoYozZ9jt0kqV6 zb@NGAo7t&E@>u^;8Y{6|$)&EQ+n?yy?Q`{K zQMfqVzopqRm;?JwL+~_tJLaP-S|1Md(|OzU#wncEh?rC(OjA*Gp6(9 zl^^6|vQ&rF{pyH%0P+R2h?c0Mv`jsyj;Zn!aP*LwxY^h>x9w=zr8-W_X{9=$PHKv# zX_gkyB3ceD*5=S^TBRNRJVTqcx~W5218vcoXsgUaYo%>kJ8joG@H;x4+7?K(%v3jjOtKS;1c3FegUTetOXAQ^aCEJd# zWPR~6NB5XYd8f$VNSV(4n98HOAwDzr+n(L?)?eZ^WIN`w#?d=>@?Ck#zGODBvc%s- 
znOWj`m|54WfqY|rJ0{qnbmEit7SIK{K@aE!eV`xg0{DcW>;?FgK>wobLk;V$?8m41 z3kW&#YXzSe6ntVhvRfI$^~oboD$#LIK6d1pBY#jPqO!+uec;G;=Why8zYCIIvH<>g zFamPWx`p9BI&zB~9m|Wh^Kt#@k)N_cNEV>}pRC9pJ1d?p+tITLbb!qO<6#&N!+03R z6C7YYtQYk@XR1Q-eJ zqTRGdN@ySLr^p|*!65QSxsQ#}{cM6Bkgchm=})PX=`nT7%;!wL;_r`{^=HyZ-ya`u zw(a!Z)AeR@uRHefSvhi9QtaZ?M^>-7DXZV?$lArrvi54lMAJ%0Kr1CVS|t&@n&epu z$+t98Xj!Dl3Xqc2zT6(G>1e$awK7l+DnKQu0@a`nG=N4REt)|KXa#Km_hS5!H6AaS zZr3K4(;PEE=a@OlfGJ45nWr2u^Yu=%Q13O1Xn|Q`mYQX>)T}To%__6ntR=YjNe7U3 z%oflESOw0eO#Z= zCwZfx@g^g{8;l%UYY19z(q-a@zNZM0XHJ7Cra#eSgJ4;1@> z4g>53ItmVgF>n}+g9$K+pKlrnfC!iaazP%L53tHg0l>^D3jkIq*n_eJl!7vuo>C4f zKqaUG)u0yCfqGcFefI zzFuDI>*sa86xr|=-pbo}JKw}R_-5Y8x9~3B&3kw+{$jr$f3d%d5AwZyi0|XWd_N!I z2RLRW^Mg;SimIuW8c-u@jygvbYOb26&QIk*&({m|LVbZ=q!;TYdZ}Kfm+KXJrCz01 z>$Q5FUavRkje3)g71Y}S_K}W#q;COTpd0jnKF|*a0LHElg1rE1tz)f?QcwoUK?SG; zRiGNwf;vzS8bLEaJfj^Tp3#Zl@vSq_JNv`jY<8Mk%r3Lr>@j=IKC|B(Fn5`Q=3aBi z+-DA(`^^#afH`U&G{?-t=D0awPG%CtidZ?;97|ZaR-QHA%C`!vLTiClWEEQ_R;g8H zm0J~7rB!8BTeVi5Rc|#|7+L0rh5KUW!x~AaV;!(Yt%KH>b=VrWCag)H;?sPVFW`&# za(r`q!k6pI^Ue3=`wD!8z6HJ_U$N`WhxPJdy?j_NAJ)rP1+Y#&tbz|C@ihaikq>L+ z!y5TIK^N!&eP93#0_*|b5WpVr4FinJHv$fTQE(89fx}=NOn^!Jo0I}H5CAzqfIN^7 z3PBMl0cD^9RDoJh4;nxtXadck1+;=T&<-|%4zL+?f-Rs6bb}tyhqs<3p|apeL`3qk zCjFW};Lq`kP)DfKpXbl_7lsBxgP|dRk-x-W=CANqg+@Z7p)r4LXfmvY1L2&o@HhKg z{q6n^f2Y68-{bG|5BLZDL;hj^h=0^S<{$S@2DCsRkP{Gryg+`SFi;dI36upY0#$+9 zKz*PwR1vBQ)rRUrjiKgHYY15*r>;;>2)RY;!;!JbM7EY4vD=B)a7DJro}XP9+Y_z| z*M{rEjp62SYq&k!5$+6kg?qw%#ysOPrx@aBa-Xj_(xTV%x;G*+b$x>|ya;wnKcMJtBU<9u+@ikBc9%C&bNckNAbS zP24W-5Wf<4io3sbk+_E}759nz#e-t2*d`tl4~re*5%DNlEglz7h&`l1{I~dx_^tSz z_`P_VG>SipXT_h$m&Nnq1@WSINxUpx5wD8Z#OvY>@uql7yev{^<=}s&qKe4EG5gyO17QUut%Y>fm|OwHhP2i(`8l+*GO;4gHlb7i!WC=Z$o9f87-!tpe6KE^h&yreujRIUPZ5_pQm4-U!>R2FVV$x39X?^ z=`y;UuB59-4P8Um()Dx$ZK7YH&GbgHjDCyt(f^>|W51@GSU>%V95ej|y^Y>ZmeOCM z|8BCJ-iu5gz<1+q^kK4&4$?t3NcZ4;l0HeA=n#F1?xTO8!}J-tpFT%lpfA#w=*#pK z`YL^mzE0nuZ{d8KzC-^(-=$Lu#eaRID!O7SKE@57Jg*@u$tukFVrVSE 
z-w4%^TCBwNnDaWU#|re;W3^U8uOYgQ>+vmX1OA$5Bfe=#jg9d7z_D1f0puVv=}lw3 z3UTG`@kVv>X`IetaYEKBkuR(;s-w9gQa{_yo@IYx&m-~+>_zqxsb(*e>)0!#mc7be zBX#U`vVy&Fbi6Tfr)S8SA9r?E#pcHzV13iEFupV+b4GD=R{3J`Wxj+o;eTSnoxqpz z<$NVyMON}Pd@WzkH}H-82L4t4HB!yLkuc&F{JW%{f1j-8KOhbKhh!c95ozQ%!{d+P z@hAMh_^teB{1^N-{Etm{@L%ye`Q7{;ejmRd-X0{^@vWqmZzI?9he#cNn5^JCN|>@V#Usf0FISr^ny%-}9&WANjNVPx$|vUf?hCm-x&46?l6U-d=;Z z*Wv9AczYAx-a>9~+k1$=6T6x!RT=)@CtWpFAO82J;4$wexs&9MK8AhF9%c8j`>}El zV(qqK^|oQJJcQlyFm}og?9xXddldd3M}#NXZnh_C(K|aEvRF{Tger7WFHCly@Ui=a zpFJpoY^w<4>79+I_epqqpNyyXDR_FHil_H!czVyp)BAKhz0VM5inGKy;#_f_IA2^S zE)o}uOT>r8N5sd(rQ+k_a`^ltd|m;cpN7vX;d3E;eg-~22cK8L=hg7}dHDPSe0~u= zuMuAoi^URABbJI~V!2o;R*5xYtynKMh>hX~@m2Wz8hm~OK5t}O$@T0)+*bzKz%1PP z7FnzeJoAYc>Yky`od@#4`QQR@A^0G;7!-m9;3MFp;A5Z&6k|rSq<=SdS-oBW%tF=iokE3}1`! zlXfw@Eynyys!^}TlfN1|1>hp+)k3cpdbMa@0zL#jjGqlnC@%$Ib#OMys*i=+F=8hxMaIkABSP)0S%0s$Z)G zYZJB2+Kt**w6AO5(!OtKh7PO0YIhp1Yd_a^Xb)+RXnVCMv?sM^v=Qwu+EMKo(-Kzj z!EEhm7H4O%vr(VN&dzXL$Sz{Fj6%)lN|gh z{;bkr#r)5xAK`yjE&i+^!c;X81ti04WSXIA$;^w*E6n$s9}#C6CCXVzE?PD{8cAc9!uLQ70OVw~8Ak8x#5y^T%egvy`qe$H*5qrCN!v zDv7w&9i6zHUn9PURoO4@(5l5UM92nLM4t=^2>*42b_yyWU)5uG1Eq*PCB7Z<;b+h3+?; z!zes`{C}E(4EqS5yVWOV=Mcwrt_PJCo2S1egEo59=XHH1n61y#FBCWEm+8fhroKRX z!uhn1Pct{^SL#>4(YbT;nrbO-5;u6G!T$fazF7b4RIUHb`0(+>4{@7BrV z83&E$^fktQeUpBJezQ@he@*|U{ypO*W1G=nG#fQWg-K1zjF@NY|8CAU&olNI2aGPG z$tW?(^q*lBr}b$*Ra?I=Ise=({#2^H^AsbO)ns#^?{aofy634$-st|z{-kSaJ(@Xj zgyZ=d*FhM>K^~X|=7M}sNXW#I+{aX+DGyW8n>-pk9i2Gp_!_??-RJo6+}Tc^hik8V zyY=X>Q}ZN_pNZ=rS)DkJOek$W!Dkdqd~wp_NV_T~j;^fYNR~q~Mk)3%kup-7W zLQD?3nNC{niOcF+O3Ko95xdjLF795Dw`I{@%h*E|S@!67gL4uer} z6pVpmc$%vL8^pmZFc%bn`A+s$Y4RwnD)eF8RxQBTt$NS^8bMRCt~ctW+JcGcA8A*> z(-_2koyovfcU7boL--gDJFTTjcAWvb;~5w};Bp2w-zyogh` zabtS6s+P-svi{uJJ8^RP_xh4^I+d2^D|;=sU&k)@jAZw%dX)cM8p-`mHo5ejzT8>6 z5#7@B=SFi&uP3>dYN^`Exj7!v^&E-k-?8$aT~Ds27umD&WZ7$epKgABZMR>^Ci9#= z4#&!Wc48=r^UCbsl_r;-jW>En!|lmjCVTN(ZU)I#{tQz`l&i2&rYd2T$VYrrWv_w@mfvwlnwp 
zrEVFPP9(n^-7mS5`lNj|4rq)Qwh$#&j+d-Um{qPEL=q^Q`<{Q|q37YEPziCZ9^Cyd>+pew{M8*Re>}&pZy- z+ASR$x0PI``kPwY^)WT=Oj@o7XN=C+QtO_M)1&-&HOH&lR;7a@apjp~yS9EU$J(#2 z;``ff-IcqgD_3P^3%^FDdZv+P((~3kwfEHG`n^o%!5OVT(@b^c(eKHr`z4Nsm6|gWSJN4p+fwBl?f3g~ z>t3#z^@=x`h z9$QJK5=rS~w$5em_BrKr?6c`+lKb-?Kkl~1kEIfahPq{@_4xFgO$(CVzJQ5Ik2ShX9_dk;4Gb)yPqR z=W65_7{_-p;hX9J{%JUQEufM7nnRKgU$2Vk?@hEP=?_(4NO=I`i5DhjbdK`k;xl)u z$4}=H9Lt!;LBs>xx4Ua*0JYKBw zr{~l2BbbkM_@1hIMZ%H1h@3bQ$&Z)O#m*5yuI?+oC|;>zMT+9J3Hw{=tZuWVOJihh`>XnCca(6Lesy~zenm+zu$|apZTo3s3UppRNLLN zu12E7vvv1yqGXQ~X-JQlnmaQk+&v*7I6viqjHgpPy7WDx#82-6GY4*G?X*=h2Pf zX=bk}t7OY$iDT($Wb>3;BYUsA0@?iLlKZ{7ezKM)ceJ~+Pn0M2XRbKueN&lXHm)ZOFd1mKi#j_=jl(UraU^oXZBmAW}3b2WaPJX_lqZUbW+YqD-#RH@?tWUA1jO% zrN;97fLbSr84odacW-M z)pANPLd$5mI4nlRQ86ZtiE&-iby}p`w1O_uPrjd_b^4NtBYG9B*K28m%0q9Ujd~Mp z(zoi{^cKApk~V$2-mZ7(oqCtvoh@Q={M44-tM|cEzrIHw(D$P@s2|XW^n*Iat{+ly zQI6oRoJRGd`j~!9AI}}fv6whw*NGat-fplP?Iyd~-fC~NTkKZ5&E9Ue+Z}eN-DP*% zJ$A3%XZPECZ2YW9f4RQWd(tJY?kSb(N>SewNsRW?&gmJs68D|6+kWj&ZY{lZ?t|(b zM|xkXB>g)flS@1wlk565u)mYOr4nh z>OX_z$J#g{>hJxSj@IoA*UjJ3prjMKb011)s;`OLCqE|hb>dxj z($BEz$TIo&>@)SI$|!b4_E)gOY7TZ;TZ28kB-p3VC%Rrl!g?{u(-#uS7mcSOhA-5>N@MKn?SLhX7AxQAunK8`Rb&-gCA7pUvld$A)*`Edw7^d*z_mcs4#KrSx`5g{ zJpk7L=>z>>0PF{YU!2(ecoG+FHi$!IyR8$2QveMupb0E0HubI29jhR*2F|(F#HS3ve zHnO?H9!wP|QEY{8P+7#oSZOJ;mJ9VQ>hHfWu%EjDd07 zZ#7_pILHIDfCO^^WqAUDJTOAK{;3iD!>v@35#~E3bh(g3+g~U zXaJ3%2{ePPU|Z^NIkdCC0>}7BP~(S#wstTW*G7V~v{6?Ze*ul`SQ$fBtehbyhI|*XFZuCs0-jhk{ z$^wgcabSs|xox);wfKADdeI;nMU!Y2Tg5ifB3eb8*dACY+Igwy;ANsybct@!BYH(& zpvuq#wT2z2H^PBNBOYir@&en8S%Fr*FtDAM2Re8~po=dFWXXot^9J6?n|L$d%D3?r z-pbqfcHYiAcqi}T-MokQ@;=_r_wWI}pAYf_Q~SGlcRYNQALV2G7#|m!(1k6+A};d8 zEVn8t$`IZngI5Zfqi7O zf;O-nw1ZC21-b#oZuEjafVDQT)@Cs%0i~b}ECl6X5vTx5KqaUFbpY|qCV+Tm3%>KY z#6s`%U)DCO#cH+MtnF62)nRp7T~@c%WA$2nR=>5!8nE_TgVq6S$U0~ZTZgO>>##LC zl_<7m>$Yu&?YNz1&$6XG*Uq;K>_U6KU1S&AC3dM@W-qkM?L~Hly~M7xt89#H>MtAD z#pEx0Fqe*fz#g&>+QarCd&E9$kJ?A=G5eT39?$}Mzz&21@jzZ+RzL>k2J!<1fx^Ii 
z&szZN6~KB0uwDVISD+kVodQ^e07eq116ZQ~)+m5A3bcSW&<;95H|Pb}1A%^kJrEcG z7*}8r8~{V$AQ%RRzz8@DM!``q2FCGw5gmj<9*`g(6oMj90?I%+r~s9q3e*C|JXR5_jMc>IVhypT*w$D}tS#1V z&M_C5OW(lpMe_$4tam3zS1t3|40;(OTg2QwRozMSc3X2M(a4yAXJajRV~4z%?UY|( z56G{w2jwm7A^A1-u>3lEM1F()Qr^ZMliy_fXO@eX$b-r;V- zu^F1x&|5{;uwA5@J%awttYG-TbXtT9>+PS1rJ5RenyHL9bwTrb&$tvwK%z8EcS4=Ni zN9x%x*rRMW8(@1e&+Ew=vKBMG3>wSvw<6W}^sxdnzmn8qJyxT)4r{dzdi9A_+=Ndg zoAFmATkxqwX>5Vl`%cB0^&1Cj57vE&TKR27V){;y0y@ zcs0L;)bX#8b^Pn3o_~XE&|dnWGMXXCDY z4({5sao2tu?%MO@x$-=DfxJ*&Brlei%FE>C@(THG`5yT``F{BU`9b*nFnoRlK0gYd zAA`@U;Pd0~`3d;^Bz#^2pPz!yPs8VD;PbQcb8?wnF017Vxl*o@YvfwFUT%<^5D)ixENdlE(PxdmxCfu4Bi9Y z3*HAxKq+Q(hPva=CT!TLH3ymn>4;FwA5X>#g4}uSY4}&uB5pX5=C|C$S2L1_Lm59F%cjY?L0N)LW+kp5^ zxeh;9>aeqRpL+vj{ z5@(t?tCcA|u^u~7n&?$$>^+>-epC{5R{Bb^Pw8WJt1-*&e5)B(G1a_bHjt@0tDl&S j&{SuYm6~03UZKuT6s+o6Qt0@HbR9G}Sw8m+=&$%cw;Cqr literal 0 HcmV?d00001 diff --git a/icu4j/src/com/ibm/icu/dev/data/nfscss.spp b/icu4j/src/com/ibm/icu/dev/data/nfscss.spp new file mode 100644 index 0000000000000000000000000000000000000000..1660362269e68b965b3f8ab02bc2d9b393e259a0 GIT binary patch literal 13328 zcmeHMU5Hgx6h5fM9MKB`>A^n2}T1@~r@6mG`Q8UA?8=R$X#&Wocch$pgvNIYDt|`XVh2fTXjxd zP#1@+OX`@qs+RSbn$kTz7P)TKYxFujq3_cV=!f;=db8f5x9eSckKU&b>O&5HLm$<% zdR{N+5A~v6(kJy9{k8ti*&i$Wlm6A&KlEQln;X@VS!34gF>{yMU^YTuHDxxLC(LHG z!#o4AdEV@XIh;3R_E!0W)%}n|b}P3obHu%ln!{#R&6&f_f5*H(P&(mOI^4kqZ{93I zR!^8EHE&LtFRBiU=B)X?^3FTevyt71?NBF-W-{xmKD*5?uG8=4ii>udzP+J*sGPJn z+uPLv-E+@*SxwnH?A`9Zfhrs9fod1}P&K$_hu#3O4+3JY(CX|XW|Mu)J(c73Ni}1) zg#5O6rF&$zR&$*4^el`@g#b7x7&^dLpA8812Al2Sz*K?|^;fy8e8;>eTxy{q4N+I)6>0XtV?Wl^w9J z!Kp8wDxX$U<+gHXxhwm%$8q-$u1Cw!^*`Ey$mxa8-R=`JUiM9U%+3Kz_w2h7HvG-) zKCj#3?sdX`Y(KRd>}U3KWZEyq=ha#JjXk&8(?fIRS~crFSq@hNpC9c7`*Uvf_2&gU z=)S42zY%7xwD+sHJKcSU{nP&K-pR)H%bQ3+TKlYwz%OrePqEXd`H)?%3pE?#`LG_p z#b~N&+k^5>JVABkJY==!FY8OMMpFM$V|tM6UuwQ`HF9`G(cG)bUhZE`Yb{@EQ%Kih zhUy_=A3a9Gy@NQ!(w*{|DUGfQv=X+ya#ua91~u?lwaN24=0((j*(;kgte10gF6DLY 
z>GNeT_EWulJjQ1%<~2VJ%62)eaw6_1(Ker}*lVf*R#U!tdhF3$Pg$O4r5-6Y*=B95 zY+G@gettf8f10PJdC78rsDp?sXM8WLgxH@A73#cDAFJe@l!(clQIq3lPioS9vV!t} zwJF!vfmQO3JpjRfaeYF4%Qd`uWKbq4U57_O+!lnAnA4fLurV0uI`&lANw zxmKw~eym1f#A8d{3Bed@81em(A@!+_FwmfCiIFy2%BHQPdf#&Ns5kFI9m&yQ4o*ZdPDl~h)#jxZ`Tq2TqUBsXCU(dpJhZ$&xfP)TJ+V5T zLS#?&Q|xveHPXCrW!zJG>3TmSYozb9WJLYZ${Oj*3i4&X&8Hf$%hxDcz6KYbQ^MFv zO-A!PSqpw%6|zEp*BDfwqkMJgA9DTv&1>)UV|d>%)+)6gX0j&UqK_S^J<$!wfUB3RFilwLCf={Dg~ zm2{GJcp~Uz4D9e^r8<#fDvvm^>bAMAEsJ_StKOp>A%%8koI&+y3gl;;5-xV>jUZ|7{yU$a=xz1Ip%lq zKoS3+N>+uFremB)*3x-l3^~Ewdos+$vm)bMKhO`JM0`#_bM=9Te5xXgnUoH6;4bKK zoT^IWTr2w%pRVyKJZq!G7R71lL#siu>KlL7DC=o8FC<5K}jf3c()oWb`!{0w0% z?zR1#Y0Jk{xh9tNvGIxV$?@q`Tt~(iBGsq;E)b1hi14DFd&T(aTy6{RbZ?@C5uJc* JdXuZW{sD@*kmCRV literal 0 HcmV?d00001 diff --git a/icu4j/src/com/ibm/icu/dev/data/nfsmxp.spp b/icu4j/src/com/ibm/icu/dev/data/nfsmxp.spp new file mode 100644 index 0000000000000000000000000000000000000000..e8146f80ec88f016e1e2e4b77bff9b5f15b9f17c GIT binary patch literal 13880 zcmeHOTZmRw6h7_C=^v3HL&6|%9un)JGA3!jpaD$?5!8}q5=n)5P^y6tyAXDfiUP@C zgb6YXOX#LAf;|`vVGtn%VjyBuXLhD{r^vP!E&{98HUf+dDC;NIL0HK*g=ZJNweXGu> zv+BIMq%I>W-Kr+l-)cr{ovI<-tJBE3O)t?)^)h{r?$@jIYQ0Wx)Wdqa-l<3R9=+G) z-_-l{L48PnqQBH9^eO#~{$8Ke7o7biv%l*e$BiqPW57BH)1=~C`VJ7zO2s<^Ox&1ZH#thm=-Bmei zH>wdk9OA3uPWQ-e1!Z&E)4yy-()BI_*YG%2%n|!c=8c&v?(9a)IXC@o`y%ArwC(dV zw`Z}T7`ff!4YdQ8@l}08&D4HXbZ6@BKz9eaJ8(UBz`k}(-ydhPZvu_Jn{@hFE_v5; z2fMwyJJ9(K*tg+6PTxuQtD*E@dMN#{_TN#5J=R{E-IH5){cm<4azDA}Zg^tGW#6;! 
z+m8UHd+nzYG`#WL3vJ-Z|erJC`r2SDmuO6^J+27`S@6cRztvX-__-x?$ z(f(nl>awprF4#f$ONIT{y&kh)Jglkim+`l_<8Z3d1&+BWU6^K1hk^2t1(dharS2-u z>k~6w?kiSAT1-%B}sX&tU5I}Rjf5TU8PXW#R~BuVjn%ugnI{Zm@Dp7A6cZ)RDsTft*_iv580pw z9;=ple#d$dbzpR7lZJdbC)bL+ragVU?8SajFFzl{b1vqUUk$1@xm)E#+)-w$d}gs% z#sO4Wy=r>LFL{q3Wt@ega?5jf>XqwbwOpGpb(G>5(2rM0Th?)!GCZ@&CMQ&=MAMKB z>lDkGAP2EO8!F^+m|=Zn$s;Kdl6qpEqbg5o(tMIbb;yA)YoQ-jE*YU~$Ix zLxkUftE53}!jZ0Tsak9SPg=-DEcs|@hadv9oCkBjLagLt1`)DKSgc4cF;GPwX$F~N zCh^#YI2j{KPT^E^vS|*oVVz>ClMSy0 zs>}ZIJz_p7hn6&C9 y4f`X4xY7cTSyDChkrif(;+Q7na1zNPU7E_T#ik=gYWuE+I%76wY9?|Pe0s9k|rV00j>Qxpe zB2<2^`PNBUTAnrDFOHR2|IY9$e#HC3U42A#gc{X2Ldc%Nuki#&3cjuCAD)@$>FzjM zzGiVm_|TRyu!eJlJc^6?RTWf-p-8)qmHgRhie{%_%}s@Sh;^w;jJj5G!LQ2$T!?QP zqXxc8pmVeC`APl$ZQn=c<`r z-;ypqj-%=s5J8iKXTq`c*sszDB%^UInW64=bc!+DA;i#5g!wwLR3)84i+4%>*uHOy z`;lTX0xE>c+j3r274>{nzDIS26snPN1o6?vG4bo+GwVgZ65WrOm9=_mfGV2U&sXaK zz6|B-xhQ|fd6{A@RKicP;$yC#d>_os3g}qJc$Se|r+%>-=bCt+=-$PH$?C^Ak>8Q$ z#XMqyW8rlOd;8kOQK0Y6D8GC**8N<01Pr{J0f!MX!gFRFWKfK_Vl?53Epugm!gCcf zu{d_#A&&IeBC8q)48nF~;m&|pZshva&J{iv@V8Vxv%D97-^8>>S8N!6Fq$ui0A~&=^32gTK~jL$DZn6{rUMdKkr)G-@j_*+LisQ I9=LbKe*{SdjQ{`u literal 0 HcmV?d00001 diff --git a/icu4j/src/com/ibm/icu/dev/data/nfsmxs.spp b/icu4j/src/com/ibm/icu/dev/data/nfsmxs.spp new file mode 100644 index 0000000000000000000000000000000000000000..3acb62cfcae096af56069de480c86a4c09888a80 GIT binary patch literal 20714 zcmeI4eRv&JoyX6abLREtzGewhmeUd>K#>4Jf>sR}G-%jH4NI)?A<~qjG;Py1ZBrXG zYS6ewjaZk6U5Q#Af(8f>Ahn^Sr9f!A7_mx(3PFklDH2LqvJy+O-*YF^nfscX)<1am z>CE$)bI$Ml-pe zh}cnaipsw?EUQ7+necHfA*UQs7AcFg7nMc&mD=yLS5ZEtKcoLydsBNyr}`n?)c>wr ztJFv>r4Fo$);1_#Q@*YIK>4xqGd--Iq%1PdSMD$_QSMQ;Dm#=%mEFoy${&>%lu_kx z%3)=KDN(CM`c=$g5q2^=jm=XQv9sB{BPLFwd@9VBl`}! 
ziQNME&)FU99=4V3U{A0iHq4%5e_?-P|AX=$vgC~W*xz_IpTl$ceB~)#z!$K2yqK5r zav4`y#4Gu=yoRsf>tvgMg@04#f!g=@P0{{aqSv28@=bmRu05cS-^B;`?ffp(ALqZB z_VhGHD(7Z~lRwX2mOAWG{yOS!^LJ%SRaL)=*+cC#;7>9e$!2I)GGnE&-uSBVE%i)& zt^OVLJoRGzRP|%(Cykrch3eJnHQKZKN7Nd1rLsz0tJJX5^fS~O)Eo74)wY?&g#J}> z7MNhj9y{lJ{bIE}-b(F|67^PRbn5NAQtig7{8sH%YSjLC{h8XwI!Ee&^vvJ(`p3OB zs9z2V)=(X?d-#*;(^1XaRk=%RR26;CsklQ>_B-QKUo~zuZd2cM+P3Y9>w9Sim}1}6 z@1>kh&zC6`-?P~3sef#pK*;*V-O>x|5e|vFVUB4+pvnq^%GNj#K}C9l0Tmq&9>8TaP0M)G5T9& zOMgIK)fwhM+hgybM9-N?j(7j1c+HiuSTUv!@d)0Y9(mUnbX)50dwh_mhv3MTBT&pd3_yDo_n-K|N>$&7cLe zflcuG4B1JxqdY`$ac00R_5I$+_BeqsD`*4lvAXW4V``U8&HhZd17_Zd-g!<{kK|16M9*ECO4pUoHg>Fj%v>Ce z>v~UDb8THMSC*`nenp&KS-KuGuelL3w{c>+wz8Jaey0AJvWNZQbpGu=JD-{Lq=v0~ zJl9O(r1x)oNI!Gwqe%bhG-CVhY|`mF868{RPBf=X6)8Sv*^bT4_K>J&OI-i9mG|m+ zax~o>Tq{?Wy5{xii`#NC zh_&)&7(b#!O|Qh(_S(o;i5fC{TWa^a86~TE$LjUv)RWnGWsaU(I?)}8-Ig}4B(@%} z;q~v>IWpO)cLxe9lz2hG2>)1 zspUk7FB3hu5?eRX){!`6yicc{yx%KzVmTJE5nWA3&n~4_tVFiFQrl1LHPIqbV*7}- zbLZR5CtkyDJN-@>pQY{H8`bsg_2}ArrQW*ZBVJ~=U3*8*E#q^td!(If<@Fk0clG0Y zlC_ihlrrfhR^Rb!m$ALJMXY}EaX8jaY1=rh*fQQe8dvlU?^pYz^C+9pCH6mdx&&%HAt?>a+7KwWVe1dC$b36Wg7IL`f!o zTpibwE6b#2kHpcmN9MF-dA$8zUryc4HF-QoPnNE(SDIQLtL3gzVx)M@)NOlhQuoLA z*!uC;c&+s9cx$vX_~x11r|m0wpOnVWagx|6o_ooBCby1{;k9)7>~i+@USG-TsmGrx zRx-KQ^W+hyHF-pyco&-T=Yr$QvB8fn@9B-VPuB0R$B^;?#tl(B+kk(Q|aP4p)3%thKGn}ZebQx)lo_V1)S$-mmx zQM=k;eGndk&5>RO3r+t{{~m0W{a!Clv}C39BG1^hgXHk!yYh{_+^L?^mPD_1l+5gUaZL7OL&QrgUyWD^&ZkAA&nncfAkK6 z4tBJA#Kv^x@wHezw-v89yLRgNiT7iVB=ePvxA(T~>>Y`lgA>E)Nmoj%Sj$+6ZR={J z@|9jAb+5Amsr;ssd%Ze-QkI@QJ-y>?$@C6Q7bmeVlNok0NwoG#9IeFCt2O(&?JpC% z2z~;D3PF+kba3s$1K~kJ!b4J$-H=@t9uAM7JqpJBjjo)4M{h z_r$TLmN*eoYozZ9jt0kqV6 zb@NGAo7t&E@>u^;8Y{6|$)&EQ+n?yy?Q`{K zQMfqVzopqRm;?JwL+~_tJLaP-S|1Md(|OzU#wncEh?rC(OjA*Gp6(9 zl^^6|vQ&rF{pyH%0P+R2h?c0Mv`jsyj;Zn!aP*LwxY^h>x9w=zr8-W_X{9=$PHKv# zX_gkyB3ceD*5=S^TBRNRJVTqcx~W5218vcoXsgUaYo%>kJ8joG@H;x4+7?K(%v3jjOtKS;1c3FegUTetOXAQ^aCEJd# zWPR~6NB5XYd8f$VNSV(4n98HOAwDzr+n(L?)?eZ^WIN`w#?d=>@?Ck#zGODBvc%s- 
znOWj`m|54WfqY|rJ0{qnbmEit7SIK{K@aE!eV`xg0{DcW>;?FgK>wobLk;V$?8m41 z3kW&#YXzSe6ntVhvRfI$^~oboD$#LIK6d1pBY#jPqO!+uec;G;=Why8zYCIIvH<>g zFamPWx`p9BI&zB~9m|Wh^Kt#@k)N_cNEV>}pRC9pJ1d?p+tITLbb!qO<6#&N!+03R z6C7YYtQYk@XR1Q-eJ zqTRGdN@ySLr^p|*!65QSxsQ#}{cM6Bkgchm=})PX=`nT7%;!wL;_r`{^=HyZ-ya`u zw(a!Z)AeR@uRHefSvhi9QtaZ?M^>-7DXZV?$lArrvi54lMAJ%0Kr1CVS|t&@n&epu z$+t98Xj!Dl3Xqc2zT6(G>1e$awK7l+DnKQu0@a`nG=N4REt)|KXa#Km_hS5!H6AaS zZr3K4(;PEE=a@OlfGJ45nWr2u^Yu=%Q13O1Xn|Q`mYQX>)T}To%__6ntR=YjNe7U3 z%oflESOw0eO#Z= zCwZfx@g^g{8;l%UYY19z(q-a@zNZM0XHJ7Cra#eSgJ4;1@> z4g>53ItmVgF>n}+g9$K+pKlrnfC!iaazP%L53tHg0l>^D3jkIq*n_eJl!7vuo>C4f zKqaUG)u0yCfqGcFefI zzFuDI>*sa86xr|=-pbo}JKw}R_-5Y8x9~3B&3kw+{$jr$f3d%d5AwZyi0|XWd_N!I z2RLRW^Mg;SimIuW8c-u@jygvbYOb26&QIk*&({m|LVbZ=q!;TYdZ}Kfm+KXJrCz01 z>$Q5FUavRkje3)g71Y}S_K}W#q;COTpd0jnKF|*a0LHElg1rE1tz)f?QcwoUK?SG; zRiGNwf;vzS8bLEaJfj^Tp3#Zl@vSq_JNv`jY<8Mk%r3Lr>@j=IKC|B(Fn5`Q=3aBi z+-DA(`^^#afH`U&G{?-t=D0awPG%CtidZ?;97|ZaR-QHA%C`!vLTiClWEEQ_R;g8H zm0J~7rB!8BTeVi5Rc|#|7+L0rh5KUW!x~AaV;!(Yt%KH>b=VrWCag)H;?sPVFW`&# za(r`q!k6pI^Ue3=`wD!8z6HJ_U$N`WhxPJdy?j_NAJ)rP1+Y#&tbz|C@ihaikq>L+ z!y5TIK^N!&eP93#0_*|b5WpVr4FinJHv$fTQE(89fx}=NOn^!Jo0I}H5CAzqfIN^7 z3PBMl0cD^9RDoJh4;nxtXadck1+;=T&<-|%4zL+?f-Rs6bb}tyhqs<3p|apeL`3qk zCjFW};Lq`kP)DfKpXbl_7lsBxgP|dRk-x-W=CANqg+@Z7p)r4LXfmvY1L2&o@HhKg z{q6n^f2Y68-{bG|5BLZDL;hj^h=0^S<{$S@2DCsRkP{Gryg+`SFi;dI36upY0#$+9 zKz*PwR1vBQ)rRUrjiKgHYY15*r>;;>2)RY;!;!JbM7EY4vD=B)a7DJro}XP9+Y_z| z*M{rEjp62SYq&k!5$+6kg?qw%#ysOPrx@aBa-Xj_(xTV%x;G*+b$x>|ya;wnKcMJtBU<9u+@ikBc9%C&bNckNAbS zP24W-5Wf<4io3sbk+_E}759nz#e-t2*d`tl4~re*5%DNlEglz7h&`l1{I~dx_^tSz z_`P_VG>SipXT_h$m&Nnq1@WSINxUpx5wD8Z#OvY>@uql7yev{^<=}s&qKe4EG5gyO17QUut%Y>fm|OwHhP2i(`8l+*GO;4gHlb7i!WC=Z$o9f87-!tpe6KE^h&yreujRIUPZ5_pQm4-U!>R2FVV$x39X?^ z=`y;UuB59-4P8Um()Dx$ZK7YH&GbgHjDCyt(f^>|W51@GSU>%V95ej|y^Y>ZmeOCM z|8BCJ-iu5gz<1+q^kK4&4$?t3NcZ4;l0HeA=n#F1?xTO8!}J-tpFT%lpfA#w=*#pK z`YL^mzE0nuZ{d8KzC-^(-=$Lu#eaRID!O7SKE@57Jg*@u$tukFVrVSE 
z-w4%^TCBwNnDaWU#|re;W3^U8uOYgQ>+vmX1OA$5Bfe=#jg9d7z_D1f0puVv=}lw3 z3UTG`@kVv>X`IetaYEKBkuR(;s-w9gQa{_yo@IYx&m-~+>_zqxsb(*e>)0!#mc7be zBX#U`vVy&Fbi6Tfr)S8SA9r?E#pcHzV13iEFupV+b4GD=R{3J`Wxj+o;eTSnoxqpz z<$NVyMON}Pd@WzkH}H-82L4t4HB!yLkuc&F{JW%{f1j-8KOhbKhh!c95ozQ%!{d+P z@hAMh_^teB{1^N-{Etm{@L%ye`Q7{;ejmRd-X0{^@vWqmZzI?9he#cNn5^JCN|>@V#Usf0FISr^ny%-}9&WANjNVPx$|vUf?hCm-x&46?l6U-d=;Z z*Wv9AczYAx-a>9~+k1$=6T6x!RT=)@CtWpFAO82J;4$wexs&9MK8AhF9%c8j`>}El zV(qqK^|oQJJcQlyFm}og?9xXddldd3M}#NXZnh_C(K|aEvRF{Tger7WFHCly@Ui=a zpFJpoY^w<4>79+I_epqqpNyyXDR_FHil_H!czVyp)BAKhz0VM5inGKy;#_f_IA2^S zE)o}uOT>r8N5sd(rQ+k_a`^ltd|m;cpN7vX;d3E;eg-~22cK8L=hg7}dHDPSe0~u= zuMuAoi^URABbJI~V!2o;R*5xYtynKMh>hX~@m2Wz8hm~OK5t}O$@T0)+*bzKz%1PP z7FnzeJoAYc>Yky`od@#4`QQR@A^0G;7!-m9;3MFp;A5Z&6k|rSq<=SdS-oBW%tF=iokE3}1`! zlXfw@Eynyys!^}TlfN1|1>hp+)k3cpdbMa@0zL#jjGqlnC@%$ 0x7f){ + result.append("\\u"); + result.append(hex(ch)); + }else{ + result.append(ch); + } + + } + return result.toString(); + } + public static String prettify(StringBuffer s) { + return prettify(s.toString()); + } private static class ASCIIWriter extends PrintWriter { private Writer w; private StringBuffer buffer = new StringBuffer(); diff --git a/icu4j/src/com/ibm/icu/dev/test/TestUtil.java b/icu4j/src/com/ibm/icu/dev/test/TestUtil.java index 41f90926235..5dbd567ce18 100644 --- a/icu4j/src/com/ibm/icu/dev/test/TestUtil.java +++ b/icu4j/src/com/ibm/icu/dev/test/TestUtil.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/TestUtil.java,v $ - * $Date: 2003/06/03 18:49:28 $ - * $Revision: 1.5 $ + * $Date: 2003/08/21 23:42:03 $ + * $Revision: 1.6 $ * ******************************************************************************* */ @@ -14,8 +14,10 @@ package com.ibm.icu.dev.test; import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; +import 
java.io.InputStream; public class TestUtil { /** @@ -86,6 +88,14 @@ public class TestUtil { public static final BufferedReader getDataReader(String name) throws IOException { return getDataReader(name, 1024); } + /** + * Return an input stream on the data file at path 'name' rooted at the data path + */ + public static final InputStream getDataStream(String name) throws IOException{ + File file = getDataFile(name); + FileInputStream st = new FileInputStream(file); + return st; + } static final char DIGITS[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', diff --git a/icu4j/src/com/ibm/icu/dev/test/stringprep/IDNAReference.java b/icu4j/src/com/ibm/icu/dev/test/stringprep/IDNAReference.java new file mode 100644 index 00000000000..68221e7dd41 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/stringprep/IDNAReference.java @@ -0,0 +1,408 @@ +/* + ******************************************************************************* + * Copyright (C) 2003, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + * + * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/stringprep/IDNAReference.java,v $ + * $Date: 2003/08/21 23:42:25 $ + * $Revision: 1.1 $ + * + ******************************************************************************* +*/ +package com.ibm.icu.dev.test.stringprep; + +import com.ibm.icu.text.UCharacterIterator; +import com.ibm.icu.stringprep.ParseException; + +/** + * @author ram + * + * To change the template for this generated type comment go to + * Window>Preferences>Java>Code Generation>Code and Comments + */ +public class IDNAReference { + + private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ; + private static final int ACE_PREFIX_LENGTH = 4; + + private static final int MAX_LABEL_LENGTH = 63; + private static final int HYPHEN = 0x002D; + private static final int CAPITAL_A = 0x0041; + private static final int CAPITAL_Z = 0x005A; + private static final int LOWER_CASE_DELTA = 0x0020; + private static final int FULL_STOP = 0x002E; + + + public static final int DEFAULT = 0x0000; + public static final int ALLOW_UNASSIGNED = 0x0001; + public static final int USE_STD3_RULES = 0x0002; + public static final NamePrepTransform transform = NamePrepTransform.getInstance(); + + private static boolean startsWithPrefix(StringBuffer src){ + boolean startsWithPrefix = true; + + if(src.length() < ACE_PREFIX_LENGTH){ + return false; + } + for(int i=0; i0x007A){ + return false; + } + //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A] + if( (ch==0x002D) || + (0x0030 <= ch && ch <= 0x0039) || + (0x0041 <= ch && ch <= 0x005A) || + (0x0061 <= ch && ch <= 0x007A) + ){ + return true; + } + return false; + } + + public static StringBuffer convertToASCII(String src, int options) + throws ParseException{ + UCharacterIterator iter = UCharacterIterator.getInstance(src); + return convertToASCII(iter,options); + } + public static StringBuffer 
convertToASCII(StringBuffer src, int options) + throws ParseException{ + UCharacterIterator iter = UCharacterIterator.getInstance(src); + return convertToASCII(iter,options); + } + public static StringBuffer convertToASCII(UCharacterIterator srcIter, int options) + throws ParseException{ + + char[] caseFlags = null; + + // the source contains all ascii codepoints + boolean srcIsASCII = true; + // assume the source contains all LDH codepoints + boolean srcIsLDH = true; + + //get the options + boolean useSTD3ASCIIRules = (boolean)((options & USE_STD3_RULES) != 0); + + int failPos = -1; + // step 2 + //StringPrep prep = StringPrep.getNameprepInstance(); + StringBuffer processOut = transform.prepare(srcIter,options); + int poLen = processOut.length(); + StringBuffer dest = new StringBuffer(); + // step 3 & 4 + for(int j=0;j 0x7F){ + srcIsASCII = false; + } + // here we do not assemble surrogates + // since we know that LDH code points + // are in the ASCII range only + if(isLDHChar(ch)==false){ + srcIsLDH = false; + failPos = j; + } + } + + if(useSTD3ASCIIRules == true){ + // verify 3a and 3b + if( srcIsLDH == false /* source contains some non-LDH characters */ + || processOut.charAt(0) == HYPHEN + || processOut.charAt(processOut.length()-1) == HYPHEN){ + + /* populate the parseError struct */ + if(srcIsLDH==false){ + throw new ParseException( "The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR, + processOut.toString(), + (failPos>0) ? (failPos-1) : failPos); + }else if(processOut.charAt(0) == HYPHEN){ + throw new ParseException("The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0); + + }else{ + throw new ParseException("The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR, + processOut.toString(), + (poLen>0) ? 
poLen-1 : poLen); + + } + } + } + if(srcIsASCII){ + dest = processOut; + }else{ + // step 5 : verify the sequence does not begin with ACE prefix + if(!startsWithPrefix(processOut)){ + + //step 6: encode the sequence with punycode + StringBuffer punyout = PunycodeReference.encode(processOut,caseFlags); + + // convert all codepoints to lower case ASCII + StringBuffer lowerOut = toASCIILower(punyout); + + //Step 7: prepend the ACE prefix + dest.append(ACE_PREFIX,0,ACE_PREFIX_LENGTH); + //Step 6: copy the contents in b2 into dest + dest.append(lowerOut); + }else{ + throw new ParseException("The input does not start with the ACE Prefix.", + ParseException.ACE_PREFIX_ERROR,processOut.toString(),0); + } + } + if(dest.length() > MAX_LABEL_LENGTH){ + throw new ParseException("The labels in the input are too long. Length > 64.", + ParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0); + } + return dest; + } + + public static StringBuffer convertIDNtoASCII(UCharacterIterator iter,int options) + throws ParseException{ + return convertIDNToASCII(iter.getText(), options); + } + public static StringBuffer convertIDNtoASCII(StringBuffer str,int options) + throws ParseException{ + return convertIDNToASCII(str.toString(), options); + } + public static StringBuffer convertIDNToASCII(String src,int options) + throws ParseException{ + char[] srcArr = src.toCharArray(); + StringBuffer result = new StringBuffer(); + int sepIndex=0; + int oldSepIndex = 0; + for(;;){ + sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); + UCharacterIterator iter = UCharacterIterator.getInstance(new String(srcArr,oldSepIndex,sepIndex-oldSepIndex)); + result.append(convertToASCII(iter,options)); + if(sepIndex==srcArr.length){ + break; + } + // increment the sepIndex to skip past the separator + sepIndex++; + oldSepIndex = sepIndex; + result.append((char)FULL_STOP); + } + return result; + } + + public static StringBuffer convertToUnicode(String src, int options) + throws ParseException{ + 
UCharacterIterator iter = UCharacterIterator.getInstance(src); + return convertToUnicode(iter,options); + } + /** + * Applies ToUnicode to a single label held in a StringBuffer by wrapping it + * in a UCharacterIterator and delegating to + * convertToUnicode(UCharacterIterator,int). + * @param src the label to convert + * @param options option bits, passed through unchanged + * @return the converted label + * @throws ParseException propagated from the delegate + */ + public static StringBuffer convertToUnicode(StringBuffer src, int options) + throws ParseException{ + UCharacterIterator iter = UCharacterIterator.getInstance(src); + return convertToUnicode(iter,options); + } + /** + * Applies the ToUnicode operation to a single label. + * The label is scanned once to find out whether it is all ASCII and whether + * any ASCII code point fails isLDHChar. Non-ASCII input is run through + * transform.prepare; all-ASCII input is used as is. If the resulting text + * starts with the ACE prefix, the rest is Punycode-decoded, converted back + * with convertToASCII and compared (ASCII case-insensitively) with the + * prefixed text; the decoded text is returned when they match. Otherwise + * the source text is returned unchanged, after the optional STD3 check. + * An empty label without the prefix is returned as is. + * @param iter iterator over the label; read from its current index + * @param options option bits; USE_STD3_RULES enables the STD3 ASCII check + * @return the decoded label, or a copy of the source text when the label + * does not carry the ACE prefix + * @throws ParseException with VERIFICATION_ERROR when the round trip does + * not reproduce the input, with STD3_ASCII_RULES_ERROR when + * USE_STD3_RULES is set and the label has a non-LDH ASCII code + * point or begins or ends with HYPHEN, or as propagated from + * transform.prepare, PunycodeReference.decode or convertToASCII + */ + public static StringBuffer convertToUnicode(UCharacterIterator iter, int options) + throws ParseException{ + + char[] caseFlags = null; + + //get the options + boolean useSTD3ASCIIRules = (boolean)((options & USE_STD3_RULES) != 0); + + // the source contains all ascii codepoints + boolean srcIsASCII = true; + // assume the source contains all LDH codepoints + boolean srcIsLDH = true; + + int failPos = -1; + int ch; + int saveIndex = iter.getIndex(); + // step 1: find out if all the codepoints in src are ASCII + while((ch=iter.next())!= UCharacterIterator.DONE){ + if(ch>0x7F){ + srcIsASCII = false; + }else if(isLDHChar(ch)==false){ + // Latch the flag instead of assigning the result of every + // isLDHChar call: otherwise a trailing LDH character hides an + // earlier violation. Only ASCII code points are subject to + // the STD3 restriction, so non-ASCII input is not flagged. + srcIsLDH = false; + failPos = iter.getIndex(); + } + } + StringBuffer processOut; + + if(srcIsASCII == false){ + // step 2: process the string + iter.setIndex(saveIndex); + processOut = transform.prepare(iter,options); + + }else{ + //just point to source + processOut = new StringBuffer(iter.getText()); + } + // TODO: + // The RFC states that + // + // ToUnicode never fails. If any step fails, then the original input + // is returned immediately in that step. 
+ // + + //step 3: verify ACE Prefix + if(startsWithPrefix(processOut)){ + + //step 4: Remove the ACE Prefix + String temp = processOut.substring(ACE_PREFIX_LENGTH,processOut.length()); + + //step 5: Decode using punycode + StringBuffer decodeOut = PunycodeReference.decode(new StringBuffer(temp),caseFlags); + + //step 6:Apply toASCII + StringBuffer toASCIIOut = convertToASCII(decodeOut, options); + + //step 7: verify + if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){ + throw new ParseException("The verification step prescribed by the RFC 3491 failed", + ParseException.VERIFICATION_ERROR); + } + + //step 8: return output of step 5 + return decodeOut; + + }else{ + // verify that STD3 ASCII rules are satisfied + if(useSTD3ASCIIRules == true){ + // the length guard keeps an empty label (trailing dot, "a..b") + // from raising StringIndexOutOfBoundsException in charAt + if( srcIsLDH == false /* source contains some non-LDH characters */ + || (processOut.length() > 0 + && (processOut.charAt(0) == HYPHEN + || processOut.charAt(processOut.length()-1) == HYPHEN))){ + + if(srcIsLDH==false){ + throw new ParseException("The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR,processOut.toString(), + (failPos>0) ? 
(failPos-1) : failPos); + }else if(processOut.charAt(0) == HYPHEN){ + throw new ParseException("The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR, + processOut.toString(),0); + + }else{ + throw new ParseException("The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR, + processOut.toString(), + processOut.length()); + + } + } + } + // just return the source + return new StringBuffer(iter.getText()); + } + } + /** + * Converts a whole domain name to Unicode by handing the iterator's full + * text (getText()) to convertIDNToUnicode(String,int). + * @param iter iterator over the domain name + * @param options option bits, passed through unchanged + * @return the converted name + * @throws ParseException propagated from convertIDNToUnicode(String,int) + */ + public static StringBuffer convertIDNToUnicode(UCharacterIterator iter, int options) + throws ParseException{ + return convertIDNToUnicode(iter.getText(), options); + } + /** + * Converts a whole domain name to Unicode by handing str.toString() to + * convertIDNToUnicode(String,int). + * @param str the domain name + * @param options option bits, passed through unchanged + * @return the converted name + * @throws ParseException propagated from convertIDNToUnicode(String,int) + */ + public static StringBuffer convertIDNToUnicode(StringBuffer str, int options) + throws ParseException{ + return convertIDNToUnicode(str.toString(), options); + } + /** + * Converts a domain name to Unicode one label at a time. The name is cut at + * each position reported by getSeparatorIndex, every piece is passed to + * convertToUnicode, and the converted pieces are joined with FULL_STOP. + * @param src the domain name + * @param options option bits, passed through to convertToUnicode + * @return the converted labels joined by FULL_STOP + * @throws ParseException propagated from convertToUnicode + */ + public static StringBuffer convertIDNToUnicode(String src, int options) + throws ParseException{ + + char[] srcArr = src.toCharArray(); + StringBuffer result = new StringBuffer(); + int sepIndex=0; + int oldSepIndex=0; + for(;;){ + // NOTE(review): the loop only terminates if getSeparatorIndex returns + // srcArr.length once no separator remains - confirm against that helper + sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); + UCharacterIterator iter = UCharacterIterator.getInstance(new String(srcArr,oldSepIndex,sepIndex-oldSepIndex)); + result.append(convertToUnicode(iter,options)); + if(sepIndex==srcArr.length){ + break; + } + // increment the sepIndex to skip past the separator + sepIndex++; + oldSepIndex = sepIndex; + result.append((char)FULL_STOP); + } + return result; + } + // TODO: optimize + /** + * Compares two domain names by converting each with convertIDNToASCII and + * comparing the results with compareCaseInsensitiveASCII. + * @param s1 first domain name + * @param s2 second domain name + * @param options option bits, passed through to convertIDNToASCII + * @return the result of compareCaseInsensitiveASCII; 0 means equal + * @throws ParseException propagated from convertIDNToASCII + * @throws IllegalArgumentException if either argument is null + */ + public static int compare(StringBuffer s1, StringBuffer s2, int options) + throws ParseException{ + if(s1==null || s2 == null){ + throw new IllegalArgumentException("One of the source buffers is null"); + } + StringBuffer s1Out = convertIDNToASCII(s1.toString(), options); + StringBuffer s2Out = convertIDNToASCII(s2.toString(), options); + return compareCaseInsensitiveASCII(s1Out,s2Out); + } + // TODO: optimize + public static int compare(String s1, String s2, int options) + throws 
ParseException{ + if(s1==null || s2 == null){ + throw new IllegalArgumentException("One of the source buffers is null"); + } + StringBuffer s1Out = convertIDNToASCII(s1, options); + StringBuffer s2Out = convertIDNToASCII(s2, options); + return compareCaseInsensitiveASCII(s1Out,s2Out); + } + // TODO: optimize + /** + * Compares two domain names supplied as iterators. The full text of each + * iterator (getText()) is converted with convertIDNToASCII and the two + * results are compared with compareCaseInsensitiveASCII. + * @param i1 iterator over the first domain name + * @param i2 iterator over the second domain name + * @param options option bits, passed through to convertIDNToASCII + * @return the result of compareCaseInsensitiveASCII + * (presumably 0 when the names are equal - confirm against that helper) + * @throws ParseException propagated from convertIDNToASCII + * @throws IllegalArgumentException if either iterator is null + */ + public static int compare(UCharacterIterator i1, UCharacterIterator i2, int options) + throws ParseException{ + if(i1==null || i2 == null){ + throw new IllegalArgumentException("One of the source buffers is null"); + } + StringBuffer s1Out = convertIDNToASCII(i1.getText(), options); + StringBuffer s2Out = convertIDNToASCII(i2.getText(), options); + return compareCaseInsensitiveASCII(s1Out,s2Out); + } + +} diff --git a/icu4j/src/com/ibm/icu/dev/test/stringprep/IDNA_rules.java b/icu4j/src/com/ibm/icu/dev/test/stringprep/IDNA_rules.java new file mode 100644 index 00000000000..6574acba837 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/stringprep/IDNA_rules.java @@ -0,0 +1,1538 @@ +/* + ******************************************************************************* + * Copyright (C) 2003, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + * + * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/stringprep/Attic/IDNA_rules.java,v $ + * $Date: 2003/08/21 23:42:22 $ + * $Revision: 1.1 $ + * + ******************************************************************************* +*/ +package com.ibm.icu.dev.test.stringprep; + +import com.ibm.icu.impl.ICUListResourceBundle; + +public class IDNA_rules extends ICUListResourceBundle { + + public IDNA_rules () { + super.contents = data; + } + static final Object[][] data = new Object[][] { + { + "LabelSeparatorSet", + "[ .\u3002\uFF0E\uFF61 ]", + }, + { + "CaseMap", + "A > a;"+ + "B > b;"+ + "C > c;"+ + "D > d;"+ + "E > e;"+ + "F > f;"+ + "G > g;"+ + "H > h;"+ + "I > i;"+ + "J > j;"+ + "K > k;"+ + "L > l;"+ + "M > m;"+ + "N > n;"+ + "O > o;"+ + "P > p;"+ + "Q > q;"+ + "R > r;"+ + "S > s;"+ + "T > t;"+ + "U > u;"+ + "V > v;"+ + "W > w;"+ + "X > x;"+ + "Y > y;"+ + "Z > z;"+ + "\\u00B5 > \\u03BC;"+ + "\\u00C0 > \\u00E0;"+ + "\\u00C1 > \\u00E1;"+ + "\\u00C2 > \\u00E2;"+ + "\\u00C3 > \\u00E3;"+ + "\\u00C4 > \\u00E4;"+ + "\\u00C5 > \\u00E5;"+ + "\\u00C6 > \\u00E6;"+ + "\\u00C7 > \\u00E7;"+ + "\\u00C8 > \\u00E8;"+ + "\\u00C9 > \\u00E9;"+ + "\\u00CA > \\u00EA;"+ + "\\u00CB > \\u00EB;"+ + "\\u00CC > \\u00EC;"+ + "\\u00CD > \\u00ED;"+ + "\\u00CE > \\u00EE;"+ + "\\u00CF > \\u00EF;"+ + "\\u00D0 > \\u00F0;"+ + "\\u00D1 > \\u00F1;"+ + "\\u00D2 > \\u00F2;"+ + "\\u00D3 > \\u00F3;"+ + "\\u00D4 > \\u00F4;"+ + "\\u00D5 > \\u00F5;"+ + "\\u00D6 > \\u00F6;"+ + "\\u00D8 > \\u00F8;"+ + "\\u00D9 > \\u00F9;"+ + "\\u00DA > \\u00FA;"+ + "\\u00DB > \\u00FB;"+ + "\\u00DC > \\u00FC;"+ + "\\u00DD > \\u00FD;"+ + "\\u00DE > \\u00FE;"+ + "\\u00DF > ss;"+ + "\\u0100 > \\u0101;"+ + "\\u0102 > \\u0103;"+ + "\\u0104 > \\u0105;"+ + "\\u0106 > \\u0107;"+ + "\\u0108 > \\u0109;"+ + "\\u010A > \\u010B;"+ + "\\u010C > \\u010D;"+ + "\\u010E > \\u010F;"+ + "\\u0110 > \\u0111;"+ + "\\u0112 > \\u0113;"+ + "\\u0114 > 
\\u0115;"+ + "\\u0116 > \\u0117;"+ + "\\u0118 > \\u0119;"+ + "\\u011A > \\u011B;"+ + "\\u011C > \\u011D;"+ + "\\u011E > \\u011F;"+ + "\\u0120 > \\u0121;"+ + "\\u0122 > \\u0123;"+ + "\\u0124 > \\u0125;"+ + "\\u0126 > \\u0127;"+ + "\\u0128 > \\u0129;"+ + "\\u012A > \\u012B;"+ + "\\u012C > \\u012D;"+ + "\\u012E > \\u012F;"+ + "\\u0130 > i\\u0307;"+ + "\\u0132 > \\u0133;"+ + "\\u0134 > \\u0135;"+ + "\\u0136 > \\u0137;"+ + "\\u0139 > \\u013A;"+ + "\\u013B > \\u013C;"+ + "\\u013D > \\u013E;"+ + "\\u013F > \\u0140;"+ + "\\u0141 > \\u0142;"+ + "\\u0143 > \\u0144;"+ + "\\u0145 > \\u0146;"+ + "\\u0147 > \\u0148;"+ + "\\u0149 > \\u02BCn;"+ + "\\u014A > \\u014B;"+ + "\\u014C > \\u014D;"+ + "\\u014E > \\u014F;"+ + "\\u0150 > \\u0151;"+ + "\\u0152 > \\u0153;"+ + "\\u0154 > \\u0155;"+ + "\\u0156 > \\u0157;"+ + "\\u0158 > \\u0159;"+ + "\\u015A > \\u015B;"+ + "\\u015C > \\u015D;"+ + "\\u015E > \\u015F;"+ + "\\u0160 > \\u0161;"+ + "\\u0162 > \\u0163;"+ + "\\u0164 > \\u0165;"+ + "\\u0166 > \\u0167;"+ + "\\u0168 > \\u0169;"+ + "\\u016A > \\u016B;"+ + "\\u016C > \\u016D;"+ + "\\u016E > \\u016F;"+ + "\\u0170 > \\u0171;"+ + "\\u0172 > \\u0173;"+ + "\\u0174 > \\u0175;"+ + "\\u0176 > \\u0177;"+ + "\\u0178 > \\u00FF;"+ + "\\u0179 > \\u017A;"+ + "\\u017B > \\u017C;"+ + "\\u017D > \\u017E;"+ + "\\u017F > s;"+ + "\\u0181 > \\u0253;"+ + "\\u0182 > \\u0183;"+ + "\\u0184 > \\u0185;"+ + "\\u0186 > \\u0254;"+ + "\\u0187 > \\u0188;"+ + "\\u0189 > \\u0256;"+ + "\\u018A > \\u0257;"+ + "\\u018B > \\u018C;"+ + "\\u018E > \\u01DD;"+ + "\\u018F > \\u0259;"+ + "\\u0190 > \\u025B;"+ + "\\u0191 > \\u0192;"+ + "\\u0193 > \\u0260;"+ + "\\u0194 > \\u0263;"+ + "\\u0196 > \\u0269;"+ + "\\u0197 > \\u0268;"+ + "\\u0198 > \\u0199;"+ + "\\u019C > \\u026F;"+ + "\\u019D > \\u0272;"+ + "\\u019F > \\u0275;"+ + "\\u01A0 > \\u01A1;"+ + "\\u01A2 > \\u01A3;"+ + "\\u01A4 > \\u01A5;"+ + "\\u01A6 > \\u0280;"+ + "\\u01A7 > \\u01A8;"+ + "\\u01A9 > \\u0283;"+ + "\\u01AC > \\u01AD;"+ + "\\u01AE > \\u0288;"+ + "\\u01AF > \\u01B0;"+ 
+ "\\u01B1 > \\u028A;"+ + "\\u01B2 > \\u028B;"+ + "\\u01B3 > \\u01B4;"+ + "\\u01B5 > \\u01B6;"+ + "\\u01B7 > \\u0292;"+ + "\\u01B8 > \\u01B9;"+ + "\\u01BC > \\u01BD;"+ + "\\u01C4 > \\u01C6;"+ + "\\u01C5 > \\u01C6;"+ + "\\u01C7 > \\u01C9;"+ + "\\u01C8 > \\u01C9;"+ + "\\u01CA > \\u01CC;"+ + "\\u01CB > \\u01CC;"+ + "\\u01CD > \\u01CE;"+ + "\\u01CF > \\u01D0;"+ + "\\u01D1 > \\u01D2;"+ + "\\u01D3 > \\u01D4;"+ + "\\u01D5 > \\u01D6;"+ + "\\u01D7 > \\u01D8;"+ + "\\u01D9 > \\u01DA;"+ + "\\u01DB > \\u01DC;"+ + "\\u01DE > \\u01DF;"+ + "\\u01E0 > \\u01E1;"+ + "\\u01E2 > \\u01E3;"+ + "\\u01E4 > \\u01E5;"+ + "\\u01E6 > \\u01E7;"+ + "\\u01E8 > \\u01E9;"+ + "\\u01EA > \\u01EB;"+ + "\\u01EC > \\u01ED;"+ + "\\u01EE > \\u01EF;"+ + "\\u01F0 > j\\u030C;"+ + "\\u01F1 > \\u01F3;"+ + "\\u01F2 > \\u01F3;"+ + "\\u01F4 > \\u01F5;"+ + "\\u01F6 > \\u0195;"+ + "\\u01F7 > \\u01BF;"+ + "\\u01F8 > \\u01F9;"+ + "\\u01FA > \\u01FB;"+ + "\\u01FC > \\u01FD;"+ + "\\u01FE > \\u01FF;"+ + "\\u0200 > \\u0201;"+ + "\\u0202 > \\u0203;"+ + "\\u0204 > \\u0205;"+ + "\\u0206 > \\u0207;"+ + "\\u0208 > \\u0209;"+ + "\\u020A > \\u020B;"+ + "\\u020C > \\u020D;"+ + "\\u020E > \\u020F;"+ + "\\u0210 > \\u0211;"+ + "\\u0212 > \\u0213;"+ + "\\u0214 > \\u0215;"+ + "\\u0216 > \\u0217;"+ + "\\u0218 > \\u0219;"+ + "\\u021A > \\u021B;"+ + "\\u021C > \\u021D;"+ + "\\u021E > \\u021F;"+ + "\\u0220 > \\u019E;"+ + "\\u0222 > \\u0223;"+ + "\\u0224 > \\u0225;"+ + "\\u0226 > \\u0227;"+ + "\\u0228 > \\u0229;"+ + "\\u022A > \\u022B;"+ + "\\u022C > \\u022D;"+ + "\\u022E > \\u022F;"+ + "\\u0230 > \\u0231;"+ + "\\u0232 > \\u0233;"+ + "\\u0345 > \\u03B9;"+ + "\\u037A > \\u0020\\u03B9;"+ + "\\u0386 > \\u03AC;"+ + "\\u0388 > \\u03AD;"+ + "\\u0389 > \\u03AE;"+ + "\\u038A > \\u03AF;"+ + "\\u038C > \\u03CC;"+ + "\\u038E > \\u03CD;"+ + "\\u038F > \\u03CE;"+ + "\\u0390 > \\u03B9\\u0308\\u0301;"+ + "\\u0391 > \\u03B1;"+ + "\\u0392 > \\u03B2;"+ + "\\u0393 > \\u03B3;"+ + "\\u0394 > \\u03B4;"+ + "\\u0395 > \\u03B5;"+ + "\\u0396 > \\u03B6;"+ + 
"\\u0397 > \\u03B7;"+ + "\\u0398 > \\u03B8;"+ + "\\u0399 > \\u03B9;"+ + "\\u039A > \\u03BA;"+ + "\\u039B > \\u03BB;"+ + "\\u039C > \\u03BC;"+ + "\\u039D > \\u03BD;"+ + "\\u039E > \\u03BE;"+ + "\\u039F > \\u03BF;"+ + "\\u03A0 > \\u03C0;"+ + "\\u03A1 > \\u03C1;"+ + "\\u03A3 > \\u03C3;"+ + "\\u03A4 > \\u03C4;"+ + "\\u03A5 > \\u03C5;"+ + "\\u03A6 > \\u03C6;"+ + "\\u03A7 > \\u03C7;"+ + "\\u03A8 > \\u03C8;"+ + "\\u03A9 > \\u03C9;"+ + "\\u03AA > \\u03CA;"+ + "\\u03AB > \\u03CB;"+ + "\\u03B0 > \\u03C5\\u0308\\u0301;"+ + "\\u03C2 > \\u03C3;"+ + "\\u03D0 > \\u03B2;"+ + "\\u03D1 > \\u03B8;"+ + "\\u03D2 > \\u03C5;"+ + "\\u03D3 > \\u03CD;"+ + "\\u03D4 > \\u03CB;"+ + "\\u03D5 > \\u03C6;"+ + "\\u03D6 > \\u03C0;"+ + "\\u03D8 > \\u03D9;"+ + "\\u03DA > \\u03DB;"+ + "\\u03DC > \\u03DD;"+ + "\\u03DE > \\u03DF;"+ + "\\u03E0 > \\u03E1;"+ + "\\u03E2 > \\u03E3;"+ + "\\u03E4 > \\u03E5;"+ + "\\u03E6 > \\u03E7;"+ + "\\u03E8 > \\u03E9;"+ + "\\u03EA > \\u03EB;"+ + "\\u03EC > \\u03ED;"+ + "\\u03EE > \\u03EF;"+ + "\\u03F0 > \\u03BA;"+ + "\\u03F1 > \\u03C1;"+ + "\\u03F2 > \\u03C3;"+ + "\\u03F4 > \\u03B8;"+ + "\\u03F5 > \\u03B5;"+ + "\\u0400 > \\u0450;"+ + "\\u0401 > \\u0451;"+ + "\\u0402 > \\u0452;"+ + "\\u0403 > \\u0453;"+ + "\\u0404 > \\u0454;"+ + "\\u0405 > \\u0455;"+ + "\\u0406 > \\u0456;"+ + "\\u0407 > \\u0457;"+ + "\\u0408 > \\u0458;"+ + "\\u0409 > \\u0459;"+ + "\\u040A > \\u045A;"+ + "\\u040B > \\u045B;"+ + "\\u040C > \\u045C;"+ + "\\u040D > \\u045D;"+ + "\\u040E > \\u045E;"+ + "\\u040F > \\u045F;"+ + "\\u0410 > \\u0430;"+ + "\\u0411 > \\u0431;"+ + "\\u0412 > \\u0432;"+ + "\\u0413 > \\u0433;"+ + "\\u0414 > \\u0434;"+ + "\\u0415 > \\u0435;"+ + "\\u0416 > \\u0436;"+ + "\\u0417 > \\u0437;"+ + "\\u0418 > \\u0438;"+ + "\\u0419 > \\u0439;"+ + "\\u041A > \\u043A;"+ + "\\u041B > \\u043B;"+ + "\\u041C > \\u043C;"+ + "\\u041D > \\u043D;"+ + "\\u041E > \\u043E;"+ + "\\u041F > \\u043F;"+ + "\\u0420 > \\u0440;"+ + "\\u0421 > \\u0441;"+ + "\\u0422 > \\u0442;"+ + "\\u0423 > \\u0443;"+ + "\\u0424 > 
\\u0444;"+ + "\\u0425 > \\u0445;"+ + "\\u0426 > \\u0446;"+ + "\\u0427 > \\u0447;"+ + "\\u0428 > \\u0448;"+ + "\\u0429 > \\u0449;"+ + "\\u042A > \\u044A;"+ + "\\u042B > \\u044B;"+ + "\\u042C > \\u044C;"+ + "\\u042D > \\u044D;"+ + "\\u042E > \\u044E;"+ + "\\u042F > \\u044F;"+ + "\\u0460 > \\u0461;"+ + "\\u0462 > \\u0463;"+ + "\\u0464 > \\u0465;"+ + "\\u0466 > \\u0467;"+ + "\\u0468 > \\u0469;"+ + "\\u046A > \\u046B;"+ + "\\u046C > \\u046D;"+ + "\\u046E > \\u046F;"+ + "\\u0470 > \\u0471;"+ + "\\u0472 > \\u0473;"+ + "\\u0474 > \\u0475;"+ + "\\u0476 > \\u0477;"+ + "\\u0478 > \\u0479;"+ + "\\u047A > \\u047B;"+ + "\\u047C > \\u047D;"+ + "\\u047E > \\u047F;"+ + "\\u0480 > \\u0481;"+ + "\\u048A > \\u048B;"+ + "\\u048C > \\u048D;"+ + "\\u048E > \\u048F;"+ + "\\u0490 > \\u0491;"+ + "\\u0492 > \\u0493;"+ + "\\u0494 > \\u0495;"+ + "\\u0496 > \\u0497;"+ + "\\u0498 > \\u0499;"+ + "\\u049A > \\u049B;"+ + "\\u049C > \\u049D;"+ + "\\u049E > \\u049F;"+ + "\\u04A0 > \\u04A1;"+ + "\\u04A2 > \\u04A3;"+ + "\\u04A4 > \\u04A5;"+ + "\\u04A6 > \\u04A7;"+ + "\\u04A8 > \\u04A9;"+ + "\\u04AA > \\u04AB;"+ + "\\u04AC > \\u04AD;"+ + "\\u04AE > \\u04AF;"+ + "\\u04B0 > \\u04B1;"+ + "\\u04B2 > \\u04B3;"+ + "\\u04B4 > \\u04B5;"+ + "\\u04B6 > \\u04B7;"+ + "\\u04B8 > \\u04B9;"+ + "\\u04BA > \\u04BB;"+ + "\\u04BC > \\u04BD;"+ + "\\u04BE > \\u04BF;"+ + "\\u04C1 > \\u04C2;"+ + "\\u04C3 > \\u04C4;"+ + "\\u04C5 > \\u04C6;"+ + "\\u04C7 > \\u04C8;"+ + "\\u04C9 > \\u04CA;"+ + "\\u04CB > \\u04CC;"+ + "\\u04CD > \\u04CE;"+ + "\\u04D0 > \\u04D1;"+ + "\\u04D2 > \\u04D3;"+ + "\\u04D4 > \\u04D5;"+ + "\\u04D6 > \\u04D7;"+ + "\\u04D8 > \\u04D9;"+ + "\\u04DA > \\u04DB;"+ + "\\u04DC > \\u04DD;"+ + "\\u04DE > \\u04DF;"+ + "\\u04E0 > \\u04E1;"+ + "\\u04E2 > \\u04E3;"+ + "\\u04E4 > \\u04E5;"+ + "\\u04E6 > \\u04E7;"+ + "\\u04E8 > \\u04E9;"+ + "\\u04EA > \\u04EB;"+ + "\\u04EC > \\u04ED;"+ + "\\u04EE > \\u04EF;"+ + "\\u04F0 > \\u04F1;"+ + "\\u04F2 > \\u04F3;"+ + "\\u04F4 > \\u04F5;"+ + "\\u04F8 > \\u04F9;"+ + "\\u0500 > 
\\u0501;"+ + "\\u0502 > \\u0503;"+ + "\\u0504 > \\u0505;"+ + "\\u0506 > \\u0507;"+ + "\\u0508 > \\u0509;"+ + "\\u050A > \\u050B;"+ + "\\u050C > \\u050D;"+ + "\\u050E > \\u050F;"+ + "\\u0531 > \\u0561;"+ + "\\u0532 > \\u0562;"+ + "\\u0533 > \\u0563;"+ + "\\u0534 > \\u0564;"+ + "\\u0535 > \\u0565;"+ + "\\u0536 > \\u0566;"+ + "\\u0537 > \\u0567;"+ + "\\u0538 > \\u0568;"+ + "\\u0539 > \\u0569;"+ + "\\u053A > \\u056A;"+ + "\\u053B > \\u056B;"+ + "\\u053C > \\u056C;"+ + "\\u053D > \\u056D;"+ + "\\u053E > \\u056E;"+ + "\\u053F > \\u056F;"+ + "\\u0540 > \\u0570;"+ + "\\u0541 > \\u0571;"+ + "\\u0542 > \\u0572;"+ + "\\u0543 > \\u0573;"+ + "\\u0544 > \\u0574;"+ + "\\u0545 > \\u0575;"+ + "\\u0546 > \\u0576;"+ + "\\u0547 > \\u0577;"+ + "\\u0548 > \\u0578;"+ + "\\u0549 > \\u0579;"+ + "\\u054A > \\u057A;"+ + "\\u054B > \\u057B;"+ + "\\u054C > \\u057C;"+ + "\\u054D > \\u057D;"+ + "\\u054E > \\u057E;"+ + "\\u054F > \\u057F;"+ + "\\u0550 > \\u0580;"+ + "\\u0551 > \\u0581;"+ + "\\u0552 > \\u0582;"+ + "\\u0553 > \\u0583;"+ + "\\u0554 > \\u0584;"+ + "\\u0555 > \\u0585;"+ + "\\u0556 > \\u0586;"+ + "\\u0587 > \\u0565\\u0582;"+ + "\\u1E00 > \\u1E01;"+ + "\\u1E02 > \\u1E03;"+ + "\\u1E04 > \\u1E05;"+ + "\\u1E06 > \\u1E07;"+ + "\\u1E08 > \\u1E09;"+ + "\\u1E0A > \\u1E0B;"+ + "\\u1E0C > \\u1E0D;"+ + "\\u1E0E > \\u1E0F;"+ + "\\u1E10 > \\u1E11;"+ + "\\u1E12 > \\u1E13;"+ + "\\u1E14 > \\u1E15;"+ + "\\u1E16 > \\u1E17;"+ + "\\u1E18 > \\u1E19;"+ + "\\u1E1A > \\u1E1B;"+ + "\\u1E1C > \\u1E1D;"+ + "\\u1E1E > \\u1E1F;"+ + "\\u1E20 > \\u1E21;"+ + "\\u1E22 > \\u1E23;"+ + "\\u1E24 > \\u1E25;"+ + "\\u1E26 > \\u1E27;"+ + "\\u1E28 > \\u1E29;"+ + "\\u1E2A > \\u1E2B;"+ + "\\u1E2C > \\u1E2D;"+ + "\\u1E2E > \\u1E2F;"+ + "\\u1E30 > \\u1E31;"+ + "\\u1E32 > \\u1E33;"+ + "\\u1E34 > \\u1E35;"+ + "\\u1E36 > \\u1E37;"+ + "\\u1E38 > \\u1E39;"+ + "\\u1E3A > \\u1E3B;"+ + "\\u1E3C > \\u1E3D;"+ + "\\u1E3E > \\u1E3F;"+ + "\\u1E40 > \\u1E41;"+ + "\\u1E42 > \\u1E43;"+ + "\\u1E44 > \\u1E45;"+ + "\\u1E46 > \\u1E47;"+ + "\\u1E48 > 
\\u1E49;"+ + "\\u1E4A > \\u1E4B;"+ + "\\u1E4C > \\u1E4D;"+ + "\\u1E4E > \\u1E4F;"+ + "\\u1E50 > \\u1E51;"+ + "\\u1E52 > \\u1E53;"+ + "\\u1E54 > \\u1E55;"+ + "\\u1E56 > \\u1E57;"+ + "\\u1E58 > \\u1E59;"+ + "\\u1E5A > \\u1E5B;"+ + "\\u1E5C > \\u1E5D;"+ + "\\u1E5E > \\u1E5F;"+ + "\\u1E60 > \\u1E61;"+ + "\\u1E62 > \\u1E63;"+ + "\\u1E64 > \\u1E65;"+ + "\\u1E66 > \\u1E67;"+ + "\\u1E68 > \\u1E69;"+ + "\\u1E6A > \\u1E6B;"+ + "\\u1E6C > \\u1E6D;"+ + "\\u1E6E > \\u1E6F;"+ + "\\u1E70 > \\u1E71;"+ + "\\u1E72 > \\u1E73;"+ + "\\u1E74 > \\u1E75;"+ + "\\u1E76 > \\u1E77;"+ + "\\u1E78 > \\u1E79;"+ + "\\u1E7A > \\u1E7B;"+ + "\\u1E7C > \\u1E7D;"+ + "\\u1E7E > \\u1E7F;"+ + "\\u1E80 > \\u1E81;"+ + "\\u1E82 > \\u1E83;"+ + "\\u1E84 > \\u1E85;"+ + "\\u1E86 > \\u1E87;"+ + "\\u1E88 > \\u1E89;"+ + "\\u1E8A > \\u1E8B;"+ + "\\u1E8C > \\u1E8D;"+ + "\\u1E8E > \\u1E8F;"+ + "\\u1E90 > \\u1E91;"+ + "\\u1E92 > \\u1E93;"+ + "\\u1E94 > \\u1E95;"+ + "\\u1E96 > h\\u0331;"+ + "\\u1E97 > t\\u0308;"+ + "\\u1E98 > w\\u030A;"+ + "\\u1E99 > y\\u030A;"+ + "\\u1E9A > a\\u02BE;"+ + "\\u1E9B > \\u1E61;"+ + "\\u1EA0 > \\u1EA1;"+ + "\\u1EA2 > \\u1EA3;"+ + "\\u1EA4 > \\u1EA5;"+ + "\\u1EA6 > \\u1EA7;"+ + "\\u1EA8 > \\u1EA9;"+ + "\\u1EAA > \\u1EAB;"+ + "\\u1EAC > \\u1EAD;"+ + "\\u1EAE > \\u1EAF;"+ + "\\u1EB0 > \\u1EB1;"+ + "\\u1EB2 > \\u1EB3;"+ + "\\u1EB4 > \\u1EB5;"+ + "\\u1EB6 > \\u1EB7;"+ + "\\u1EB8 > \\u1EB9;"+ + "\\u1EBA > \\u1EBB;"+ + "\\u1EBC > \\u1EBD;"+ + "\\u1EBE > \\u1EBF;"+ + "\\u1EC0 > \\u1EC1;"+ + "\\u1EC2 > \\u1EC3;"+ + "\\u1EC4 > \\u1EC5;"+ + "\\u1EC6 > \\u1EC7;"+ + "\\u1EC8 > \\u1EC9;"+ + "\\u1ECA > \\u1ECB;"+ + "\\u1ECC > \\u1ECD;"+ + "\\u1ECE > \\u1ECF;"+ + "\\u1ED0 > \\u1ED1;"+ + "\\u1ED2 > \\u1ED3;"+ + "\\u1ED4 > \\u1ED5;"+ + "\\u1ED6 > \\u1ED7;"+ + "\\u1ED8 > \\u1ED9;"+ + "\\u1EDA > \\u1EDB;"+ + "\\u1EDC > \\u1EDD;"+ + "\\u1EDE > \\u1EDF;"+ + "\\u1EE0 > \\u1EE1;"+ + "\\u1EE2 > \\u1EE3;"+ + "\\u1EE4 > \\u1EE5;"+ + "\\u1EE6 > \\u1EE7;"+ + "\\u1EE8 > \\u1EE9;"+ + "\\u1EEA > \\u1EEB;"+ + "\\u1EEC > 
\\u1EED;"+ + "\\u1EEE > \\u1EEF;"+ + "\\u1EF0 > \\u1EF1;"+ + "\\u1EF2 > \\u1EF3;"+ + "\\u1EF4 > \\u1EF5;"+ + "\\u1EF6 > \\u1EF7;"+ + "\\u1EF8 > \\u1EF9;"+ + "\\u1F08 > \\u1F00;"+ + "\\u1F09 > \\u1F01;"+ + "\\u1F0A > \\u1F02;"+ + "\\u1F0B > \\u1F03;"+ + "\\u1F0C > \\u1F04;"+ + "\\u1F0D > \\u1F05;"+ + "\\u1F0E > \\u1F06;"+ + "\\u1F0F > \\u1F07;"+ + "\\u1F18 > \\u1F10;"+ + "\\u1F19 > \\u1F11;"+ + "\\u1F1A > \\u1F12;"+ + "\\u1F1B > \\u1F13;"+ + "\\u1F1C > \\u1F14;"+ + "\\u1F1D > \\u1F15;"+ + "\\u1F28 > \\u1F20;"+ + "\\u1F29 > \\u1F21;"+ + "\\u1F2A > \\u1F22;"+ + "\\u1F2B > \\u1F23;"+ + "\\u1F2C > \\u1F24;"+ + "\\u1F2D > \\u1F25;"+ + "\\u1F2E > \\u1F26;"+ + "\\u1F2F > \\u1F27;"+ + "\\u1F38 > \\u1F30;"+ + "\\u1F39 > \\u1F31;"+ + "\\u1F3A > \\u1F32;"+ + "\\u1F3B > \\u1F33;"+ + "\\u1F3C > \\u1F34;"+ + "\\u1F3D > \\u1F35;"+ + "\\u1F3E > \\u1F36;"+ + "\\u1F3F > \\u1F37;"+ + "\\u1F48 > \\u1F40;"+ + "\\u1F49 > \\u1F41;"+ + "\\u1F4A > \\u1F42;"+ + "\\u1F4B > \\u1F43;"+ + "\\u1F4C > \\u1F44;"+ + "\\u1F4D > \\u1F45;"+ + "\\u1F50 > \\u03C5\\u0313;"+ + "\\u1F52 > \\u03C5\\u0313\\u0300;"+ + "\\u1F54 > \\u03C5\\u0313\\u0301;"+ + "\\u1F56 > \\u03C5\\u0313\\u0342;"+ + "\\u1F59 > \\u1F51;"+ + "\\u1F5B > \\u1F53;"+ + "\\u1F5D > \\u1F55;"+ + "\\u1F5F > \\u1F57;"+ + "\\u1F68 > \\u1F60;"+ + "\\u1F69 > \\u1F61;"+ + "\\u1F6A > \\u1F62;"+ + "\\u1F6B > \\u1F63;"+ + "\\u1F6C > \\u1F64;"+ + "\\u1F6D > \\u1F65;"+ + "\\u1F6E > \\u1F66;"+ + "\\u1F6F > \\u1F67;"+ + "\\u1F80 > \\u1F00\\u03B9;"+ + "\\u1F81 > \\u1F01\\u03B9;"+ + "\\u1F82 > \\u1F02\\u03B9;"+ + "\\u1F83 > \\u1F03\\u03B9;"+ + "\\u1F84 > \\u1F04\\u03B9;"+ + "\\u1F85 > \\u1F05\\u03B9;"+ + "\\u1F86 > \\u1F06\\u03B9;"+ + "\\u1F87 > \\u1F07\\u03B9;"+ + "\\u1F88 > \\u1F00\\u03B9;"+ + "\\u1F89 > \\u1F01\\u03B9;"+ + "\\u1F8A > \\u1F02\\u03B9;"+ + "\\u1F8B > \\u1F03\\u03B9;"+ + "\\u1F8C > \\u1F04\\u03B9;"+ + "\\u1F8D > \\u1F05\\u03B9;"+ + "\\u1F8E > \\u1F06\\u03B9;"+ + "\\u1F8F > \\u1F07\\u03B9;"+ + "\\u1F90 > \\u1F20\\u03B9;"+ + "\\u1F91 > 
\\u1F21\\u03B9;"+ + "\\u1F92 > \\u1F22\\u03B9;"+ + "\\u1F93 > \\u1F23\\u03B9;"+ + "\\u1F94 > \\u1F24\\u03B9;"+ + "\\u1F95 > \\u1F25\\u03B9;"+ + "\\u1F96 > \\u1F26\\u03B9;"+ + "\\u1F97 > \\u1F27\\u03B9;"+ + "\\u1F98 > \\u1F20\\u03B9;"+ + "\\u1F99 > \\u1F21\\u03B9;"+ + "\\u1F9A > \\u1F22\\u03B9;"+ + "\\u1F9B > \\u1F23\\u03B9;"+ + "\\u1F9C > \\u1F24\\u03B9;"+ + "\\u1F9D > \\u1F25\\u03B9;"+ + "\\u1F9E > \\u1F26\\u03B9;"+ + "\\u1F9F > \\u1F27\\u03B9;"+ + "\\u1FA0 > \\u1F60\\u03B9;"+ + "\\u1FA1 > \\u1F61\\u03B9;"+ + "\\u1FA2 > \\u1F62\\u03B9;"+ + "\\u1FA3 > \\u1F63\\u03B9;"+ + "\\u1FA4 > \\u1F64\\u03B9;"+ + "\\u1FA5 > \\u1F65\\u03B9;"+ + "\\u1FA6 > \\u1F66\\u03B9;"+ + "\\u1FA7 > \\u1F67\\u03B9;"+ + "\\u1FA8 > \\u1F60\\u03B9;"+ + "\\u1FA9 > \\u1F61\\u03B9;"+ + "\\u1FAA > \\u1F62\\u03B9;"+ + "\\u1FAB > \\u1F63\\u03B9;"+ + "\\u1FAC > \\u1F64\\u03B9;"+ + "\\u1FAD > \\u1F65\\u03B9;"+ + "\\u1FAE > \\u1F66\\u03B9;"+ + "\\u1FAF > \\u1F67\\u03B9;"+ + "\\u1FB2 > \\u1F70\\u03B9;"+ + "\\u1FB3 > \\u03B1\\u03B9;"+ + "\\u1FB4 > \\u03AC\\u03B9;"+ + "\\u1FB6 > \\u03B1\\u0342;"+ + "\\u1FB7 > \\u03B1\\u0342\\u03B9;"+ + "\\u1FB8 > \\u1FB0;"+ + "\\u1FB9 > \\u1FB1;"+ + "\\u1FBA > \\u1F70;"+ + "\\u1FBB > \\u1F71;"+ + "\\u1FBC > \\u03B1\\u03B9;"+ + "\\u1FBE > \\u03B9;"+ + "\\u1FC2 > \\u1F74\\u03B9;"+ + "\\u1FC3 > \\u03B7\\u03B9;"+ + "\\u1FC4 > \\u03AE\\u03B9;"+ + "\\u1FC6 > \\u03B7\\u0342;"+ + "\\u1FC7 > \\u03B7\\u0342\\u03B9;"+ + "\\u1FC8 > \\u1F72;"+ + "\\u1FC9 > \\u1F73;"+ + "\\u1FCA > \\u1F74;"+ + "\\u1FCB > \\u1F75;"+ + "\\u1FCC > \\u03B7\\u03B9;"+ + "\\u1FD2 > \\u03B9\\u0308\\u0300;"+ + "\\u1FD3 > \\u03B9\\u0308\\u0301;"+ + "\\u1FD6 > \\u03B9\\u0342;"+ + "\\u1FD7 > \\u03B9\\u0308\\u0342;"+ + "\\u1FD8 > \\u1FD0;"+ + "\\u1FD9 > \\u1FD1;"+ + "\\u1FDA > \\u1F76;"+ + "\\u1FDB > \\u1F77;"+ + "\\u1FE2 > \\u03C5\\u0308\\u0300;"+ + "\\u1FE3 > \\u03C5\\u0308\\u0301;"+ + "\\u1FE4 > \\u03C1\\u0313;"+ + "\\u1FE6 > \\u03C5\\u0342;"+ + "\\u1FE7 > \\u03C5\\u0308\\u0342;"+ + "\\u1FE8 > \\u1FE0;"+ + 
"\\u1FE9 > \\u1FE1;"+ + "\\u1FEA > \\u1F7A;"+ + "\\u1FEB > \\u1F7B;"+ + "\\u1FEC > \\u1FE5;"+ + "\\u1FF2 > \\u1F7C\\u03B9;"+ + "\\u1FF3 > \\u03C9\\u03B9;"+ + "\\u1FF4 > \\u03CE\\u03B9;"+ + "\\u1FF6 > \\u03C9\\u0342;"+ + "\\u1FF7 > \\u03C9\\u0342\\u03B9;"+ + "\\u1FF8 > \\u1F78;"+ + "\\u1FF9 > \\u1F79;"+ + "\\u1FFA > \\u1F7C;"+ + "\\u1FFB > \\u1F7D;"+ + "\\u1FFC > \\u03C9\\u03B9;"+ + "\\u20A8 > rs;"+ + "\\u2102 > c;"+ + "\\u2103 > \\u00B0c;"+ + "\\u2107 > \\u025B;"+ + "\\u2109 > \\u00B0f;"+ + "\\u210B > h;"+ + "\\u210C > h;"+ + "\\u210D > h;"+ + "\\u2110 > i;"+ + "\\u2111 > i;"+ + "\\u2112 > l;"+ + "\\u2115 > n;"+ + "\\u2116 > no;"+ + "\\u2119 > p;"+ + "\\u211A > q;"+ + "\\u211B > r;"+ + "\\u211C > r;"+ + "\\u211D > r;"+ + "\\u2120 > sm;"+ + "\\u2121 > tel;"+ + "\\u2122 > tm;"+ + "\\u2124 > z;"+ + "\\u2126 > \\u03C9;"+ + "\\u2128 > z;"+ + "\\u212A > k;"+ + "\\u212B > \\u00E5;"+ + "\\u212C > b;"+ + "\\u212D > c;"+ + "\\u2130 > e;"+ + "\\u2131 > f;"+ + "\\u2133 > m;"+ + "\\u213E > \\u03B3;"+ + "\\u213F > \\u03C0;"+ + "\\u2145 > d;"+ + "\\u2160 > \\u2170;"+ + "\\u2161 > \\u2171;"+ + "\\u2162 > \\u2172;"+ + "\\u2163 > \\u2173;"+ + "\\u2164 > \\u2174;"+ + "\\u2165 > \\u2175;"+ + "\\u2166 > \\u2176;"+ + "\\u2167 > \\u2177;"+ + "\\u2168 > \\u2178;"+ + "\\u2169 > \\u2179;"+ + "\\u216A > \\u217A;"+ + "\\u216B > \\u217B;"+ + "\\u216C > \\u217C;"+ + "\\u216D > \\u217D;"+ + "\\u216E > \\u217E;"+ + "\\u216F > \\u217F;"+ + "\\u24B6 > \\u24D0;"+ + "\\u24B7 > \\u24D1;"+ + "\\u24B8 > \\u24D2;"+ + "\\u24B9 > \\u24D3;"+ + "\\u24BA > \\u24D4;"+ + "\\u24BB > \\u24D5;"+ + "\\u24BC > \\u24D6;"+ + "\\u24BD > \\u24D7;"+ + "\\u24BE > \\u24D8;"+ + "\\u24BF > \\u24D9;"+ + "\\u24C0 > \\u24DA;"+ + "\\u24C1 > \\u24DB;"+ + "\\u24C2 > \\u24DC;"+ + "\\u24C3 > \\u24DD;"+ + "\\u24C4 > \\u24DE;"+ + "\\u24C5 > \\u24DF;"+ + "\\u24C6 > \\u24E0;"+ + "\\u24C7 > \\u24E1;"+ + "\\u24C8 > \\u24E2;"+ + "\\u24C9 > \\u24E3;"+ + "\\u24CA > \\u24E4;"+ + "\\u24CB > \\u24E5;"+ + "\\u24CC > \\u24E6;"+ + "\\u24CD > 
\\u24E7;"+ + "\\u24CE > \\u24E8;"+ + "\\u24CF > \\u24E9;"+ + "\\u3371 > hpa;"+ + "\\u3373 > au;"+ + "\\u3375 > ov;"+ + "\\u3380 > pa;"+ + "\\u3381 > na;"+ + "\\u3382 > \\u03BCa;"+ + "\\u3383 > ma;"+ + "\\u3384 > ka;"+ + "\\u3385 > kb;"+ + "\\u3386 > mb;"+ + "\\u3387 > gb;"+ + "\\u338A > pf;"+ + "\\u338B > nf;"+ + "\\u338C > \\u03BCf;"+ + "\\u3390 > hz;"+ + "\\u3391 > khz;"+ + "\\u3392 > mhz;"+ + "\\u3393 > ghz;"+ + "\\u3394 > thz;"+ + "\\u33A9 > pa;"+ + "\\u33AA > kpa;"+ + "\\u33AB > mpa;"+ + "\\u33AC > gpa;"+ + "\\u33B4 > pv;"+ + "\\u33B5 > nv;"+ + "\\u33B6 > \\u03BCv;"+ + "\\u33B7 > mv;"+ + "\\u33B8 > kv;"+ + "\\u33B9 > mv;"+ + "\\u33BA > pw;"+ + "\\u33BB > nw;"+ + "\\u33BC > \\u03BCw;"+ + "\\u33BD > mw;"+ + "\\u33BE > kw;"+ + "\\u33BF > mw;"+ + "\\u33C0 > k\\u03C9;"+ + "\\u33C1 > m\\u03C9;"+ + "\\u33C3 > bq;"+ + "\\u33C6 > c\\u2215kg;"+ + "\\u33C7 > co\\.;"+ + "\\u33C8 > db;"+ + "\\u33C9 > gy;"+ + "\\u33CB > hp;"+ + "\\u33CD > kk;"+ + "\\u33CE > km;"+ + "\\u33D7 > ph;"+ + "\\u33D9 > ppm;"+ + "\\u33DA > pr;"+ + "\\u33DC > sv;"+ + "\\u33DD > wb;"+ + "\\uFB00 > ff;"+ + "\\uFB01 > fi;"+ + "\\uFB02 > fl;"+ + "\\uFB03 > ffi;"+ + "\\uFB04 > ffl;"+ + "\\uFB05 > st;"+ + "\\uFB06 > st;"+ + "\\uFB13 > \\u0574\\u0576;"+ + "\\uFB14 > \\u0574\\u0565;"+ + "\\uFB15 > \\u0574\\u056B;"+ + "\\uFB16 > \\u057E\\u0576;"+ + "\\uFB17 > \\u0574\\u056D;"+ + "\\uFF21 > \\uFF41;"+ + "\\uFF22 > \\uFF42;"+ + "\\uFF23 > \\uFF43;"+ + "\\uFF24 > \\uFF44;"+ + "\\uFF25 > \\uFF45;"+ + "\\uFF26 > \\uFF46;"+ + "\\uFF27 > \\uFF47;"+ + "\\uFF28 > \\uFF48;"+ + "\\uFF29 > \\uFF49;"+ + "\\uFF2A > \\uFF4A;"+ + "\\uFF2B > \\uFF4B;"+ + "\\uFF2C > \\uFF4C;"+ + "\\uFF2D > \\uFF4D;"+ + "\\uFF2E > \\uFF4E;"+ + "\\uFF2F > \\uFF4F;"+ + "\\uFF30 > \\uFF50;"+ + "\\uFF31 > \\uFF51;"+ + "\\uFF32 > \\uFF52;"+ + "\\uFF33 > \\uFF53;"+ + "\\uFF34 > \\uFF54;"+ + "\\uFF35 > \\uFF55;"+ + "\\uFF36 > \\uFF56;"+ + "\\uFF37 > \\uFF57;"+ + "\\uFF38 > \\uFF58;"+ + "\\uFF39 > \\uFF59;"+ + "\\uFF3A > \\uFF5A;"+ + "\\U00010400 > 
\\U00010428;"+ + "\\U00010401 > \\U00010429;"+ + "\\U00010402 > \\U0001042A;"+ + "\\U00010403 > \\U0001042B;"+ + "\\U00010404 > \\U0001042C;"+ + "\\U00010405 > \\U0001042D;"+ + "\\U00010406 > \\U0001042E;"+ + "\\U00010407 > \\U0001042F;"+ + "\\U00010408 > \\U00010430;"+ + "\\U00010409 > \\U00010431;"+ + "\\U0001040A > \\U00010432;"+ + "\\U0001040B > \\U00010433;"+ + "\\U0001040C > \\U00010434;"+ + "\\U0001040D > \\U00010435;"+ + "\\U0001040E > \\U00010436;"+ + "\\U0001040F > \\U00010437;"+ + "\\U00010410 > \\U00010438;"+ + "\\U00010411 > \\U00010439;"+ + "\\U00010412 > \\U0001043A;"+ + "\\U00010413 > \\U0001043B;"+ + "\\U00010414 > \\U0001043C;"+ + "\\U00010415 > \\U0001043D;"+ + "\\U00010416 > \\U0001043E;"+ + "\\U00010417 > \\U0001043F;"+ + "\\U00010418 > \\U00010440;"+ + "\\U00010419 > \\U00010441;"+ + "\\U0001041A > \\U00010442;"+ + "\\U0001041B > \\U00010443;"+ + "\\U0001041C > \\U00010444;"+ + "\\U0001041D > \\U00010445;"+ + "\\U0001041E > \\U00010446;"+ + "\\U0001041F > \\U00010447;"+ + "\\U00010420 > \\U00010448;"+ + "\\U00010421 > \\U00010449;"+ + "\\U00010422 > \\U0001044A;"+ + "\\U00010423 > \\U0001044B;"+ + "\\U00010424 > \\U0001044C;"+ + "\\U00010425 > \\U0001044D;"+ + "\\U0001D400 > a;"+ + "\\U0001D401 > b;"+ + "\\U0001D402 > c;"+ + "\\U0001D403 > d;"+ + "\\U0001D404 > e;"+ + "\\U0001D405 > f;"+ + "\\U0001D406 > g;"+ + "\\U0001D407 > h;"+ + "\\U0001D408 > i;"+ + "\\U0001D409 > j;"+ + "\\U0001D40A > k;"+ + "\\U0001D40B > l;"+ + "\\U0001D40C > m;"+ + "\\U0001D40D > n;"+ + "\\U0001D40E > o;"+ + "\\U0001D40F > p;"+ + "\\U0001D410 > q;"+ + "\\U0001D411 > r;"+ + "\\U0001D412 > s;"+ + "\\U0001D413 > t;"+ + "\\U0001D414 > u;"+ + "\\U0001D415 > v;"+ + "\\U0001D416 > w;"+ + "\\U0001D417 > x;"+ + "\\U0001D418 > y;"+ + "\\U0001D419 > z;"+ + "\\U0001D434 > a;"+ + "\\U0001D435 > b;"+ + "\\U0001D436 > c;"+ + "\\U0001D437 > d;"+ + "\\U0001D438 > e;"+ + "\\U0001D439 > f;"+ + "\\U0001D43A > g;"+ + "\\U0001D43B > h;"+ + "\\U0001D43C > i;"+ + "\\U0001D43D > j;"+ + 
"\\U0001D43E > k;"+ + "\\U0001D43F > l;"+ + "\\U0001D440 > m;"+ + "\\U0001D441 > n;"+ + "\\U0001D442 > o;"+ + "\\U0001D443 > p;"+ + "\\U0001D444 > q;"+ + "\\U0001D445 > r;"+ + "\\U0001D446 > s;"+ + "\\U0001D447 > t;"+ + "\\U0001D448 > u;"+ + "\\U0001D449 > v;"+ + "\\U0001D44A > w;"+ + "\\U0001D44B > x;"+ + "\\U0001D44C > y;"+ + "\\U0001D44D > z;"+ + "\\U0001D468 > a;"+ + "\\U0001D469 > b;"+ + "\\U0001D46A > c;"+ + "\\U0001D46B > d;"+ + "\\U0001D46C > e;"+ + "\\U0001D46D > f;"+ + "\\U0001D46E > g;"+ + "\\U0001D46F > h;"+ + "\\U0001D470 > i;"+ + "\\U0001D471 > j;"+ + "\\U0001D472 > k;"+ + "\\U0001D473 > l;"+ + "\\U0001D474 > m;"+ + "\\U0001D475 > n;"+ + "\\U0001D476 > o;"+ + "\\U0001D477 > p;"+ + "\\U0001D478 > q;"+ + "\\U0001D479 > r;"+ + "\\U0001D47A > s;"+ + "\\U0001D47B > t;"+ + "\\U0001D47C > u;"+ + "\\U0001D47D > v;"+ + "\\U0001D47E > w;"+ + "\\U0001D47F > x;"+ + "\\U0001D480 > y;"+ + "\\U0001D481 > z;"+ + "\\U0001D49C > a;"+ + "\\U0001D49E > c;"+ + "\\U0001D49F > d;"+ + "\\U0001D4A2 > g;"+ + "\\U0001D4A5 > j;"+ + "\\U0001D4A6 > k;"+ + "\\U0001D4A9 > n;"+ + "\\U0001D4AA > o;"+ + "\\U0001D4AB > p;"+ + "\\U0001D4AC > q;"+ + "\\U0001D4AE > s;"+ + "\\U0001D4AF > t;"+ + "\\U0001D4B0 > u;"+ + "\\U0001D4B1 > v;"+ + "\\U0001D4B2 > w;"+ + "\\U0001D4B3 > x;"+ + "\\U0001D4B4 > y;"+ + "\\U0001D4B5 > z;"+ + "\\U0001D4D0 > a;"+ + "\\U0001D4D1 > b;"+ + "\\U0001D4D2 > c;"+ + "\\U0001D4D3 > d;"+ + "\\U0001D4D4 > e;"+ + "\\U0001D4D5 > f;"+ + "\\U0001D4D6 > g;"+ + "\\U0001D4D7 > h;"+ + "\\U0001D4D8 > i;"+ + "\\U0001D4D9 > j;"+ + "\\U0001D4DA > k;"+ + "\\U0001D4DB > l;"+ + "\\U0001D4DC > m;"+ + "\\U0001D4DD > n;"+ + "\\U0001D4DE > o;"+ + "\\U0001D4DF > p;"+ + "\\U0001D4E0 > q;"+ + "\\U0001D4E1 > r;"+ + "\\U0001D4E2 > s;"+ + "\\U0001D4E3 > t;"+ + "\\U0001D4E4 > u;"+ + "\\U0001D4E5 > v;"+ + "\\U0001D4E6 > w;"+ + "\\U0001D4E7 > x;"+ + "\\U0001D4E8 > y;"+ + "\\U0001D4E9 > z;"+ + "\\U0001D504 > a;"+ + "\\U0001D505 > b;"+ + "\\U0001D507 > d;"+ + "\\U0001D508 > e;"+ + "\\U0001D509 > f;"+ 
+ "\\U0001D50A > g;"+ + "\\U0001D50D > j;"+ + "\\U0001D50E > k;"+ + "\\U0001D50F > l;"+ + "\\U0001D510 > m;"+ + "\\U0001D511 > n;"+ + "\\U0001D512 > o;"+ + "\\U0001D513 > p;"+ + "\\U0001D514 > q;"+ + "\\U0001D516 > s;"+ + "\\U0001D517 > t;"+ + "\\U0001D518 > u;"+ + "\\U0001D519 > v;"+ + "\\U0001D51A > w;"+ + "\\U0001D51B > x;"+ + "\\U0001D51C > y;"+ + "\\U0001D538 > a;"+ + "\\U0001D539 > b;"+ + "\\U0001D53B > d;"+ + "\\U0001D53C > e;"+ + "\\U0001D53D > f;"+ + "\\U0001D53E > g;"+ + "\\U0001D540 > i;"+ + "\\U0001D541 > j;"+ + "\\U0001D542 > k;"+ + "\\U0001D543 > l;"+ + "\\U0001D544 > m;"+ + "\\U0001D546 > o;"+ + "\\U0001D54A > s;"+ + "\\U0001D54B > t;"+ + "\\U0001D54C > u;"+ + "\\U0001D54D > v;"+ + "\\U0001D54E > w;"+ + "\\U0001D54F > x;"+ + "\\U0001D550 > y;"+ + "\\U0001D56C > a;"+ + "\\U0001D56D > b;"+ + "\\U0001D56E > c;"+ + "\\U0001D56F > d;"+ + "\\U0001D570 > e;"+ + "\\U0001D571 > f;"+ + "\\U0001D572 > g;"+ + "\\U0001D573 > h;"+ + "\\U0001D574 > i;"+ + "\\U0001D575 > j;"+ + "\\U0001D576 > k;"+ + "\\U0001D577 > l;"+ + "\\U0001D578 > m;"+ + "\\U0001D579 > n;"+ + "\\U0001D57A > o;"+ + "\\U0001D57B > p;"+ + "\\U0001D57C > q;"+ + "\\U0001D57D > r;"+ + "\\U0001D57E > s;"+ + "\\U0001D57F > t;"+ + "\\U0001D580 > u;"+ + "\\U0001D581 > v;"+ + "\\U0001D582 > w;"+ + "\\U0001D583 > x;"+ + "\\U0001D584 > y;"+ + "\\U0001D585 > z;"+ + "\\U0001D5A0 > a;"+ + "\\U0001D5A1 > b;"+ + "\\U0001D5A2 > c;"+ + "\\U0001D5A3 > d;"+ + "\\U0001D5A4 > e;"+ + "\\U0001D5A5 > f;"+ + "\\U0001D5A6 > g;"+ + "\\U0001D5A7 > h;"+ + "\\U0001D5A8 > i;"+ + "\\U0001D5A9 > j;"+ + "\\U0001D5AA > k;"+ + "\\U0001D5AB > l;"+ + "\\U0001D5AC > m;"+ + "\\U0001D5AD > n;"+ + "\\U0001D5AE > o;"+ + "\\U0001D5AF > p;"+ + "\\U0001D5B0 > q;"+ + "\\U0001D5B1 > r;"+ + "\\U0001D5B2 > s;"+ + "\\U0001D5B3 > t;"+ + "\\U0001D5B4 > u;"+ + "\\U0001D5B5 > v;"+ + "\\U0001D5B6 > w;"+ + "\\U0001D5B7 > x;"+ + "\\U0001D5B8 > y;"+ + "\\U0001D5B9 > z;"+ + "\\U0001D5D4 > a;"+ + "\\U0001D5D5 > b;"+ + "\\U0001D5D6 > c;"+ + "\\U0001D5D7 > 
d;"+ + "\\U0001D5D8 > e;"+ + "\\U0001D5D9 > f;"+ + "\\U0001D5DA > g;"+ + "\\U0001D5DB > h;"+ + "\\U0001D5DC > i;"+ + "\\U0001D5DD > j;"+ + "\\U0001D5DE > k;"+ + "\\U0001D5DF > l;"+ + "\\U0001D5E0 > m;"+ + "\\U0001D5E1 > n;"+ + "\\U0001D5E2 > o;"+ + "\\U0001D5E3 > p;"+ + "\\U0001D5E4 > q;"+ + "\\U0001D5E5 > r;"+ + "\\U0001D5E6 > s;"+ + "\\U0001D5E7 > t;"+ + "\\U0001D5E8 > u;"+ + "\\U0001D5E9 > v;"+ + "\\U0001D5EA > w;"+ + "\\U0001D5EB > x;"+ + "\\U0001D5EC > y;"+ + "\\U0001D5ED > z;"+ + "\\U0001D608 > a;"+ + "\\U0001D609 > b;"+ + "\\U0001D60A > c;"+ + "\\U0001D60B > d;"+ + "\\U0001D60C > e;"+ + "\\U0001D60D > f;"+ + "\\U0001D60E > g;"+ + "\\U0001D60F > h;"+ + "\\U0001D610 > i;"+ + "\\U0001D611 > j;"+ + "\\U0001D612 > k;"+ + "\\U0001D613 > l;"+ + "\\U0001D614 > m;"+ + "\\U0001D615 > n;"+ + "\\U0001D616 > o;"+ + "\\U0001D617 > p;"+ + "\\U0001D618 > q;"+ + "\\U0001D619 > r;"+ + "\\U0001D61A > s;"+ + "\\U0001D61B > t;"+ + "\\U0001D61C > u;"+ + "\\U0001D61D > v;"+ + "\\U0001D61E > w;"+ + "\\U0001D61F > x;"+ + "\\U0001D620 > y;"+ + "\\U0001D621 > z;"+ + "\\U0001D63C > a;"+ + "\\U0001D63D > b;"+ + "\\U0001D63E > c;"+ + "\\U0001D63F > d;"+ + "\\U0001D640 > e;"+ + "\\U0001D641 > f;"+ + "\\U0001D642 > g;"+ + "\\U0001D643 > h;"+ + "\\U0001D644 > i;"+ + "\\U0001D645 > j;"+ + "\\U0001D646 > k;"+ + "\\U0001D647 > l;"+ + "\\U0001D648 > m;"+ + "\\U0001D649 > n;"+ + "\\U0001D64A > o;"+ + "\\U0001D64B > p;"+ + "\\U0001D64C > q;"+ + "\\U0001D64D > r;"+ + "\\U0001D64E > s;"+ + "\\U0001D64F > t;"+ + "\\U0001D650 > u;"+ + "\\U0001D651 > v;"+ + "\\U0001D652 > w;"+ + "\\U0001D653 > x;"+ + "\\U0001D654 > y;"+ + "\\U0001D655 > z;"+ + "\\U0001D670 > a;"+ + "\\U0001D671 > b;"+ + "\\U0001D672 > c;"+ + "\\U0001D673 > d;"+ + "\\U0001D674 > e;"+ + "\\U0001D675 > f;"+ + "\\U0001D676 > g;"+ + "\\U0001D677 > h;"+ + "\\U0001D678 > i;"+ + "\\U0001D679 > j;"+ + "\\U0001D67A > k;"+ + "\\U0001D67B > l;"+ + "\\U0001D67C > m;"+ + "\\U0001D67D > n;"+ + "\\U0001D67E > o;"+ + "\\U0001D67F > p;"+ + "\\U0001D680 
> q;"+ + "\\U0001D681 > r;"+ + "\\U0001D682 > s;"+ + "\\U0001D683 > t;"+ + "\\U0001D684 > u;"+ + "\\U0001D685 > v;"+ + "\\U0001D686 > w;"+ + "\\U0001D687 > x;"+ + "\\U0001D688 > y;"+ + "\\U0001D689 > z;"+ + "\\U0001D6A8 > \\u03B1;"+ + "\\U0001D6A9 > \\u03B2;"+ + "\\U0001D6AA > \\u03B3;"+ + "\\U0001D6AB > \\u03B4;"+ + "\\U0001D6AC > \\u03B5;"+ + "\\U0001D6AD > \\u03B6;"+ + "\\U0001D6AE > \\u03B7;"+ + "\\U0001D6AF > \\u03B8;"+ + "\\U0001D6B0 > \\u03B9;"+ + "\\U0001D6B1 > \\u03BA;"+ + "\\U0001D6B2 > \\u03BB;"+ + "\\U0001D6B3 > \\u03BC;"+ + "\\U0001D6B4 > \\u03BD;"+ + "\\U0001D6B5 > \\u03BE;"+ + "\\U0001D6B6 > \\u03BF;"+ + "\\U0001D6B7 > \\u03C0;"+ + "\\U0001D6B8 > \\u03C1;"+ + "\\U0001D6B9 > \\u03B8;"+ + "\\U0001D6BA > \\u03C3;"+ + "\\U0001D6BB > \\u03C4;"+ + "\\U0001D6BC > \\u03C5;"+ + "\\U0001D6BD > \\u03C6;"+ + "\\U0001D6BE > \\u03C7;"+ + "\\U0001D6BF > \\u03C8;"+ + "\\U0001D6C0 > \\u03C9;"+ + "\\U0001D6D3 > \\u03C3;"+ + "\\U0001D6E2 > \\u03B1;"+ + "\\U0001D6E3 > \\u03B2;"+ + "\\U0001D6E4 > \\u03B3;"+ + "\\U0001D6E5 > \\u03B4;"+ + "\\U0001D6E6 > \\u03B5;"+ + "\\U0001D6E7 > \\u03B6;"+ + "\\U0001D6E8 > \\u03B7;"+ + "\\U0001D6E9 > \\u03B8;"+ + "\\U0001D6EA > \\u03B9;"+ + "\\U0001D6EB > \\u03BA;"+ + "\\U0001D6EC > \\u03BB;"+ + "\\U0001D6ED > \\u03BC;"+ + "\\U0001D6EE > \\u03BD;"+ + "\\U0001D6EF > \\u03BE;"+ + "\\U0001D6F0 > \\u03BF;"+ + "\\U0001D6F1 > \\u03C0;"+ + "\\U0001D6F2 > \\u03C1;"+ + "\\U0001D6F3 > \\u03B8;"+ + "\\U0001D6F4 > \\u03C3;"+ + "\\U0001D6F5 > \\u03C4;"+ + "\\U0001D6F6 > \\u03C5;"+ + "\\U0001D6F7 > \\u03C6;"+ + "\\U0001D6F8 > \\u03C7;"+ + "\\U0001D6F9 > \\u03C8;"+ + "\\U0001D6FA > \\u03C9;"+ + "\\U0001D70D > \\u03C3;"+ + "\\U0001D71C > \\u03B1;"+ + "\\U0001D71D > \\u03B2;"+ + "\\U0001D71E > \\u03B3;"+ + "\\U0001D71F > \\u03B4;"+ + "\\U0001D720 > \\u03B5;"+ + "\\U0001D721 > \\u03B6;"+ + "\\U0001D722 > \\u03B7;"+ + "\\U0001D723 > \\u03B8;"+ + "\\U0001D724 > \\u03B9;"+ + "\\U0001D725 > \\u03BA;"+ + "\\U0001D726 > \\u03BB;"+ + "\\U0001D727 > \\u03BC;"+ + 
"\\U0001D728 > \\u03BD;"+ + "\\U0001D729 > \\u03BE;"+ + "\\U0001D72A > \\u03BF;"+ + "\\U0001D72B > \\u03C0;"+ + "\\U0001D72C > \\u03C1;"+ + "\\U0001D72D > \\u03B8;"+ + "\\U0001D72E > \\u03C3;"+ + "\\U0001D72F > \\u03C4;"+ + "\\U0001D730 > \\u03C5;"+ + "\\U0001D731 > \\u03C6;"+ + "\\U0001D732 > \\u03C7;"+ + "\\U0001D733 > \\u03C8;"+ + "\\U0001D734 > \\u03C9;"+ + "\\U0001D747 > \\u03C3;"+ + "\\U0001D756 > \\u03B1;"+ + "\\U0001D757 > \\u03B2;"+ + "\\U0001D758 > \\u03B3;"+ + "\\U0001D759 > \\u03B4;"+ + "\\U0001D75A > \\u03B5;"+ + "\\U0001D75B > \\u03B6;"+ + "\\U0001D75C > \\u03B7;"+ + "\\U0001D75D > \\u03B8;"+ + "\\U0001D75E > \\u03B9;"+ + "\\U0001D75F > \\u03BA;"+ + "\\U0001D760 > \\u03BB;"+ + "\\U0001D761 > \\u03BC;"+ + "\\U0001D762 > \\u03BD;"+ + "\\U0001D763 > \\u03BE;"+ + "\\U0001D764 > \\u03BF;"+ + "\\U0001D765 > \\u03C0;"+ + "\\U0001D766 > \\u03C1;"+ + "\\U0001D767 > \\u03B8;"+ + "\\U0001D768 > \\u03C3;"+ + "\\U0001D769 > \\u03C4;"+ + "\\U0001D76A > \\u03C5;"+ + "\\U0001D76B > \\u03C6;"+ + "\\U0001D76C > \\u03C7;"+ + "\\U0001D76D > \\u03C8;"+ + "\\U0001D76E > \\u03C9;"+ + "\\U0001D781 > \\u03C3;"+ + "\\U0001D790 > \\u03B1;"+ + "\\U0001D791 > \\u03B2;"+ + "\\U0001D792 > \\u03B3;"+ + "\\U0001D793 > \\u03B4;"+ + "\\U0001D794 > \\u03B5;"+ + "\\U0001D795 > \\u03B6;"+ + "\\U0001D796 > \\u03B7;"+ + "\\U0001D797 > \\u03B8;"+ + "\\U0001D798 > \\u03B9;"+ + "\\U0001D799 > \\u03BA;"+ + "\\U0001D79A > \\u03BB;"+ + "\\U0001D79B > \\u03BC;"+ + "\\U0001D79C > \\u03BD;"+ + "\\U0001D79D > \\u03BE;"+ + "\\U0001D79E > \\u03BF;"+ + "\\U0001D79F > \\u03C0;"+ + "\\U0001D7A0 > \\u03C1;"+ + "\\U0001D7A1 > \\u03B8;"+ + "\\U0001D7A2 > \\u03C3;"+ + "\\U0001D7A3 > \\u03C4;"+ + "\\U0001D7A4 > \\u03C5;"+ + "\\U0001D7A5 > \\u03C6;"+ + "\\U0001D7A6 > \\u03C7;"+ + "\\U0001D7A7 > \\u03C8;"+ + "\\U0001D7A8 > \\u03C9;"+ + "\\U0001D7BB > \\u03C3;"+ + "\\U0002F868 > \\U0002136A;"+ + "\\U0002F874 > \\u5F33;"+ + "\\U0002F91F > \\u43AB;"+ + "\\U0002F95F > \\u7AAE;"+ + "\\U0002F9BF > \\u4D57;"+ + // 
generate the characters that are unaffected + "::[:AGE=3.2:]NFKC;" + }, + { + "Map", + "\\u00AD > ;"+ + "\\u034F > ;"+ + "\\u1806 > ;"+ + "\\u180B > ;"+ + "\\u180C > ;"+ + "\\u180D > ;"+ + "\\u200B > ;"+ + "\\u200C > ;"+ + "\\u200D > ;"+ + "\\u2060 > ;"+ + "\\uFE00 > ;"+ + "\\uFE01 > ;"+ + "\\uFE02 > ;"+ + "\\uFE03 > ;"+ + "\\uFE04 > ;"+ + "\\uFE05 > ;"+ + "\\uFE06 > ;"+ + "\\uFE07 > ;"+ + "\\uFE08 > ;"+ + "\\uFE09 > ;"+ + "\\uFE0A > ;"+ + "\\uFE0B > ;"+ + "\\uFE0C > ;"+ + "\\uFE0D > ;"+ + "\\uFE0E > ;"+ + "\\uFE0F > ;"+ + "\\uFEFF > ;" + }, + { + "ProhibitedSet", + "[\\u00A0 \\u1680 \\u2000 \\u2001 \\u2002 \\u2003 \\u2004 \\u2005" + + " \\u2006 \\u2007 \\u2008 \\u2009 \\u200A \\u200B \\u202F \\u205F \\u3000" + + " \\u0080-\\u009F \\u06DD \\u070F \\u180E \\u200C \\u200D \\u2028 \\u2029" + + " \\u2060 \\u2061 \\u2062 \\u2063 \\u206A-\\u206F \\uFEFF \\uFFF9-\\uFFFC" + + " \\U0001D173-\\U0001D17A \\uE000-\\uF8FF \\U000F0000-\\U000FFFFD " + + "\\U00100000-\\U0010FFFD \\uFDD0-\\uFDEF \\uFFFE-\\uFFFF \\U0001FFF" + + "E-\\U0001FFFF \\U0002FFFE-\\U0002FFFF \\U0003FFFE-\\U0003FFFF \\U" + + "0004FFFE-\\U0004FFFF \\U0005FFFE-\\U0005FFFF \\U0006FFFE-\\U0006" + + "FFFF \\U0007FFFE-\\U0007FFFF \\U0008FFFE-\\U0008FFFF \\U0009FFFE" + + "-\\U0009FFFF \\U000AFFFE-\\U000AFFFF \\U000BFFFE-\\U000BFFFF \\U0" + + "00CFFFE-\\U000CFFFF \\U000DFFFE-\\U000DFFFF \\U000EFFFE-\\U000EF" + + "FFF \\U000FFFFE-\\U000FFFFF \\U0010FFFE-\\U0010FFFF \\uD800-\\uDFFF" + + " \\uFFF9 \\uFFFA \\uFFFB \\uFFFC \\uFFFD \\u2FF0-\\u2FFB \\u0340 \\u0341" + + " \\u200E \\u200F \\u202A \\u202B \\u202C \\u202D \\u202E \\u206A \\u206B" + + " \\u206C \\u206D \\u206E \\u206F \\U000E0001 \\U000E0020-\\U000E00" + + "7F ]", + }, + { + "UnassignedSet", + "[ \\u0221 \\u0234-\\u024F \\u02AE-\\u02AF \\u02EF-\\u02FF \\u0350-\\u035F" + + " \\u0370-\\u0373 \\u0376-\\u0379 \\u037B-\\u037D \\u037F-\\u0383 \\u038B" + + " \\u038D \\u03A2 \\u03CF \\u03F7-\\u03FF \\u0487 \\u04CF \\u04F6-\\u04F7" + + " \\u04FA-\\u04FF \\u0510-\\u0530 
\\u0557-\\u0558 \\u0560 \\u0588 \\u058B" + + "-\\u0590 \\u05A2 \\u05BA \\u05C5-\\u05CF \\u05EB-\\u05EF \\u05F5-\\u060B" + + " \\u060D-\\u061A \\u061C-\\u061E \\u0620 \\u063B-\\u063F \\u0656-\\u065F" + + " \\u06EE-\\u06EF \\u06FF \\u070E \\u072D-\\u072F \\u074B-\\u077F \\u07B2" + + "-\\u0900 \\u0904 \\u093A-\\u093B \\u094E-\\u094F \\u0955-\\u0957 \\u0971" + + "-\\u0980 \\u0984 \\u098D-\\u098E \\u0991-\\u0992 \\u09A9 \\u09B1 \\u09B3" + + "-\\u09B5 \\u09BA-\\u09BB \\u09BD \\u09C5-\\u09C6 \\u09C9-\\u09CA \\u09CE" + + "-\\u09D6 \\u09D8-\\u09DB \\u09DE \\u09E4-\\u09E5 \\u09FB-\\u0A01 \\u0A03" + + "-\\u0A04 \\u0A0B-\\u0A0E \\u0A11-\\u0A12 \\u0A29 \\u0A31 \\u0A34 \\u0A37" + + " \\u0A3A-\\u0A3B \\u0A3D \\u0A43-\\u0A46 \\u0A49-\\u0A4A \\u0A4E-\\u0A58" + + " \\u0A5D \\u0A5F-\\u0A65 \\u0A75-\\u0A80 \\u0A84 \\u0A8C \\u0A8E \\u0A92" + + " \\u0AA9 \\u0AB1 \\u0AB4 \\u0ABA-\\u0ABB \\u0AC6 \\u0ACA \\u0ACE-\\u0ACF" + + " \\u0AD1-\\u0ADF \\u0AE1-\\u0AE5 \\u0AF0-\\u0B00 \\u0B04 \\u0B0D-\\u0B0E" + + " \\u0B11-\\u0B12 \\u0B29 \\u0B31 \\u0B34-\\u0B35 \\u0B3A-\\u0B3B \\u0B44" + + "-\\u0B46 \\u0B49-\\u0B4A \\u0B4E-\\u0B55 \\u0B58-\\u0B5B \\u0B5E \\u0B62" + + "-\\u0B65 \\u0B71-\\u0B81 \\u0B84 \\u0B8B-\\u0B8D \\u0B91 \\u0B96-\\u0B98" + + " \\u0B9B \\u0B9D \\u0BA0-\\u0BA2 \\u0BA5-\\u0BA7 \\u0BAB-\\u0BAD \\u0BB6" + + " \\u0BBA-\\u0BBD \\u0BC3-\\u0BC5 \\u0BC9 \\u0BCE-\\u0BD6 \\u0BD8-\\u0BE6" + + " \\u0BF3-\\u0C00 \\u0C04 \\u0C0D \\u0C11 \\u0C29 \\u0C34 \\u0C3A-\\u0C3D" + + " \\u0C45 \\u0C49 \\u0C4E-\\u0C54 \\u0C57-\\u0C5F \\u0C62-\\u0C65 \\u0C70" + + "-\\u0C81 \\u0C84 \\u0C8D \\u0C91 \\u0CA9 \\u0CB4 \\u0CBA-\\u0CBD \\u0CC5" + + " \\u0CC9 \\u0CCE-\\u0CD4 \\u0CD7-\\u0CDD \\u0CDF \\u0CE2-\\u0CE5 \\u0CF0" + + "-\\u0D01 \\u0D04 \\u0D0D \\u0D11 \\u0D29 \\u0D3A-\\u0D3D \\u0D44-\\u0D45" + + " \\u0D49 \\u0D4E-\\u0D56 \\u0D58-\\u0D5F \\u0D62-\\u0D65 \\u0D70-\\u0D81" + + " \\u0D84 \\u0D97-\\u0D99 \\u0DB2 \\u0DBC \\u0DBE-\\u0DBF \\u0DC7-\\u0DC9" + + " \\u0DCB-\\u0DCE \\u0DD5 \\u0DD7 \\u0DE0-\\u0DF1 
\\u0DF5-\\u0E00 \\u0E3B" + + "-\\u0E3E \\u0E5C-\\u0E80 \\u0E83 \\u0E85-\\u0E86 \\u0E89 \\u0E8B-\\u0E8C" + + " \\u0E8E-\\u0E93 \\u0E98 \\u0EA0 \\u0EA4 \\u0EA6 \\u0EA8-\\u0EA9 \\u0EAC" + + " \\u0EBA \\u0EBE-\\u0EBF \\u0EC5 \\u0EC7 \\u0ECE-\\u0ECF \\u0EDA-\\u0EDB" + + " \\u0EDE-\\u0EFF \\u0F48 \\u0F6B-\\u0F70 \\u0F8C-\\u0F8F \\u0F98 \\u0FBD" + + " \\u0FCD-\\u0FCE \\u0FD0-\\u0FFF \\u1022 \\u1028 \\u102B \\u1033-\\u1035" + + " \\u103A-\\u103F \\u105A-\\u109F \\u10C6-\\u10CF \\u10F9-\\u10FA \\u10FC" + + "-\\u10FF \\u115A-\\u115E \\u11A3-\\u11A7 \\u11FA-\\u11FF \\u1207 \\u1247" + + " \\u1249 \\u124E-\\u124F \\u1257 \\u1259 \\u125E-\\u125F \\u1287 \\u1289" + + " \\u128E-\\u128F \\u12AF \\u12B1 \\u12B6-\\u12B7 \\u12BF \\u12C1 \\u12C6" + + "-\\u12C7 \\u12CF \\u12D7 \\u12EF \\u130F \\u1311 \\u1316-\\u1317 \\u131F" + + " \\u1347 \\u135B-\\u1360 \\u137D-\\u139F \\u13F5-\\u1400 \\u1677-\\u167F" + + " \\u169D-\\u169F \\u16F1-\\u16FF \\u170D \\u1715-\\u171F \\u1737-\\u173F" + + " \\u1754-\\u175F \\u176D \\u1771 \\u1774-\\u177F \\u17DD-\\u17DF \\u17EA" + + "-\\u17FF \\u180F \\u181A-\\u181F \\u1878-\\u187F \\u18AA-\\u1DFF \\u1E9C" + + "-\\u1E9F \\u1EFA-\\u1EFF \\u1F16-\\u1F17 \\u1F1E-\\u1F1F \\u1F46-\\u1F47" + + " \\u1F4E-\\u1F4F \\u1F58 \\u1F5A \\u1F5C \\u1F5E \\u1F7E-\\u1F7F \\u1FB5" + + " \\u1FC5 \\u1FD4-\\u1FD5 \\u1FDC \\u1FF0-\\u1FF1 \\u1FF5 \\u1FFF \\u2053" + + "-\\u2056 \\u2058-\\u205E \\u2064-\\u2069 \\u2072-\\u2073 \\u208F-\\u209F" + + " \\u20B2-\\u20CF \\u20EB-\\u20FF \\u213B-\\u213C \\u214C-\\u2152 \\u2184" + + "-\\u218F \\u23CF-\\u23FF \\u2427-\\u243F \\u244B-\\u245F \\u24FF \\u2614" + + "-\\u2615 \\u2618 \\u267E-\\u267F \\u268A-\\u2700 \\u2705 \\u270A-\\u270B" + + " \\u2728 \\u274C \\u274E \\u2753-\\u2755 \\u2757 \\u275F-\\u2760 \\u2795" + + "-\\u2797 \\u27B0 \\u27BF-\\u27CF \\u27EC-\\u27EF \\u2B00-\\u2E7F \\u2E9A" + + " \\u2EF4-\\u2EFF \\u2FD6-\\u2FEF \\u2FFC-\\u2FFF \\u3040 \\u3097-\\u3098" + + " \\u3100-\\u3104 \\u312D-\\u3130 \\u318F \\u31B8-\\u31EF \\u321D-\\u321F" 
+ + " \\u3244-\\u3250 \\u327C-\\u327E \\u32CC-\\u32CF \\u32FF \\u3377-\\u337A" + + " \\u33DE-\\u33DF \\u33FF \\u4DB6-\\u4DFF \\u9FA6-\\u9FFF \\uA48D-\\uA48F" + + " \\uA4C7-\\uABFF \\uD7A4-\\uD7FF \\uFA2E-\\uFA2F \\uFA6B-\\uFAFF \\uFB07" + + "-\\uFB12 \\uFB18-\\uFB1C \\uFB37 \\uFB3D \\uFB3F \\uFB42 \\uFB45 \\uFBB2" + + "-\\uFBD2 \\uFD40-\\uFD4F \\uFD90-\\uFD91 \\uFDC8-\\uFDCF \\uFDFD-\\uFDFF" + + " \\uFE10-\\uFE1F \\uFE24-\\uFE2F \\uFE47-\\uFE48 \\uFE53 \\uFE67 \\uFE6C" + + "-\\uFE6F \\uFE75 \\uFEFD-\\uFEFE \\uFF00 \\uFFBF-\\uFFC1 \\uFFC8-\\uFFC9" + + " \\uFFD0-\\uFFD1 \\uFFD8-\\uFFD9 \\uFFDD-\\uFFDF \\uFFE7 \\uFFEF-\\uFFF8" + + " \\U00010000-\\U000102FF \\U0001031F \\U00010324-\\U0001032F \\U0" + + "001034B-\\U000103FF \\U00010426-\\U00010427 \\U0001044E-\\U0001C" + + "FFF \\U0001D0F6-\\U0001D0FF \\U0001D127-\\U0001D129 \\U0001D1DE-" + + "\\U0001D3FF \\U0001D455 \\U0001D49D \\U0001D4A0-\\U0001D4A1 \\U00" + + "01D4A3-\\U0001D4A4 \\U0001D4A7-\\U0001D4A8 \\U0001D4AD \\U0001D4" + + "BA \\U0001D4BC \\U0001D4C1 \\U0001D4C4 \\U0001D506 \\U0001D50B-\\U" + + "0001D50C \\U0001D515 \\U0001D51D \\U0001D53A \\U0001D53F \\U0001" + + "D545 \\U0001D547-\\U0001D549 \\U0001D551 \\U0001D6A4-\\U0001D6A7" + + " \\U0001D7CA-\\U0001D7CD \\U0001D800-\\U0001FFFD \\U0002A6D7-\\U0" + + "002F7FF \\U0002FA1E-\\U0002FFFD \\U00030000-\\U0003FFFD \\U00040" + + "000-\\U0004FFFD \\U00050000-\\U0005FFFD \\U00060000-\\U0006FFFD " + + "\\U00070000-\\U0007FFFD \\U00080000-\\U0008FFFD \\U00090000-\\U00" + + "09FFFD \\U000A0000-\\U000AFFFD \\U000B0000-\\U000BFFFD \\U000C00" + + "00-\\U000CFFFD \\U000D0000-\\U000DFFFD \\U000E0000 \\U000E0002-\\U" + + "000E001F \\U000E0080-\\U000EFFFD ]", + }, + }; +} diff --git a/icu4j/src/com/ibm/icu/dev/test/stringprep/NFS4StringPrep.java b/icu4j/src/com/ibm/icu/dev/test/stringprep/NFS4StringPrep.java new file mode 100644 index 00000000000..f93fc08a949 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/stringprep/NFS4StringPrep.java @@ -0,0 +1,173 @@ +/* + 
******************************************************************************* + * Copyright (C) 2003, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + * + * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/stringprep/NFS4StringPrep.java,v $ + * $Date: 2003/08/21 23:42:25 $ + * $Revision: 1.1 $ + * + ******************************************************************************* +*/ +package com.ibm.icu.dev.test.stringprep; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; + +import com.ibm.icu.dev.test.TestUtil; +import com.ibm.icu.stringprep.ParseException; +import com.ibm.icu.stringprep.StringPrep; +import com.ibm.icu.text.UCharacterIterator; + +/** + * @author ram + * + * This is a dumb implementation of NFS4 profiles. It is a direct port of + * C code, does not use Object Oriented principles. Quick and Dirty implementation + * for testing. 
+ */ +public final class NFS4StringPrep { + private static final String[] NFS4DataFileNames ={ + "nfscss.spp", + "nfscsi.spp", + "nfscis.spp", + "nfsmxp.spp", + "nfsmxs.spp" + }; + private StringPrep nfscss = null; + private StringPrep nfscsi = null; + private StringPrep nfscis = null; + private StringPrep nfsmxp = null; + private StringPrep nfsmxs = null; + //singleton instance + private static NFS4StringPrep prep = null; + + // we donot synchronize the constructor because we + // know that the constructor is only called from + // getInstance method if and only if the the singleton + // intance is null, which means this constructor is called + // only once + private NFS4StringPrep ()throws IOException{ + + InputStream nfscssFile = TestUtil.getDataStream(NFS4DataFileNames[0]); + nfscss = StringPrep.getInstance(nfscssFile); + nfscssFile.close(); + + InputStream nfscsiFile = TestUtil.getDataStream(NFS4DataFileNames[1]); + nfscsi = StringPrep.getInstance(nfscsiFile); + nfscsiFile.close(); + + InputStream nfscisFile = TestUtil.getDataStream(NFS4DataFileNames[2]); + nfscis = StringPrep.getInstance(nfscisFile); + nfscsiFile.close(); + + InputStream nfsmxpFile = TestUtil.getDataStream(NFS4DataFileNames[3]); + nfsmxp = StringPrep.getInstance(nfsmxpFile); + nfscsiFile.close(); + + InputStream nfsmxsFile = TestUtil.getDataStream(NFS4DataFileNames[4]); + nfsmxs = StringPrep.getInstance(nfsmxsFile); + nfsmxsFile.close(); + + } + + public static synchronized final NFS4StringPrep getInstance() + throws IOException{ + if(prep==null){ + prep = new NFS4StringPrep(); + } + return prep; + } + + private static byte[] prepare(byte[] src, StringPrep prep) + throws ParseException, UnsupportedEncodingException{ + String s = new String(src, "UTF-8"); + UCharacterIterator iter = UCharacterIterator.getInstance(s); + StringBuffer out = prep.prepare(iter,StringPrep.NONE); + return out.toString().getBytes("UTF-8"); + } + + public static byte[] cs_prepare(byte[] src, boolean caseInsensitive) + 
throws IOException, ParseException, UnsupportedEncodingException{ + NFS4StringPrep prep = getInstance(); + if(caseInsensitive){ + return prepare(src, prep.nfscsi); + }else{ + return prepare(src,prep.nfscsi); + } + } + + public static byte[] cis_prepare(byte[] src) + throws IOException, ParseException, UnsupportedEncodingException{ + NFS4StringPrep prep = getInstance(); + return prepare(src, prep.nfscis); + } + + /* sorted array for binary search*/ + private static final String[] special_prefixes={ + "ANONYMOUS", + "AUTHENTICATED", + "BATCH", + "DIALUP", + "EVERYONE", + "GROUP", + "INTERACTIVE", + "NETWORK", + "OWNER", + }; + + + /* binary search the sorted array */ + private static final int findStringIndex(String[] sortedArr,String target){ + + int left, middle, right,rc; + + left =0; + right= sortedArr.length-1; + + while(left <= right){ + middle = (left+right)/2; + rc= sortedArr[middle].compareTo(target); + + if(rc<0){ + left = middle+1; + }else if(rc >0){ + right = middle -1; + }else{ + return middle; + } + } + return -1; + } + private static final char AT_SIGN = '@'; + + public static byte[] mixed_prepare(byte[] src) + throws IOException, ParseException, UnsupportedEncodingException{ + String s = new String(src, "UTF-8"); + int index = s.indexOf(AT_SIGN); + StringBuffer out = new StringBuffer(); + NFS4StringPrep prep = getInstance(); + if(index > -1){ + /* special prefixes must not be followed by suffixes! 
*/ + String prefixString = s.substring(0,index); + int i= findStringIndex(special_prefixes, prefixString); + String suffixString = s.substring(index+1, s.length()); + if(i>-1 && !suffixString.equals("")){ + throw new ParseException("Suffix following a special index", ParseException.INVALID_CHAR_FOUND); + } + UCharacterIterator prefix = UCharacterIterator.getInstance(prefixString); + UCharacterIterator suffix = UCharacterIterator.getInstance(suffixString); + out.append(prep.nfsmxp.prepare(prefix,StringPrep.NONE)); + out.append(AT_SIGN); // add the delimiter + out.append(prep.nfsmxs.prepare(suffix, StringPrep.NONE)); + }else{ + UCharacterIterator iter = UCharacterIterator.getInstance(s); + out.append(prep.nfsmxp.prepare(iter,StringPrep.NONE)); + + } + return out.toString().getBytes("UTF-8"); + } + +} diff --git a/icu4j/src/com/ibm/icu/dev/test/stringprep/NamePrepTransform.java b/icu4j/src/com/ibm/icu/dev/test/stringprep/NamePrepTransform.java new file mode 100644 index 00000000000..9b2c6c7ecd4 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/stringprep/NamePrepTransform.java @@ -0,0 +1,172 @@ +/* + ******************************************************************************* + * Copyright (C) 2003, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + * + * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/stringprep/NamePrepTransform.java,v $ + * $Date: 2003/08/21 23:42:21 $ + * $Revision: 1.1 $ + * + ******************************************************************************* +*/ +package com.ibm.icu.dev.test.stringprep; + +import java.util.ResourceBundle; + +import com.ibm.icu.impl.ICULocaleData; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UCharacterDirection; +import com.ibm.icu.stringprep.ParseException; +import com.ibm.icu.text.UCharacterIterator; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.Transliterator; +
/**
 * Rule-driven string preparation used by the stringprep tests. The mapping
 * rules and the label-separator, prohibited and unassigned sets are read
 * from the bundle obtained with
 * ICULocaleData.getResourceBundle("com.ibm.icu.dev.test.stringprep","IDNA","rules").
 * A single shared instance is created when the class is initialized.
 *
 * @author ram
 */
public class NamePrepTransform {

    /** Option value: no special handling. */
    public static final int NONE = 0;
    /** Option bit: do not reject code points contained in the unassigned set. */
    public static final int ALLOW_UNASSIGNED = 1;

    private static final NamePrepTransform transform = new NamePrepTransform();

    private UnicodeSet labelSeparatorSet;
    private UnicodeSet prohibitedSet;
    private UnicodeSet unassignedSet;
    private Transliterator mapTransform;

    /** Reads the rules bundle and builds the transliterator and the three sets. */
    private NamePrepTransform(){
        ResourceBundle rules = ICULocaleData.getResourceBundle("com.ibm.icu.dev.test.stringprep","IDNA","rules");
        // the "Map" rules come first, followed by the "CaseMap" rules
        String mapRules = rules.getString("Map") + rules.getString("CaseMap");
        mapTransform = Transliterator.createFromRules("CaseMap",mapRules,Transliterator.FORWARD);
        labelSeparatorSet = new UnicodeSet(rules.getString("LabelSeparatorSet"));
        prohibitedSet = new UnicodeSet(rules.getString("ProhibitedSet"));
        unassignedSet = new UnicodeSet(rules.getString("UnassignedSet"));
    }

    /** Returns the shared instance. */
    public static final NamePrepTransform getInstance(){
        return transform;
    }

    /** Returns true if ch is contained in the label separator set. */
    public static boolean isLabelSeparator(int ch){
        return transform.labelSeparatorSet.contains(ch);
    }

    /** True for the bidirectional categories "R" and "AL". */
    private static boolean isRandAL(int dir){
        return dir == UCharacterDirection.RIGHT_TO_LEFT
            || dir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC;
    }

    /** Builds the exception thrown for both bidi violations. */
    private static ParseException bidiError(UCharacterIterator iter, int rtlPos, int ltrPos){
        return new ParseException("The input does not conform to the rules for BiDi code points.",
                                  ParseException.CHECK_BIDI_ERROR,iter.getText(),Math.max(rtlPos, ltrPos));
    }

    /*
       Processing steps:

       1) Map -- For each character in the input, check if it has a mapping
          and, if so, replace it with its mapping.

       2) Normalize -- Possibly normalize the result of step 1 using Unicode
          normalization.

       3) Prohibit -- Check for any characters that are not allowed in the
          output. If any are found, return an error.

       4) Check bidi -- Possibly check for right-to-left characters, and if
          any are found, make sure that the whole string satisfies the
          requirements for bidirectional strings. If the string does not
          satisfy the requirements for bidirectional strings, return an
          error.

       [Unicode3.2] defines several bidirectional categories; each character
       has one bidirectional category assigned to it. For the purposes of
       the requirements below, an "RandALCat character" is a character that
       has Unicode bidirectional categories "R" or "AL"; an "LCat character"
       is a character that has Unicode bidirectional category "L". Note
       that there are many characters which fall in neither of the above
       definitions; Latin digits (U+0030 through U+0039) are examples of
       this because they have bidirectional category "EN".

       In any profile that specifies bidirectional character handling, all
       three of the following requirements MUST be met:

       1) The characters in section 5.8 MUST be prohibited.

       2) If a string contains any RandALCat character, the string MUST NOT
          contain any LCat character.

       3) If a string contains any RandALCat character, a RandALCat
          character MUST be the first character of the string, and a
          RandALCat character MUST be the last character of the string.
    */

    /**
     * Prepares the text of the given iterator; see prepare(String,int).
     */
    public StringBuffer prepare(UCharacterIterator src,
                                       int options)
                        throws ParseException{
        return prepare(src.getText(),options);
    }

    /**
     * Applies the mapping transliterator to src and, unless ALLOW_UNASSIGNED
     * is set in options, rejects any mapped code point that is contained in
     * the unassigned set.
     *
     * @return the mapped string
     * @throws ParseException with UNASSIGNED_ERROR for an unassigned code point
     */
    private String map ( String src, int options)
                                throws ParseException{
        final boolean rejectUnassigned = (options & ALLOW_UNASSIGNED) == 0;
        final String mapped = mapTransform.transliterate(src);
        final UCharacterIterator cursor = UCharacterIterator.getInstance(mapped);
        for(int cp = cursor.nextCodePoint(); cp != UCharacterIterator.DONE; cp = cursor.nextCodePoint()){
            if(rejectUnassigned && unassignedSet.contains(cp)){
                throw new ParseException("An unassigned code point was found in the input",
                                         ParseException.UNASSIGNED_ERROR);
            }
        }
        return mapped;
    }

    /**
     * Maps src, then checks the mapped text for prohibited code points and
     * for the bidi requirements 2 and 3 listed above.
     *
     * @param src     the text to prepare
     * @param options NONE or ALLOW_UNASSIGNED
     * @return the mapped text
     * @throws ParseException with UNASSIGNED_ERROR, PROHIBITED_ERROR or
     *         CHECK_BIDI_ERROR when the corresponding check fails
     */
    public StringBuffer prepare(String src,int options)
                                   throws ParseException{

        final String mapped = map(src,options);
        final UCharacterIterator cursor = UCharacterIterator.getInstance(mapped);

        // CHAR_DIRECTION_COUNT serves as the "nothing seen yet" marker
        int firstDir = UCharacterDirection.CHAR_DIRECTION_COUNT;
        int lastDir  = UCharacterDirection.CHAR_DIRECTION_COUNT;
        int lastRtlPos = -1;
        int lastLtrPos = -1;
        boolean sawRandAL = false;
        boolean sawL = false;

        for(int cp = cursor.nextCodePoint(); cp != UCharacterIterator.DONE; cp = cursor.nextCodePoint()){

            if(prohibitedSet.contains(cp)){
                throw new ParseException("A prohibited code point was found in the input",
                                         ParseException.PROHIBITED_ERROR,
                                         cursor.getText(),cursor.getIndex());
            }

            lastDir = UCharacter.getDirection(cp);
            if(firstDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
                firstDir = lastDir;
            }
            if(lastDir == UCharacterDirection.LEFT_TO_RIGHT){
                sawL = true;
                lastLtrPos = cursor.getIndex()-1;
            }else if(isRandAL(lastDir)){
                sawRandAL = true;
                lastRtlPos = cursor.getIndex()-1;
            }
        }

        // requirement 2: RandALCat and LCat characters must not be mixed
        if(sawL && sawRandAL){
            throw bidiError(cursor, lastRtlPos, lastLtrPos);
        }

        // requirement 3: with any RandALCat character present, both the
        // first and the last character must be RandALCat
        if(sawRandAL && (!isRandAL(firstDir) || !isRandAL(lastDir))){
            throw bidiError(cursor, lastRtlPos, lastLtrPos);
        }

        return new StringBuffer(mapped);

    }

}
diff --git a/icu4j/src/com/ibm/icu/dev/test/stringprep/PunycodeReference.java b/icu4j/src/com/ibm/icu/dev/test/stringprep/PunycodeReference.java new file mode 100644 index 00000000000..735798eedf0 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/stringprep/PunycodeReference.java @@ -0,0 +1,388 @@ +/* + ******************************************************************************* + * Copyright (C) 2003, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + * + * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/stringprep/PunycodeReference.java,v $ + * $Date: 2003/08/21 23:42:25 $ + * $Revision: 1.1 $ + * + ******************************************************************************* +*/ + +/* + * +Disclaimer and license + + Regarding this entire document or any portion of it (including + the pseudocode and C code), the author makes no guarantees and + is not responsible for any damage resulting from its use. The + author grants irrevocable permission to anyone to use, modify, + and distribute it in any way that does not diminish the rights + of anyone else to use, modify, and distribute it, provided that + redistributed derivative works do not contain misleading author or + version information. Derivative works need not be licensed under + similar terms.
+ +punycode.c 0.4.0 (2001-Nov-17-Sat) +http://www.cs.berkeley.edu/~amc/idn/ +Adam M. Costello +http://www.nicemice.net/amc/ +*/ + +package com.ibm.icu.dev.test.stringprep; +import com.ibm.icu.stringprep.ParseException; +import com.ibm.icu.text.UCharacterIterator; +import com.ibm.icu.text.UTF16; + +/** + * The implementation is direct port of C code in the RFC + */ + +public final class PunycodeReference { + /*** punycode status codes */ + public static final int punycode_success=0; + public static final int punycode_bad_input=1; /* Input is invalid. */ + public static final int punycode_big_output=2; /* Output would exceed the space provided. */ + public static final int punycode_overflow =3; /* Input needs wider integers to process. */ + + /*** Bootstring parameters for Punycode ***/ + private static final int base = 36; + private static final int tmin = 1; + private static final int tmax = 26; + private static final int skew = 38; + private static final int damp = 700; + private static final int initial_bias = 72; + private static final int initial_n = 0x80; + private static final int delimiter = 0x2D; + + + private static final long UNSIGNED_INT_MASK = 0xffffffffL; + + /* basic(cp) tests whether cp is a basic code point: */ + private static boolean basic(int cp){ + return (char)(cp) < 0x80; + } + + /* delim(cp) tests whether cp is a delimiter: */ + private static boolean delim(int cp){ + return ((cp) == delimiter); + } + + /* decode_digit(cp) returns the numeric value of a basic code */ + /* point (for use in representing integers) in the range 0 to */ + /* base-1, or base if cp is does not represent a value. */ + + private static int decode_digit(int cp) + { + return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 : + cp - 97 < 26 ? cp - 97 : base; + } + + /* encode_digit(d,flag) returns the basic code point whose value */ + /* (when used for representing integers) is d, which needs to be in */ + /* the range 0 to base-1. 
The lowercase form is used unless flag is */ + /* nonzero, in which case the uppercase form is used. The behavior */ + /* is undefined if flag is nonzero and digit d has no uppercase form. */ + + private static char encode_digit(int d, int flag) + { + return (char) (d + 22 + (75 * ((d < 26) ? 1 : 0) - (((flag != 0) ? 1 :0) << 5))); + /* 0..25 map to ASCII a..z or A..Z */ + /* 26..35 map to ASCII 0..9 */ + } + + /* flagged(bcp) tests whether a basic code point is flagged */ + /* (uppercase). The behavior is undefined if bcp is not a */ + /* basic code point. */ + + private static boolean flagged(int bcp){ + return ((bcp) - 65 < 26); + } + + /* encode_basic(bcp,flag) forces a basic code point to lowercase */ + /* if flag is zero, uppercase if flag is nonzero, and returns */ + /* the resulting code point. The code point is unchanged if it */ + /* is caseless. The behavior is undefined if bcp is not a basic */ + /* code point. */ + + private static char encode_basic(int bcp, int flag) + { + bcp -= (((bcp - 97) < 26) ? 1 :0 ) << 5; + boolean mybcp = (bcp - 65 < 26); + return (char) (bcp + (((flag==0) && mybcp ) ? 1 : 0 ) << 5); + } + + /*** Platform-specific constants ***/ + + /* maxint is the maximum value of a punycode_uint variable: */ + private static long maxint = 0xFFFFFFFFL; + /* Because maxint is unsigned, -1 becomes the maximum value. */ + + /*** Bias adaptation function ***/ + + private static int adapt(int delta, int numpoints, boolean firsttime ){ + int k; + + delta = (firsttime==true) ? 
delta / damp : delta >> 1; + /* delta >> 1 is a faster way of doing delta / 2 */ + delta += delta / numpoints; + + for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) { + delta /= base - tmin; + } + + return k + (base - tmin + 1) * delta / (delta + skew); + } + + /*** Main encode function ***/ + + public static final int encode( int input_length, + int input[], + char[] case_flags, + int[] output_length, + char output[] ){ + int delta, h, b, out, max_out, bias, j, q, k, t; + long m,n; + /* Initialize the state: */ + + n = initial_n; + delta = out = 0; + max_out = output_length[0]; + bias = initial_bias; + + /* Handle the basic code points: */ + + for (j = 0; j < input_length; ++j) { + if (basic(input[j])) { + if (max_out - out < 2) return punycode_big_output; + output[out++] = (char) + (case_flags!=null ? encode_basic(input[j], case_flags[j]) : input[j]); + } + /* else if (input[j] < n) return punycode_bad_input; */ + /* (not needed for Punycode with unsigned code points) */ + } + + h = b = out; + + /* h is the number of code points that have been handled, b is the */ + /* number of basic code points, and out is the number of characters */ + /* that have been output. */ + + if (b > 0) output[out++] = delimiter; + + /* Main encoding loop: */ + + while (h < input_length) { + /* All non-basic code points < n have been */ + /* handled already. 
Find the next larger one: */ + + for (m = maxint, j = 0; j < input_length; ++j) { + /* if (basic(input[j])) continue; */ + /* (not needed for Punycode) */ + if (input[j] >= n && input[j] < m) m = input[j]; + } + + /* Increase delta enough to advance the decoder's */ + /* state to , but guard against overflow: */ + + if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow; + delta += (m - n) * (h + 1); + n = m; + + for (j = 0; j < input_length; ++j) { + /* Punycode does not need to check whether input[j] is basic: */ + if (input[j] < n /* || basic(input[j]) */ ) { + if (++delta == 0) return punycode_overflow; + } + + if (input[j] == n) { + /* Represent delta as a generalized variable-length integer: */ + + for (q = delta, k = base; ; k += base) { + if (out >= max_out) return punycode_big_output; + t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */ + k >= bias + tmax ? tmax : k - bias; + if (q < t) break; + output[out++] = encode_digit(t + (q - t) % (base - t), 0); + q = (q - t) / (base - t); + } + + output[out++] = encode_digit(q, (case_flags !=null) ? 
case_flags[j] : 0); + bias = adapt(delta, h + 1, (h == b)); + delta = 0; + ++h; + } + } + + ++delta; + ++n; + } + + output_length[0] = out; + return punycode_success; + } + + public static final StringBuffer encode(StringBuffer input,char[] case_flags) + throws ParseException{ + int[] in = new int[input.length()]; + int inLen = 0; + int ch; + StringBuffer result = new StringBuffer(); + UCharacterIterator iter = UCharacterIterator.getInstance(input); + while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ + in[inLen++]=ch; + } + + int[] outLen = new int[1]; + outLen[0] = input.length()*4; + char[] output = new char[outLen[0]]; + int rc = punycode_success; + for(;;){ + rc = encode(inLen,in,case_flags, outLen, output); + if(rc==punycode_big_output){ + outLen[0] = outLen[0]*4; + output = new char[outLen[0]]; + // continue to convert + continue; + } + break; + } + if(rc==punycode_success){ + return result.append(output,0,outLen[0]); + } + getException(rc); + return result; + } + + private static void getException(int rc) + throws ParseException{ + switch(rc){ + case punycode_big_output: + throw new ParseException("The output capacity was not sufficient.",ParseException.BUFFER_OVERFLOW_ERROR); + case punycode_bad_input: + throw new ParseException("Illegal char found in the input",ParseException.ILLEGAL_CHAR_FOUND); + case punycode_overflow: + throw new ParseException("Invalid char found in the input",ParseException.INVALID_CHAR_FOUND); + } + + } + private static final int MAX_BUFFER_SIZE = 100; + + public static final StringBuffer decode(StringBuffer input,char[] case_flags) + throws ParseException{ + char[] in = input.toString().toCharArray(); + int[] outLen = new int[1]; + outLen[0] = MAX_BUFFER_SIZE; + int[] output = new int[outLen[0]]; + int rc = punycode_success; + StringBuffer result = new StringBuffer(); + for(;;){ + rc = decode(input.length(),in, outLen, output,case_flags); + if(rc==punycode_big_output){ + outLen[0] = output.length * 4; + output = new 
int[outLen[0]]; + continue; + } + break; + } + if(rc==punycode_success){ + for(int i=0; i < outLen[0]; i++ ){ + UTF16.append(result,output[i]); + } + }else{ + getException(rc); + } + return result; + } + + /*** Main decode function ***/ + public static final int decode(int input_length, + char[] input, + int[] output_length, + int[] output, + char[] case_flags ){ + int n, out, i, max_out, bias, + b, j, in, oldi, w, k, digit, t; + + /* Initialize the state: */ + + n = initial_n; + out = i = 0; + max_out = output_length[0]; + bias = initial_bias; + + /* Handle the basic code points: Let b be the number of input code */ + /* points before the last delimiter, or 0 if there is none, then */ + /* copy the first b code points to the output. */ + + for (b = j = 0; j < input_length; ++j){ + if (delim(input[j])==true){ + b = j; + } + } + if (b > max_out) return punycode_big_output; + + for (j = 0; j < b; ++j) { + if (case_flags != null) case_flags[out] = (char)(flagged(input[j]) ? 1 : 0); + if (!basic(input[j])) return punycode_bad_input; + output[out++] = input[j]; + } + + /* Main decoding loop: Start just after the last delimiter if any */ + /* basic code points were copied; start at the beginning otherwise. */ + + for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) { + + /* in is the index of the next character to be consumed, and */ + /* out is the number of code points in the output array. */ + + /* Decode a generalized variable-length integer into delta, */ + /* which gets added to i. The overflow checking is easier */ + /* if we increase i as we go, then subtract off its starting */ + /* value at the end to obtain delta. */ + + for (oldi = i, w = 1, k = base; ; k += base) { + if (in >= input_length) return punycode_bad_input; + digit = decode_digit(input[in++]); + if (digit >= base) return punycode_bad_input; + if (digit > (maxint - i) / w) return punycode_overflow; + i += digit * w; + t = (k <= bias) /* + tmin */ ? 
tmin : /* +tmin not needed */ + (k >= (bias + tmax)) ? tmax : k - bias; + if (digit < t) break; + if (w > maxint / (base - t)) return punycode_overflow; + w *= (base - t); + } + + bias = adapt(i - oldi, out + 1, (oldi == 0)); + + /* i was supposed to wrap around from out+1 to 0, */ + /* incrementing n each time, so we'll fix that now: */ + + if (i / (out + 1) > maxint - n) return punycode_overflow; + n += i / (out + 1); + i %= (out + 1); + + /* Insert n at position i of the output: */ + + /* not needed for Punycode: */ + /* if (decode_digit(n) <= base) return punycode_invalid_input; */ + if (out >= max_out) return punycode_big_output; + + if (case_flags != null) { + System.arraycopy(case_flags, i, case_flags, i + 1, out - i); + /* Case of last character determines uppercase flag: */ + case_flags[i] = (char)(flagged(input[in - 1]) ? 0 :1); + } + + System.arraycopy(output, i, output, i + 1, (out - i)); + output[i++] = n; + } + + output_length[0] = out; + return punycode_success; + } + +} diff --git a/icu4j/src/com/ibm/icu/dev/test/stringprep/TestAll.java b/icu4j/src/com/ibm/icu/dev/test/stringprep/TestAll.java new file mode 100644 index 00000000000..b40d3469331 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/stringprep/TestAll.java @@ -0,0 +1,42 @@ +/* + ******************************************************************************* + * Copyright (C) 2003, International Business Machines Corporation and * + * others. All Rights Reserved. 
*
+ *******************************************************************************
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/stringprep/TestAll.java,v $
+ * $Date: 2003/08/21 23:42:25 $
+ * $Revision: 1.1 $
+ *
+ *******************************************************************************
+*/
+package com.ibm.icu.dev.test.stringprep;
+
+import com.ibm.icu.dev.test.TestFmwk.TestGroup;
+
+/**
+ * @author ram
+ *
+ * Test group for the stringprep package: it registers the TestIDNA,
+ * TestStringPrep and TestIDNARef tests under "StringPrep and IDNA test".
+ */
+public class TestAll extends TestGroup {
+
+    public static void main(String[] args) throws Exception {
+        new TestAll().run(args);
+    }
+
+    public TestAll() {
+        super(
+              new String[] {
+                  "TestIDNA",
+                  "TestStringPrep",
+                  "TestIDNARef"
+              },
+              "StringPrep and IDNA test");
+    }
+
+    public static final String CLASS_TARGET_NAME = "StringPrep";
+
+
+}
diff --git a/icu4j/src/com/ibm/icu/dev/test/stringprep/TestData.java b/icu4j/src/com/ibm/icu/dev/test/stringprep/TestData.java
new file mode 100644
index 00000000000..709df3fd00a
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/dev/test/stringprep/TestData.java
@@ -0,0 +1,631 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 2003, International Business Machines Corporation and         *
+ * others. All Rights Reserved.                                                
* + ******************************************************************************* + * + * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/stringprep/TestData.java,v $ + * $Date: 2003/08/21 23:42:25 $ + * $Revision: 1.1 $ + * + ******************************************************************************* +*/ +package com.ibm.icu.dev.test.stringprep; + +import com.ibm.icu.stringprep.IDNA; +import com.ibm.icu.stringprep.ParseException; + +/** + * @author ram + * + * To change the template for this generated type comment go to + * Window>Preferences>Java>Code Generation>Code and Comments + */ +public class TestData { + public static final char[][] unicodeIn ={ + { + 0x0644, 0x064A, 0x0647, 0x0645, 0x0627, 0x0628, 0x062A, 0x0643, 0x0644, + 0x0645, 0x0648, 0x0634, 0x0639, 0x0631, 0x0628, 0x064A, 0x061F + }, + { + 0x4ED6, 0x4EEC, 0x4E3A, 0x4EC0, 0x4E48, 0x4E0D, 0x8BF4, 0x4E2D, 0x6587, + + }, + { + 0x0050, 0x0072, 0x006F, 0x010D, 0x0070, 0x0072, 0x006F, 0x0073, 0x0074, + 0x011B, 0x006E, 0x0065, 0x006D, 0x006C, 0x0075, 0x0076, 0x00ED, 0x010D, + 0x0065, 0x0073, 0x006B, 0x0079, + }, + { + 0x05DC, 0x05DE, 0x05D4, 0x05D4, 0x05DD, 0x05E4, 0x05E9, 0x05D5, 0x05D8, + 0x05DC, 0x05D0, 0x05DE, 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x05DD, 0x05E2, + 0x05D1, 0x05E8, 0x05D9, 0x05EA, + }, + { + 0x092F, 0x0939, 0x0932, 0x094B, 0x0917, 0x0939, 0x093F, 0x0928, 0x094D, + 0x0926, 0x0940, 0x0915, 0x094D, 0x092F, 0x094B, 0x0902, 0x0928, 0x0939, + 0x0940, 0x0902, 0x092C, 0x094B, 0x0932, 0x0938, 0x0915, 0x0924, 0x0947, + 0x0939, 0x0948, 0x0902, + }, + { + 0x306A, 0x305C, 0x307F, 0x3093, 0x306A, 0x65E5, 0x672C, 0x8A9E, 0x3092, + 0x8A71, 0x3057, 0x3066, 0x304F, 0x308C, 0x306A, 0x3044, 0x306E, 0x304B, + + }, + /* + { + 0xC138, 0xACC4, 0xC758, 0xBAA8, 0xB4E0, 0xC0AC, 0xB78C, 0xB4E4, 0xC774, + 0xD55C, 0xAD6D, 0xC5B4, 0xB97C, 0xC774, 0xD574, 0xD55C, 0xB2E4, 0xBA74, + 0xC5BC, 0xB9C8, 0xB098, 0xC88B, 0xC744, 0xAE4C, + }, + */ + { + 0x043F, 0x043E, 0x0447, 0x0435, 0x043C, 0x0443, 0x0436, 0x0435, 
0x043E, + 0x043D, 0x0438, 0x043D, 0x0435, 0x0433, 0x043E, 0x0432, 0x043E, 0x0440, + 0x044F, 0x0442, 0x043F, 0x043E, 0x0440, 0x0443, 0x0441, 0x0441, 0x043A, + 0x0438, + }, + { + 0x0050, 0x006F, 0x0072, 0x0071, 0x0075, 0x00E9, 0x006E, 0x006F, 0x0070, + 0x0075, 0x0065, 0x0064, 0x0065, 0x006E, 0x0073, 0x0069, 0x006D, 0x0070, + 0x006C, 0x0065, 0x006D, 0x0065, 0x006E, 0x0074, 0x0065, 0x0068, 0x0061, + 0x0062, 0x006C, 0x0061, 0x0072, 0x0065, 0x006E, 0x0045, 0x0073, 0x0070, + 0x0061, 0x00F1, 0x006F, 0x006C, + }, + { + 0x4ED6, 0x5011, 0x7232, 0x4EC0, 0x9EBD, 0x4E0D, 0x8AAA, 0x4E2D, 0x6587, + + }, + { + 0x0054, 0x1EA1, 0x0069, 0x0073, 0x0061, 0x006F, 0x0068, 0x1ECD, 0x006B, + 0x0068, 0x00F4, 0x006E, 0x0067, 0x0074, 0x0068, 0x1EC3, 0x0063, 0x0068, + 0x1EC9, 0x006E, 0x00F3, 0x0069, 0x0074, 0x0069, 0x1EBF, 0x006E, 0x0067, + 0x0056, 0x0069, 0x1EC7, 0x0074, + }, + { + 0x0033, 0x5E74, 0x0042, 0x7D44, 0x91D1, 0x516B, 0x5148, 0x751F, + }, + { + 0x5B89, 0x5BA4, 0x5948, 0x7F8E, 0x6075, 0x002D, 0x0077, 0x0069, 0x0074, + 0x0068, 0x002D, 0x0053, 0x0055, 0x0050, 0x0045, 0x0052, 0x002D, 0x004D, + 0x004F, 0x004E, 0x004B, 0x0045, 0x0059, 0x0053, + }, + { + 0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002D, 0x0041, 0x006E, 0x006F, + 0x0074, 0x0068, 0x0065, 0x0072, 0x002D, 0x0057, 0x0061, 0x0079, 0x002D, + 0x305D, 0x308C, 0x305E, 0x308C, 0x306E, 0x5834, 0x6240, + }, + { + 0x3072, 0x3068, 0x3064, 0x5C4B, 0x6839, 0x306E, 0x4E0B, 0x0032, + }, + { + 0x004D, 0x0061, 0x006A, 0x0069, 0x3067, 0x004B, 0x006F, 0x0069, 0x3059, + 0x308B, 0x0035, 0x79D2, 0x524D, + }, + { + 0x30D1, 0x30D5, 0x30A3, 0x30FC, 0x0064, 0x0065, 0x30EB, 0x30F3, 0x30D0, + + }, + { + 0x305D, 0x306E, 0x30B9, 0x30D4, 0x30FC, 0x30C9, 0x3067, + }, + // test non-BMP code points + { + 0xD800, 0xDF00, 0xD800, 0xDF01, 0xD800, 0xDF02, 0xD800, 0xDF03, 0xD800, 0xDF05, + 0xD800, 0xDF06, 0xD800, 0xDF07, 0xD800, 0xDF09, 0xD800, 0xDF0A, 0xD800, 0xDF0B, + + }, + { + 0xD800, 0xDF0D, 0xD800, 0xDF0C, 0xD800, 0xDF1E, 0xD800, 0xDF0F, 0xD800, 0xDF16, + 
0xD800, 0xDF15, 0xD800, 0xDF14, 0xD800, 0xDF12, 0xD800, 0xDF10, 0xD800, 0xDF20, + 0xD800, 0xDF21, + + }, + // Greek + { + 0x03b5, 0x03bb, 0x03bb, 0x03b7, 0x03bd, 0x03b9, 0x03ba, 0x03ac + }, + // Maltese + { + 0x0062, 0x006f, 0x006e, 0x0121, 0x0075, 0x0073, 0x0061, 0x0127, + 0x0127, 0x0061 + }, + // Russian + { + 0x043f, 0x043e, 0x0447, 0x0435, 0x043c, 0x0443, 0x0436, 0x0435, + 0x043e, 0x043d, 0x0438, 0x043d, 0x0435, 0x0433, 0x043e, 0x0432, + 0x043e, 0x0440, 0x044f, 0x0442, 0x043f, 0x043e, 0x0440, 0x0443, + 0x0441, 0x0441, 0x043a, 0x0438 + }, + + }; + + public static final String[] asciiIn = { + "xn--egbpdaj6bu4bxfgehfvwxn", + "xn--ihqwcrb4cv8a8dqg056pqjye", + "xn--Proprostnemluvesky-uyb24dma41a", + "xn--4dbcagdahymbxekheh6e0a7fei0b", + "xn--i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd", + "xn--n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa", + /* "xn--989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c",*/ + "xn--b1abfaaepdrnnbgefbaDotcwatmq2g4l", + "xn--PorqunopuedensimplementehablarenEspaol-fmd56a", + "xn--ihqwctvzc91f659drss3x8bo0yb", + "xn--TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g", + "xn--3B-ww4c5e180e575a65lsy2b", + "xn---with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n", + "xn--Hello-Another-Way--fc4qua05auwb3674vfr0b", + "xn--2-u9tlzr9756bt3uc0v", + "xn--MajiKoi5-783gue6qz075azm5e", + "xn--de-jg4avhby1noc0d", + "xn--d9juau41awczczp", + "XN--097CCDEKGHQJK", + "XN--db8CBHEJLGH4E0AL", + "xn--hxargifdar", // Greek + "xn--bonusaa-5bb1da", // Maltese + "xn--b1abfaaepdrnnbgefbadotcwatmq2g4l", // Russian (Cyrillic) + }; + + public static final String[] domainNames = { + "slip129-37-118-146.nc.us.ibm.net", + "saratoga.pe.utexas.edu", + "dial-120-45.ots.utexas.edu", + "woo-085.dorms.waller.net", + "hd30-049.hil.compuserve.com", + "pem203-31.pe.ttu.edu", + "56K-227.MaxTNT3.pdq.net", + "dial-36-2.ots.utexas.edu", + "slip129-37-23-152.ga.us.ibm.net", + "ts45ip119.cadvision.com", + "sdn-ts-004txaustP05.dialsprint.net", + "bar-tnt1s66.erols.com", + 
"101.st-louis-15.mo.dial-access.att.net", + "h92-245.Arco.COM", + "dial-13-2.ots.utexas.edu", + "net-redynet29.datamarkets.com.ar", + "ccs-shiva28.reacciun.net.ve", + "7.houston-11.tx.dial-access.att.net", + "ingw129-37-120-26.mo.us.ibm.net", + "dialup6.austintx.com", + "dns2.tpao.gov.tr", + "slip129-37-119-194.nc.us.ibm.net", + "cs7.dillons.co.uk.203.119.193.in-addr.arpa", + "swprd1.innovplace.saskatoon.sk.ca", + "bikini.bologna.maraut.it", + "node91.subnet159-198-79.baxter.com", + "cust19.max5.new-york.ny.ms.uu.net", + "balexander.slip.andrew.cmu.edu", + "pool029.max2.denver.co.dynip.alter.net", + "cust49.max9.new-york.ny.ms.uu.net", + "s61.abq-dialin2.hollyberry.com", + + }; + + public static final String[] domainNames1Uni = { + "http://\u0917\u0928\u0947\u0936.sanjose.ibm.com", + "www.\u0121.com", + "www.\u00E0\u00B3\u00AF.com", + "www.\u00C2\u00A4.com", + "www.\u00C2\u00A3.com", + "\u0025", + "\u005C\u005C", + "@", + "\u002F", + "www.\u0021.com", + "www.\u0024.com", + "\u003f", + // These yeild U_IDNA_PROHIBITED_ERROR + //"\\u00CF\\u0082.com", + //"\\u00CE\\u00B2\\u00C3\\u009Fss.com", + //"\\u00E2\\u0098\\u00BA.com", + "\u00C3\u00BC.com" + }; + public static final String[] domainNamesToASCIIOut = { + "xn--http://-3mo7iufsh.sanjose.ibm.com", + "www.xn--vea.com", + "www.xn--3 -iia80t.com", + "www.xn--bba7j.com", + "www.xn--9a9j.com", + "\u0025", + "\u005C\u005C", + "@", + "\u002F", + "www.\u0021.com", + "www.\u0024.com", + "\u003f", + "xn--14-ria7423a.com" + + }; + + public static final String[] domainNamesToUnicodeOut = { + "http://\u0917\u0928\u0947\u0936.sanjose.ibm.com", + "www.\u0121.com", + "www.\u00E0\u0033\u0020\u0304.com", + "www.\u00E2\u00A4.com", + "www.\u00E2\u00A3.com", + "\u0025", + "\u005C\u005C", + "@", + "\u002F", + "www.\u0021.com", + "www.\u0024.com", + "\u003f", + "\u00E3\u0031\u2044\u0034.com" + + }; + + + public static class ErrorCase{ + + public char[] unicode; + public String ascii; + public Exception expected; + public boolean 
useSTD3ASCIIRules; + public boolean testToUnicode; + public boolean testLabel; + ErrorCase(char[] uniIn, String asciiIn, Exception ex, + boolean std3, boolean testToUni, boolean testlabel){ + unicode = uniIn; + ascii = asciiIn; + expected = ex; + useSTD3ASCIIRules = std3; + testToUnicode = testToUni; + testLabel = testlabel; + + } + }; + public static final ErrorCase[] errorCases = { + + + new ErrorCase( new char[]{ + 0x0077, 0x0077, 0x0077, 0x002e, /* www. */ + 0xC138, 0xACC4, 0xC758, 0xBAA8, 0xB4E0, 0xC0AC, 0xB78C, 0xB4E4, 0xC774, + 0x070F,/*prohibited*/ + 0xD55C, 0xAD6D, 0xC5B4, 0xB97C, 0xC774, 0xD574, 0xD55C, 0xB2E4, 0xBA74, + 0x002e, 0x0063, 0x006f, 0x006d, /* com. */ + + }, + "www.XN--8mb5595fsoa28orucya378bqre2tcwop06c5qbw82a1rffmae0361dea96b.com", + new ParseException("",ParseException.PROHIBITED_ERROR), + false, true, true), + + new ErrorCase( new char[]{ + 0x0077, 0x0077, 0x0077, 0x002e, /* www. */ + 0xC138, 0xACC4, 0xC758, 0xBAA8, 0xB4E0, 0xC0AC, 0xB78C, 0xB4E4, 0xC774, + 0x0221, 0x0234/*Unassigned code points*/, + 0x002e, 0x0063, 0x006f, 0x006d, /* com. */ + + }, + "www.XN--6lA2Bz548Fj1GuA391Bf1Gb1N59Ab29A7iA.com", + + new ParseException("",ParseException.UNASSIGNED_ERROR), + false, true, true + ), + new ErrorCase( new char[]{ + 0x0077, 0x0077, 0x0077, 0x002e, /* www. */ + 0xC138, 0xACC4, 0xC758, 0xBAA8, 0xB4E0, 0xC0AC, 0xB78C, 0xB4E4, 0xC774, + 0x0644, 0x064A, 0x0647,/*Arabic code points. Cannot mix RTL with LTR*/ + 0xD55C, 0xAD6D, 0xC5B4, 0xB97C, 0xC774, 0xD574, 0xD55C, 0xB2E4, 0xBA74, + 0x002e, 0x0063, 0x006f, 0x006d, /* com. */ + + }, + "www.xn--ghBGI4851OiyA33VqrD6Az86C4qF83CtRv93D5xBk15AzfG0nAgA0578DeA71C.com", + new ParseException("",ParseException.CHECK_BIDI_ERROR), + false, true, true + ), + new ErrorCase( new char[]{ + 0x0077, 0x0077, 0x0077, 0x002e, /* www. 
*/ + /* labels cannot begin with an HYPHEN */ + 0x002D, 0xACC4, 0xC758, 0xBAA8, 0xB4E0, 0xC0AC, 0xB78C, 0xB4E4, 0xC774, + 0x002E, + 0xD55C, 0xAD6D, 0xC5B4, 0xB97C, 0xC774, 0xD574, 0xD55C, 0xB2E4, 0xBA74, + 0x002e, 0x0063, 0x006f, 0x006d, /* com. */ + + + }, + "www.xn----b95Ew8SqA315Ao5FbuMlnNmhA.com", + new ParseException("",ParseException.STD3_ASCII_RULES_ERROR), + true, true, false + ), + new ErrorCase( new char[]{ + /* correct ACE-prefix followed by unicode */ + 0x0077, 0x0077, 0x0077, 0x002e, /* www. */ + 0x0078, 0x006e, 0x002d,0x002d, /* ACE Prefix */ + 0x002D, 0xACC4, 0xC758, 0xBAA8, 0xB4E0, 0xC0AC, 0xB78C, 0xB4E4, 0xC774, + 0x002D, + 0xD55C, 0xAD6D, 0xC5B4, 0xB97C, 0xC774, 0xD574, 0xD55C, 0xB2E4, 0xBA74, + 0x002e, 0x0063, 0x006f, 0x006d, /* com. */ + + + }, + /* wrong ACE-prefix followed by valid ACE-encoded ASCII */ + "www.XY-----b91I0V65S96C2A355Cw1E5yCeQr19CsnP1mFfmAE0361DeA96B.com", + new ParseException("",ParseException.ACE_PREFIX_ERROR), + false, false, false + ), + /* cannot verify U_IDNA_VERIFICATION_ERROR */ + + new ErrorCase( new char[]{ + 0x0077, 0x0077, 0x0077, 0x002e, /* www. */ + 0xC138, 0xACC4, 0xC758, 0xBAA8, 0xB4E0, 0xC0AC, 0xB78C, 0xB4E4, 0xC774, + 0xD55C, 0xAD6D, 0xC5B4, 0xB97C, 0xC774, 0xD574, 0xD55C, 0xB2E4, 0xBA74, + 0xC5BC, 0xB9C8, 0xB098, 0xC88B, 0xC744, 0xAE4C, + 0x002e, 0x0063, 0x006f, 0x006d, /* com. */ + + }, + "www.xn--989AoMsVi5E83Db1D2A355Cv1E0vAk1DwRv93D5xBh15A0Dt30A5JpSD879Ccm6FeA98C.com", + new ParseException("",ParseException.LABEL_TOO_LONG_ERROR), + false, true, true + ), + new ErrorCase( new char[]{ + 0x0077, 0x0077, 0x0077, 0x002e, /* www. */ + 0x0030, 0x0644, 0x064A, 0x0647, 0x0031, /* Arabic code points squashed between EN codepoints */ + 0x002e, 0x0063, 0x006f, 0x006d, /* com. */ + + }, + "www.xn--01-tvdmo.com", + new ParseException("",ParseException.CHECK_BIDI_ERROR), + false, true, true + ), + + new ErrorCase( new char[]{ + 0x0077, 0x0077, 0x0077, 0x002e, // www. 
+ 0x206C, 0x0644, 0x064A, 0x0647, 0x206D, // Arabic code points squashed between BN codepoints + 0x002e, 0x0063, 0x006f, 0x006d, // com. + + }, + "www.XN--ghbgi278xia.com", + new ParseException("",ParseException.PROHIBITED_ERROR), + false, true, true + ), + new ErrorCase( new char[] { + 0x0077, 0x0077, 0x0077, 0x002e, // www. + 0x002D, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, // HYPHEN at the start of label + 0x002e, 0x0063, 0x006f, 0x006d, // com. + + }, + "www.-abcde.com", + new ParseException("",ParseException.STD3_ASCII_RULES_ERROR), + true, true, false + ), + new ErrorCase( new char[] { + 0x0077, 0x0077, 0x0077, 0x002e, // www. + 0x0041, 0x0042, 0x0043, 0x0044, 0x0045,0x002D, // HYPHEN at the end of the label + 0x002e, 0x0063, 0x006f, 0x006d, // com. + + }, + "www.abcde-.com", + new ParseException("",ParseException.STD3_ASCII_RULES_ERROR), + true, true, false + ), + new ErrorCase( new char[]{ + 0x0077, 0x0077, 0x0077, 0x002e, // www. + 0x0041, 0x0042, 0x0043, 0x0044, 0x0045,0x0040, // Containing non LDH code point + 0x002e, 0x0063, 0x006f, 0x006d, // com. + + }, + "www.abcde@.com", + new ParseException("",ParseException.STD3_ASCII_RULES_ERROR), + true, true, false + ), + + }; + + + public static final class ConformanceTestCase{ + String comment; + String input; + String output; + String profile; + int flags; + Exception expected; + private static byte[] getBytes(String in){ + if(in==null){ + return null; + } + byte[] bytes = new byte[in.length()]; + for(int i=0; i < in.length();i++){ + bytes[i] = (byte)in.charAt(i); + } + return bytes; + } + ConformanceTestCase(String comt, String in, String out, + String prof, int flg, Exception ex) + { + + try{ + comment = comt; + byte[] bytes = getBytes(in); + input = new String(bytes,"UTF-8"); + bytes = getBytes(out); + output = (bytes==null)? 
null : new String(bytes,"UTF-8"); + profile = prof; + flags = flg; + expected = ex; + }catch (Exception e){ + e.printStackTrace(); + throw new RuntimeException(); + } + } + } + + public static final ConformanceTestCase[] conformanceTestCases = + { + + new ConformanceTestCase( + "Case folding ASCII U+0043 U+0041 U+0046 U+0045", + "\u0043\u0041\u0046\u0045", "\u0063\u0061\u0066\u0065", + "Nameprep", IDNA.DEFAULT, + null + + ), + new ConformanceTestCase( + "Case folding 8bit U+00DF (german sharp s)", + "\u00C3\u009F", "\u0073\u0073", + "Nameprep", IDNA.DEFAULT, + null + ), + new ConformanceTestCase( + "Non-ASCII multibyte space character U+1680", + "\u00E1\u009A\u0080", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Non-ASCII 8bit control character U+0085", + "\u00C2\u0085", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Non-ASCII multibyte control character U+180E", + "\u00E1\u00A0\u008E", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Non-ASCII control character U+1D175", + "\u00F0\u009D\u0085\u00B5", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Plane 0 private use character U+F123", + "\u00EF\u0084\u00A3", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Plane 15 private use character U+F1234", + "\u00F3\u00B1\u0088\u00B4", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Plane 16 private use character U+10F234", + "\u00F4\u008F\u0088\u00B4", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Non-character code point U+8FFFE", + 
"\u00F2\u008F\u00BF\u00BE", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Non-character code point U+10FFFF", + "\u00F4\u008F\u00BF\u00BF", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + /* + { + "Surrogate code U+DF42", + "\u00ED\u00BD\u0082", null, "Nameprep", InternationalizedDomainNames.DEFAULT, + U_IDNA_PROHIBITED_ERROR + }, + */ + new ConformanceTestCase( + "Non-plain text character U+FFFD", + "\u00EF\u00BF\u00BD", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Ideographic description character U+2FF5", + "\u00E2\u00BF\u00B5", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Display property character U+0341", + "\u00CD\u0081", "\u00CC\u0081", + "Nameprep", IDNA.DEFAULT, + null + + ), + + new ConformanceTestCase( + "Left-to-right mark U+200E", + "\u00E2\u0080\u008E", "\u00CC\u0081", + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + + "Deprecated U+202A", + "\u00E2\u0080\u00AA", "\u00CC\u0081", + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Language tagging character U+E0001", + "\u00F3\u00A0\u0080\u0081", "\u00CC\u0081", + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Language tagging character U+E0042", + "\u00F3\u00A0\u0081\u0082", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.PROHIBITED_ERROR) + ), + new ConformanceTestCase( + "Bidi: RandALCat character U+05BE and LCat characters", + "\u0066\u006F\u006F\u00D6\u00BE\u0062\u0061\u0072", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.CHECK_BIDI_ERROR) + ), + new 
ConformanceTestCase( + "Bidi: RandALCat character U+FD50 and LCat characters", + "\u0066\u006F\u006F\u00EF\u00B5\u0090\u0062\u0061\u0072", null, + "Nameprep",IDNA.DEFAULT , + new ParseException("",ParseException.CHECK_BIDI_ERROR) + ), + new ConformanceTestCase( + "Bidi: RandALCat character U+FB38 and LCat characters", + "\u0066\u006F\u006F\u00EF\u00B9\u00B6\u0062\u0061\u0072", "\u0066\u006F\u006F \u00d9\u008e\u0062\u0061\u0072", + "Nameprep", IDNA.DEFAULT, + null + ), + new ConformanceTestCase( + "Bidi: RandALCat without trailing RandALCat U+0627 U+0031", + "\u00D8\u00A7\u0031", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.CHECK_BIDI_ERROR) + ), + new ConformanceTestCase( + "Bidi: RandALCat character U+0627 U+0031 U+0628", + "\u00D8\u00A7\u0031\u00D8\u00A8", "\u00D8\u00A7\u0031\u00D8\u00A8", + "Nameprep", IDNA.DEFAULT, + null + ), + new ConformanceTestCase( + "Unassigned code point U+E0002", + "\u00F3\u00A0\u0080\u0082", null, + "Nameprep", IDNA.DEFAULT, + new ParseException("",ParseException.UNASSIGNED_ERROR) + ), + + /* // Invalid UTF-8 + { + "Larger test (shrinking)", + "X\u00C2\u00AD\u00C3\u00DF\u00C4\u00B0\u00E2\u0084\u00A1\u006a\u00cc\u008c\u00c2\u00a0\u00c2" + "\u00aa\u00ce\u00b0\u00e2\u0080\u0080", "xssi\u00cc\u0087""tel\u00c7\u00b0 a\u00ce\u00b0 ", + "Nameprep", + InternationalizedDomainNames.DEFAULT, U_ZERO_ERROR + }, + { + + "Larger test (expanding)", + "X\u00C3\u00DF\u00e3\u008c\u0096\u00C4\u00B0\u00E2\u0084\u00A1\u00E2\u0092\u009F\u00E3\u008c\u0080", + "xss\u00e3\u0082\u00ad\u00e3\u0083\u00ad\u00e3\u0083\u00a1\u00e3\u0083\u00bc\u00e3\u0083\u0088" + "\u00e3\u0083\u00ab""i\u00cc\u0087""tel\u0028""d\u0029\u00e3\u0082\u00a2\u00e3\u0083\u0091" + "\u00e3\u0083\u00bc\u00e3\u0083\u0088" + "Nameprep", + InternationalizedDomainNames.DEFAULT, U_ZERO_ERROR + }, + */ + }; +} diff --git a/icu4j/src/com/ibm/icu/dev/test/stringprep/TestIDNA.java b/icu4j/src/com/ibm/icu/dev/test/stringprep/TestIDNA.java new file mode 100644 index 
00000000000..02485c30f53 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/stringprep/TestIDNA.java @@ -0,0 +1,700 @@ +/* + ******************************************************************************* + * Copyright (C) 2003, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + * + * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/stringprep/TestIDNA.java,v $ + * $Date: 2003/08/21 23:42:21 $ + * $Revision: 1.1 $ + * + ******************************************************************************* +*/ +package com.ibm.icu.dev.test.stringprep; + +import java.io.InputStream; +import java.util.Random; + +import com.ibm.icu.dev.test.TestFmwk; +import com.ibm.icu.stringprep.IDNA; +import com.ibm.icu.stringprep.StringPrep; +import com.ibm.icu.stringprep.ParseException; +import com.ibm.icu.text.UCharacterIterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.impl.LocaleUtility; +import com.ibm.icu.impl.Utility; + +/** + * @author ram + */ +public class TestIDNA extends TestFmwk { + public static void main(String[] args) throws Exception { + new TestIDNA().run(args); + } + private ParseException unassignedException = new ParseException("",ParseException.UNASSIGNED_ERROR); + public void TestToUnicode() throws Exception{ + for(int i=0; i= 0x30000){ + retVal+=0xB0000; + } + return retVal; + } + + private int randi(int n){ + return (int) (random.nextInt(0x7fff) % (n+1)); + } + + private StringBuffer getTestSource(StringBuffer fillIn) { + // use uniform seed value from the framework + if(random==null){ + random = createRandom(); + } + int i = 0; + int charCount = (randi(maxCharCount) + 1); + while (i 0x1FFFF){ + return; + } + if(i >= 0x30000){ + i+=0xB0000; + } + UTF16.append(src,i); + doTestCompareReferenceImpl(src); + } + } +} diff --git a/icu4j/src/com/ibm/icu/dev/test/stringprep/TestIDNARef.java 
b/icu4j/src/com/ibm/icu/dev/test/stringprep/TestIDNARef.java new file mode 100644 index 00000000000..fce44e6a047 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/stringprep/TestIDNARef.java @@ -0,0 +1,565 @@ +/* + ******************************************************************************* + * Copyright (C) 2003, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + * + * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/stringprep/TestIDNARef.java,v $ + * $Date: 2003/08/21 23:42:27 $ + * $Revision: 1.1 $ + * + ******************************************************************************* +*/ +package com.ibm.icu.dev.test.stringprep; + +import com.ibm.icu.dev.test.TestFmwk; +import com.ibm.icu.stringprep.ParseException; +import com.ibm.icu.text.UCharacterIterator; + +/** + * @author ram + * + * To change the template for this generated type comment go to + * Window>Preferences>Java>Code Generation>Code and Comments + */ +public class TestIDNARef extends TestFmwk { + public static void main(String[] args) throws Exception { + new TestIDNARef().run(args); + } + private ParseException unassignedException = new ParseException("",ParseException.UNASSIGNED_ERROR); + public void TestToUnicode() throws Exception{ + for(int i=0; iPreferences>Java>Code Generation>Code and Comments + */ +public class TestStringPrep extends TestFmwk { + public static void main(String[] args) throws Exception { + new TestStringPrep().run(args); + } + /* + There are several special identifiers ("who") which need to be + understood universally, rather than in the context of a particular + DNS domain. Some of these identifiers cannot be understood when an + NFS client accesses the server, but have meaning when a local process + accesses the file. 
The ability to display and modify these + permissions is permitted over NFS, even if none of the access methods + on the server understands the identifiers. + + Who Description + _______________________________________________________________ + + "OWNER" The owner of the file. + "GROUP" The group associated with the file. + "EVERYONE" The world. + "INTERACTIVE" Accessed from an interactive terminal. + "NETWORK" Accessed via the network. + "DIALUP" Accessed as a dialup user to the server. + "BATCH" Accessed from a batch job. + "ANONYMOUS" Accessed without any authentication. + "AUTHENTICATED" Any authenticated user (opposite of + ANONYMOUS) + "SERVICE" Access from a system service. + + To avoid conflict, these special identifiers are distinguish by an + appended "@" and should appear in the form "xxxx@" (note: no domain + name after the "@"). For example: ANONYMOUS@. + */ + private String[] mixed_prep_data ={ + "OWNER@", + "GROUP@", + "EVERYONE@", + "INTERACTIVE@", + "NETWORK@", + "DIALUP@", + "BATCH@", + "ANONYMOUS@", + "AUTHENTICATED@", + "\u0930\u094D\u092E\u094D\u0915\u094D\u0937\u0947\u0924\u094D@slip129-37-118-146.nc.us.ibm.net", + "\u0936\u094d\u0930\u0940\u092e\u0926\u094d@saratoga.pe.utexas.edu", + "\u092d\u0917\u0935\u0926\u094d\u0917\u0940\u0924\u093e@dial-120-45.ots.utexas.edu", + "\u0905\u0927\u094d\u092f\u093e\u092f@woo-085.dorms.waller.net", + "\u0905\u0930\u094d\u091c\u0941\u0928@hd30-049.hil.compuserve.com", + "\u0935\u093f\u0937\u093e\u0926@pem203-31.pe.ttu.edu", + "\u092f\u094b\u0917@56K-227.MaxTNT3.pdq.net", + "\u0927\u0943\u0924\u0930\u093e\u0937\u094d\u091f\u094d\u0930@dial-36-2.ots.utexas.edu", + "\u0909\u0935\u093E\u091A\u0943@slip129-37-23-152.ga.us.ibm.net", + "\u0927\u0930\u094d\u092e\u0915\u094d\u0937\u0947\u0924\u094d\u0930\u0947@ts45ip119.cadvision.com", + "\u0915\u0941\u0930\u0941\u0915\u094d\u0937\u0947\u0924\u094d\u0930\u0947@sdn-ts-004txaustP05.dialsprint.net", + "\u0938\u092e\u0935\u0947\u0924\u093e@bar-tnt1s66.erols.com", + 
"\u092f\u0941\u092f\u0941\u0924\u094d\u0938\u0935\u0903@101.st-louis-15.mo.dial-access.att.net", + "\u092e\u093e\u092e\u0915\u093e\u0903@h92-245.Arco.COM", + "\u092a\u093e\u0923\u094d\u0921\u0935\u093e\u0936\u094d\u091a\u0948\u0935@dial-13-2.ots.utexas.edu", + "\u0915\u093f\u092e\u0915\u0941\u0930\u094d\u0935\u0924@net-redynet29.datamarkets.com.ar", + "\u0938\u0902\u091c\u0935@ccs-shiva28.reacciun.net.ve", + "\u0c30\u0c18\u0c41\u0c30\u0c3e\u0c2e\u0c4d@7.houston-11.tx.dial-access.att.net", + "\u0c35\u0c3f\u0c36\u0c4d\u0c35\u0c28\u0c3e\u0c27@ingw129-37-120-26.mo.us.ibm.net", + "\u0c06\u0c28\u0c02\u0c26\u0c4d@dialup6.austintx.com", + "\u0C35\u0C26\u0C4D\u0C26\u0C3F\u0C30\u0C3E\u0C1C\u0C41@dns2.tpao.gov.tr", + "\u0c30\u0c3e\u0c1c\u0c40\u0c35\u0c4d@slip129-37-119-194.nc.us.ibm.net", + "\u0c15\u0c36\u0c30\u0c2c\u0c3e\u0c26@cs7.dillons.co.uk.203.119.193.in-addr.arpa", + "\u0c38\u0c02\u0c1c\u0c40\u0c35\u0c4d@swprd1.innovplace.saskatoon.sk.ca", + "\u0c15\u0c36\u0c30\u0c2c\u0c3e\u0c26@bikini.bologna.maraut.it", + "\u0c38\u0c02\u0c1c\u0c40\u0c2c\u0c4d@node91.subnet159-198-79.baxter.com", + "\u0c38\u0c46\u0c28\u0c4d\u0c17\u0c41\u0c2a\u0c4d\u0c24@cust19.max5.new-york.ny.ms.uu.net", + "\u0c05\u0c2e\u0c30\u0c47\u0c02\u0c26\u0c4d\u0c30@balexander.slip.andrew.cmu.edu", + "\u0c39\u0c28\u0c41\u0c2e\u0c3e\u0c28\u0c41\u0c32@pool029.max2.denver.co.dynip.alter.net", + "\u0c30\u0c35\u0c3f@cust49.max9.new-york.ny.ms.uu.net", + "\u0c15\u0c41\u0c2e\u0c3e\u0c30\u0c4d@s61.abq-dialin2.hollyberry.com", + "\u0c35\u0c3f\u0c36\u0c4d\u0c35\u0c28\u0c3e\u0c27@\u0917\u0928\u0947\u0936.sanjose.ibm.com", + "\u0c06\u0c26\u0c3f\u0c24\u0c4d\u0c2f@www.\u00E0\u00B3\u00AF.com", + "\u0C15\u0C02\u0C26\u0C4D\u0C30\u0C47\u0C17\u0C41\u0c32@www.\u00C2\u00A4.com", + "\u0c36\u0c4d\u0c30\u0c40\u0C27\u0C30\u0C4D@www.\u00C2\u00A3.com", + "\u0c15\u0c02\u0c1f\u0c2e\u0c36\u0c46\u0c1f\u0c4d\u0c1f\u0c3f@\u0025", + "\u0c2e\u0c3e\u0c27\u0c35\u0c4d@\u005C\u005C", + 
"\u0c26\u0c46\u0c36\u0c46\u0c1f\u0c4d\u0c1f\u0c3f@www.\u0021.com", + "test@www.\u0024.com", + "help@\u00C3\u00BC.com", + }; + public void TestNFS4MixedPrep(){ + for(int i=0; i< mixed_prep_data.length; i++){ + try{ + String src = mixed_prep_data[i]; + byte[] dest = NFS4StringPrep.mixed_prepare(src.getBytes("UTF-8")); + String destString = new String(dest, "UTF-8"); + int destIndex = destString.indexOf('@'); + if(destIndex < 0){ + errln("Delimiter @ disappeared from the output!"); + } + }catch(Exception e){ + errln("mixed_prepare for string: " + mixed_prep_data[i] +" failed with " + e.toString()); + } + } + /* test the error condition */ + { + String src = "OWNER@oss.software.ibm.com"; + try{ + byte[] dest = NFS4StringPrep.mixed_prepare(src.getBytes("UTF-8")); + if(dest!=null){ + errln("Did not get the expected exception"); + } + }catch(Exception e){ + logln("mixed_prepare for string: " + src +" passed with " + e.toString()); + } + + } + } + public void TestCISPrep(){ + + for(int i=0;i< (TestData.conformanceTestCases.length);i++){ + TestData.ConformanceTestCase testCase = TestData.conformanceTestCases[i]; + String src = testCase.input; + Exception expected = testCase.expected; + String expectedDest = testCase.output; + try{ + byte[] dest =NFS4StringPrep.cis_prepare(src.getBytes("UTF-8")); + String destString = new String(dest, "UTF-8"); + if(!expectedDest.equalsIgnoreCase(destString)){ + errln("Did not get the expected output for nfs4_cis_prep at index " + i); + } + }catch(Exception e){ + if(!expected.equals(e)){ + errln("Did not get the expected exception"); + } + } + + } + } + private static String[] cs_prep_data = { + //BIDI checking is turned off .. 
so + "\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774\u0644\u064A\u0647\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74", + + }; + public void TestCSPrep(){ + + // Checking for bidi is turned off + String src = "\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774\u0644\u064A\u0647\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"; + try{ + NFS4StringPrep.cs_prepare(src.getBytes("UTF-8"), false); + }catch(Exception e){ + errln("Got unexpected exception: " + e.toString()); + } + + // normalization is turned off + try{ + src = "www.\u00E0\u00B3\u00AF.com"; + byte[] dest = NFS4StringPrep.cs_prepare(src.getBytes("UTF-8"), false); + String destStr = new String(dest, "UTF-8"); + if(!src.equals(destStr)){ + errln("Did not get expected output. Expected: "+ prettify(src)+ + " Got: " + prettify(destStr)); + } + }catch(Exception e){ + errln("Got unexpected exception: " + e.toString()); + } + + // test case insensitive string + try{ + src = "THISISATEST"; + byte[] dest = NFS4StringPrep.cs_prepare(src.getBytes("UTF-8"), true); + String destStr = new String(dest, "UTF-8"); + if(!src.toLowerCase().equals(destStr)){ + errln("Did not get expected output. 
Expected: "+ prettify(src)+ + " Got: " + prettify(destStr)); + } + }catch(Exception e){ + errln("Got unexpected exception: " + e.toString()); + } + } + +} diff --git a/icu4j/src/com/ibm/icu/impl/LocaleUtility.java b/icu4j/src/com/ibm/icu/impl/LocaleUtility.java index 7e3118719bc..bc7d2dfc338 100644 --- a/icu4j/src/com/ibm/icu/impl/LocaleUtility.java +++ b/icu4j/src/com/ibm/icu/impl/LocaleUtility.java @@ -5,13 +5,14 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/LocaleUtility.java,v $ - * $Date: 2003/06/03 18:49:32 $ - * $Revision: 1.8 $ + * $Date: 2003/08/21 23:41:25 $ + * $Revision: 1.9 $ * ***************************************************************************************** */ package com.ibm.icu.impl; +import java.io.InputStream; import java.util.Locale; /** @@ -131,4 +132,10 @@ public class LocaleUtility { } return new Locale(parts[0], parts[1], parts[2]); } + + public static InputStream getImplDataResourceAsStream(String name){ + Class myClass = new LocaleUtility().getClass(); + String fullName = "data/"+name; + return myClass.getResourceAsStream(fullName); + } } diff --git a/icu4j/src/com/ibm/icu/impl/StringPrepDataReader.java b/icu4j/src/com/ibm/icu/impl/StringPrepDataReader.java new file mode 100644 index 00000000000..357827cde20 --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/StringPrepDataReader.java @@ -0,0 +1,96 @@ +/* + * Created on May 2, 2003 + * + * To change the template for this generated file go to + * Window>Preferences>Java>Code Generation>Code and Comments + */ +package com.ibm.icu.impl; + +import java.io.DataInputStream; +import java.io.IOException; +import java.io.InputStream; + + + +/** + * @author ram + * + * To change the template for this generated type comment go to + * Window>Preferences>Java>Code Generation>Code and Comments + */ +public final class StringPrepDataReader implements ICUBinary.Authenticate { + private final static boolean debug 
= ICUDebug.enabled("NormalizerDataReader"); + + /** + *

private constructor.

+ * @param inputStream ICU uprop.dat file input stream + * @exception IOException throw if data file fails authentication + * @draft 2.1 + */ + public StringPrepDataReader(InputStream inputStream) + throws IOException{ + if(debug) System.out.println("Bytes in inputStream " + inputStream.available()); + + unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this); + + if(debug) System.out.println("Bytes left in inputStream " +inputStream.available()); + + dataInputStream = new DataInputStream(inputStream); + + if(debug) System.out.println("Bytes left in dataInputStream " +dataInputStream.available()); + } + + public void read(byte[] idnaBytes, + char[] mappingTable) + throws IOException{ + + //Read the bytes that make up the idnaTrie + dataInputStream.read(idnaBytes); + + //Read the extra data + for(int i=0;iApM zBL<9X)QC|db~R#Mf>sO=Ahn@{QXtTZQL6-~5Tr<0iiA>@D77T}J$Evlxo^2Qt$*;O zLd1@WQ&j$=ad|De&VY|=2${1;S)|u%ui$#6_Kfxh%9r)m^}lIv>r^)lT@M>U)4I#YwTur z3*@)5+u5CL8+(vF#)jDldzSr${gwSM%74g`Gwx%5<=K1=&*k%#-|+%o#OCo5UdAhA zTxAij;@9w6zLKw(ZGJt!N#=ptH~7ub{#&BgTOqlL-;Qf9=;wFvL4F&*1NBGwZ>Bvx ziIK{=Npte&_)AiUUBX{O{Vo2EY^kd1S5J!8Fo&u?$!sK>p;^g{n~ZN5-!*=yo}stv zx2or=7wV_07posL?ob!1SE*NPFX)%4wdyKmwYpBJWvA-rsMo1C>I>BNbYnvQYB>u` zFl3LNbCG_r+7WN1c1el)BWHB#ZM;g|idFfo+Nad21M&Ll+Q&La>Y((@-}3s$y)~p? z0SVSn9kzS;6Y7&u&D&JDOKVjXeb1^dNSQj|j8A>VxYrm^-*DQt?TPEVX$F{L-_`G? 
zoKEC&Vg^pkz=;`nH_U+g_CNFa$WD%a5}4IzlVkl^?zMY2%;bq2PRzjUXF#KJ9~--j zNAdmXH^!63NXCppcg#NZD^HyNpJpJ*KEn5I`Hk5=HB$>~r${Y!mUdbaZS=_R>slUo zpLV|X0kvJbRJ+{P)D~)2MZfJg;@eDzcC}V}ymRN|m1(JVsO|1(u>b#CTctH-YW>gG zi=9FFrb27RciAOUuKp-}JmWm0P-`{jYVBI5wprhyZPB)BeMYhVy#9#(l>RgQ7NgRr zHkKMIwe7}w<9g#{<4k=@|AXGFw`vbz6_4vDrnWma|Cw9-xx{F;h2H5{>Uh7Q-z!`C zujMt}9B99{cTl3|%p}LV|5Ci>%2=!zQwMnjZ_jdo0J$I!%m)Ra5EK(Kb#Uf0lYr7g zCVJ_k!PB9sL$$mB}IKpkiRO`rv|f_Bge zuTPVmWCzNF6c=X(+*04?jcm_@f%mkA1tMS$z+N(tqfr2g0QQkl0!pQ=u@&@z0k8|~ z0eb;FPaQNyQQHs3zyUB04uJ`97);^~t^zb*fe4rba>0C10E$3~oxNERI|{QAeHgdd z3@~=H6|{kN&=ITajyk4x>D27cggYSpPV~-ms(K`6dMA4B(p0*xe73P;^<(DZXk6F( zOf}cm)pBLYYBR5h(<@8YW9BtCV&*nZOxIS{X0lJ$pDugYADYR(-Dl^MUQcS+y2o?P zB+ktKZ4Wcg+{{s=|Cuym`|WII(swdCw!EEaPMIoFe9p2Ro15(+QO}mR{%tGo)$!zL zx;eO3t}J!U>(j~4tL^klSvt>&BF)1${gQr>GY;9y}oQaTi2;OzFet{>}DHZbLP^qbIR2Dx}F`s(k3zEWHPDc zM2IgFJ-8BEH__ISIAy#~r=7gtD|KQy7O@dsO-IizrBI$46$`$+R2~_89H4#n)XOyGQzQ zYqnRXElV3);>eT7c5J;`wzXGZ#`m_Jx+8Z=M=s0c7G8~H^<*PWrsuAAeDATv^?FI? z!5*zQ(`0q&(d)^sdnLAp?ahww^d6wGJGWEP?;?Ifg&O)Lj6F;tw z>&ca6QnN?mXxbxlTCzOeey=a5?&g|2o}(vASJx{|Esxc5S1B=4yk_dQy*8=)<9lrV z_-nk@%^%|`Qha-c*5&YW)_UMN7D6zJ@NKtr+B6lz0rAiJqGiKL&NRNaxy>G+OI}AG5 z(e4o&)0M~9V)fisyx#2Esplu&k3EviS1#V(+qScJByJ8)45w$NQd-4Y#!75kS0k0L znKe@PIxCRM?@V&9SI1Av(zBE)tjKCNFH&<0aQjEx`P(uTA>@HDLL)kd^^sEum-wK460hOukYpBA+V?XWhf zE4rpzdSK=_GE;e4Eozh1YPDJIR)^JTby=IOZflFxV{Ns1tv;*Y8n6bfUDlAb#~QZw zS|jm!$+qJwSzo-&(LJV8-YN2bv`puIOy$wt5TBX*ZPD&|^DprlvK{kT<>;L|`Hnng zUox9mS>pd?NiT6dq}TOoAm5n(T@&n5y75VU3+Mq`K`-b7{a^s>0{DcX>;d?cLI0xc zMGfn&?8CQ#^9ecfD+Ql86nx@1^0+dN>k~(wP@?0WeB{W}NB*cBj>;av_1+^poZm@A z{SHWe#sc`|!wASh>lTLl=*TT{bSy90&d2qKM}Ei(At^%ryR6tAJ1dzkJJ8b!y1-_D z@i2^sVLS}u2@bMe)`xmOa=(fEnGK*m2rw>&aZT==`HY>f#Fm}K4A*quca!wf?jS}X zkLJ5D0*r+A(5Cbe&;_r|2 z`WQn7KbKSaiH|qib}GN=<(b^;j+gkX9C`9Gv-!wOD|rMdrMP=iA6b27XV!q(m9>kP zXYJ8Sh^Cd2fL2Cwv??Na4au_fd7+Z`U zvf1b*TaA8JY7CMgV~C6x!z@LHu*>+M`5CJwRiuTqlGbavS{`lG^0fl3P%F}kld7f= z`bzZ?-k^`_`}HyXfIhAt(kJx8`Xp~MG~R3kc%zX+>kL5~j6B+8uWn`5-IT%(iCH#X}n 
zdMn+cx6?jd?tobz6#IcTf z50^tb^^s-qiL3}eoHa)|kd>=UWX)G59c@;|kR7XJ$cj}nNKLBVZpG1^dAmH~^L(52`Mxc@%D0tQ`}%mDZ-CePQe?whc^hx%9lVow@y)!OZ{aU2hxuMU!uRn}zMo@OGC%mFs;HW3sR1>j=BRU2q2{W2>ikq5^nATQ zFVu_lV!cE!)ywp9y+W_ltMqEUMz7QB^#;9BZ^CaaUOehc08O9;AfC|y5YOnw|4ptp(L4LY+-!E6 zTg)DFtJ!Pznf>N~IcV-Ohs-_Zu({V9G549H=6-X`JYbHShs+7{usNAY6f0uoSaU34 zS-yZT;>+>P@d;n9FV8pMm+vd^75a*N#l8~Pn-A;d!+QC! zUOudsuNq*Td{_k^M&fG$SR)_S$cHuZb%P$z3;Mwz7y{S>zF~kp;2QxLmv0p82V>v> z7zc;I1UL*P@o#Ag&_Dp>00HtqJ}3mmpcIsYN>B~zKm%w5O`sXHfL723+Cc~C1YKY= z=muLr57-KNK|kKsmWIlMqY)9w%bN6S{(wKnFG5|RZhxLX-(MIS3=M^b{l)%Lf4RTX zUmY3^jfKYjb)m_y77m1S!ouI;Z}WHfyZqh$9)GXD-#_Rd@(=q*{Gpfpe(s0>sG>H-acrch<5I#d^G2sMRTLTw>rjhuQyy&>clt&c>; zBZsrK?1)Wx?95a*jowakUK)+08Mrp)rN?)QudwantL%R9HTHn`I(txj zlRYH9#U2*lW{-;Ru*byr*lzI?@l$b|xLy2Q{6hTl^9#hCWSO{I+#~K2+r)Npzj#1A zC>|0IlQrT|@tD|68pW^0Z^UoK@5LX)lcY&JC7uy~B3}^CiRZ-&;zjY2cv-w6UKOv2 z*Toy+P4Slan|PZnCQG6$CCe`R1zFwPOV*L~WF^^vaw9&kuSNfA(tuVYWE;rFg`a_b zEm=lZkX394sbvpCVk{X-f_1V$){;Li^U09uR^}C!l;hsibw-&2YZJ7 zi9LtN&$AcUi=>9VM3%CbNgaEIy-MoYYh)#R{pfgO;!e+yGe7R^td7l(-OKu?V_|$V zjLaFu(OKn-$rtz%(u}_#g*$;S=PURszM8D!Yxz39fp6rS_;vh?{7a;Ue>q{qEBV(+ z1OFyj!@oru`M1e>{vFc9zXy-shsPiA|KdO5KjuH-KgC~~x}E==|APOL-^uUh_rTkI zWGUZ9>iBkYEx(`C^9RUE{-B%%zKg8okC1hIh-~1GlZ|{2*~Fh7}y->cG9Q}yAm zRt1lFH_4qOcl0srWA-q+o85zzyANx(4Xd{ud*y!YmIts?9>gwv2(pLa|4~GEj6KeF zM=knhXG0bXDwt4(P8x*C?iN0FkMOhmM38M0VLZLF@$^0kPw$iQ^gabo@Au&8eJY;b zbMf>(4NvdW#TnvEake-|oGZ=~7l;MoLUED!p!kruSX?4LA})i^kHP2V@c9Y&yaGNK z!sjR9^V9HoC4625pPzxx&%)>D;PYznd9he55w&8OST0tGRbsVRE7pk(Vx!n3t`lE` z&o9B}m*Mk9wvAlNF2H?dkd4g3oo|uF%HY! 
z9|jkLVo-t^&6ZF7xrA+>OMwCy;6Mc$&_TNXdC)lx#h?Z( z0ZTzGr~}JDy8boTGk1eKqGgG!&s~rDHqaX_OQQAdtOUC$8{;mSdp+d4!Em%Jk@e^^ zb{(E5ci~xl7oNn+nXc@>y;#q(qR*vucpk4Ot1x%-(JRm8U}f|fgy)sIUTp%kU0x007A){ + return false; + } + //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A] + if( (ch==0x002D) || + (0x0030 <= ch && ch <= 0x0039) || + (0x0041 <= ch && ch <= 0x005A) || + (0x0061 <= ch && ch <= 0x007A) + ){ + return true; + } + return false; + } + + /** + * This function implements the ToASCII operation as defined in the IDNA RFC. + * This operation is done on single labels before sending it to something that expects + * ASCII names. A label is an individual part of a domain name. Labels are usually + * separated by dots; e.g." "www.example.com" is composed of 3 labels + * "www","example", and "com". + * + * @param src The input string to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points are in the input + * are treated as normal Unicode code points. + * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertToASCII(String src, int options) + throws ParseException, IOException{ + UCharacterIterator iter = UCharacterIterator.getInstance(src); + return convertToASCII(iter,options); + } + + /** + * This function implements the ToASCII operation as defined in the IDNA RFC. 
+ * This operation is done on single labels before sending it to something that expects + * ASCII names. A label is an individual part of a domain name. Labels are usually + * separated by dots; e.g., "www.example.com" is composed of 3 labels + * "www","example", and "com". + * + * @param src The input string as StringBuffer to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points in the input + * are treated as normal Unicode code points. + * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertToASCII(StringBuffer src, int options) + throws ParseException, IOException{ + UCharacterIterator iter = UCharacterIterator.getInstance(src); + return convertToASCII(iter,options); + } + + /** + * This function implements the ToASCII operation as defined in the IDNA RFC. + * This operation is done on single labels before sending it to something that expects + * ASCII names. A label is an individual part of a domain name. Labels are usually + * separated by dots; e.g., "www.example.com" is composed of 3 labels + * "www","example", and "com". 
+ * + * @param src The input string as UCharacterIterator to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points are in the input + * are treated as normal Unicode code points. + * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertToASCII(UCharacterIterator srcIter, int options) + throws ParseException, IOException{ + //load the data + loadInstance(); + + boolean[] caseFlags = null; + + // the source contains all ascii codepoints + boolean srcIsASCII = true; + // assume the source contains all LDH codepoints + boolean srcIsLDH = true; + + //get the options + boolean useSTD3ASCIIRules = (boolean)((options & USE_STD3_RULES) != 0); + + int failPos = -1; + // step 2 + StringBuffer processOut = prep.prepare(srcIter,options); + int poLen = processOut.length(); + StringBuffer dest = new StringBuffer(); + // step 3 & 4 + for(int j=0;j 0x7F){ + srcIsASCII = false; + } + // here we do not assemble surrogates + // since we know that LDH code points + // are in the ASCII range only + if(isLDHChar(ch)==false){ + srcIsLDH = false; + failPos = j; + } + } + + if(useSTD3ASCIIRules == true){ + // verify 3a and 3b + if( srcIsLDH == false /* source contains some non-LDH characters */ + || processOut.charAt(0) == HYPHEN + || processOut.charAt(processOut.length()-1) == HYPHEN){ + + /* populate the parseError struct */ + 
if(srcIsLDH==false){ + throw new ParseException( "The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR, + processOut.toString(), + (failPos>0) ? (failPos-1) : failPos); + }else if(processOut.charAt(0) == HYPHEN){ + throw new ParseException("The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0); + + }else{ + throw new ParseException("The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR, + processOut.toString(), + (poLen>0) ? poLen-1 : poLen); + + } + } + } + if(srcIsASCII){ + dest = processOut; + }else{ + // step 5 : verify the sequence does not begin with ACE prefix + if(!startsWithPrefix(processOut)){ + + //step 6: encode the sequence with punycode + caseFlags = new boolean[poLen]; + + StringBuffer punyout = Punycode.encode(processOut,caseFlags); + + // convert all codepoints to lower case ASCII + StringBuffer lowerOut = toASCIILower(punyout); + + //Step 7: prepend the ACE prefix + dest.append(ACE_PREFIX,0,ACE_PREFIX_LENGTH); + //Step 6: copy the contents in b2 into dest + dest.append(lowerOut); + }else{ + + throw new ParseException("The input does not start with the ACE Prefix.", + ParseException.ACE_PREFIX_ERROR,processOut.toString(),0); + } + } + if(dest.length() > MAX_LABEL_LENGTH){ + throw new ParseException("The labels in the input are too long. Length > 64.", + ParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0); + } + return dest; + } + + /** + * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. + * This operation is done on complete domain names, e.g: "www.example.com". + * It is important to note that this operation can fail. If it fails, then the input + * domain name cannot be used as an Internationalized Domain Name and the application + * should have methods defined to deal with the failure. 
+ * + * Note: IDNA RFC specifies that a conformant application should divide a domain name + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. This function does not offer that level of granularity. The options once + * set will apply to all labels in the domain name + * + * @param iter The input string as UCharacterIterator to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points in the input + * are treated as normal Unicode code points. + * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertIDNtoASCII(UCharacterIterator iter,int options) + throws ParseException, IOException{ + return convertIDNToASCII(iter.getText(), options); + } + + /** + * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. + * This operation is done on complete domain names, e.g., "www.example.com". + * It is important to note that this operation can fail. If it fails, then the input + * domain name cannot be used as an Internationalized Domain Name and the application + * should have methods defined to deal with the failure. + * + * Note: IDNA RFC specifies that a conformant application should divide a domain name + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. 
This function does not offer that level of granularity. The options once + * set will apply to all labels in the domain name + * + * @param str The input string as StringBuffer to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points in the input + * are treated as normal Unicode code points. + * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertIDNtoASCII(StringBuffer str,int options) + throws ParseException, IOException{ + return convertIDNToASCII(str.toString(), options); + } + + /** + * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. + * This operation is done on complete domain names, e.g., "www.example.com". + * It is important to note that this operation can fail. If it fails, then the input + * domain name cannot be used as an Internationalized Domain Name and the application + * should have methods defined to deal with the failure. + * + * Note: IDNA RFC specifies that a conformant application should divide a domain name + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. This function does not offer that level of granularity. 
The options once + * set will apply to all labels in the domain name + * + * @param src The input string to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points in the input + * are treated as normal Unicode code points. + * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertIDNToASCII(String src,int options) + throws ParseException, IOException{ + //load the data + loadInstance(); + char[] srcArr = src.toCharArray(); + StringBuffer result = new StringBuffer(); + int sepIndex=0; + int oldSepIndex=0; + for(;;){ + sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); + UCharacterIterator iter = UCharacterIterator.getInstance(new String(srcArr,oldSepIndex,sepIndex-oldSepIndex)); + result.append(convertToASCII(iter,options)); + if(sepIndex==srcArr.length){ + break; + } + // increment the sepIndex to skip past the separator + sepIndex++; + oldSepIndex = sepIndex; + result.append((char)FULL_STOP); + } + return result; + } + + + /** + * This function implements the ToUnicode operation as defined in the IDNA RFC. + * This operation is done on single labels before sending it to something that expects + * Unicode names. A label is an individual part of a domain name. Labels are usually + * separated by dots; e.g., "www.example.com" is composed of 3 labels + * "www","example", and "com". 
+ * + * @param src The input string to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points in the input + * are treated as normal Unicode code points. + * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertToUnicode(String src, int options) + throws ParseException, IOException{ + UCharacterIterator iter = UCharacterIterator.getInstance(src); + return convertToUnicode(iter,options); + } + + /** + * This function implements the ToUnicode operation as defined in the IDNA RFC. + * This operation is done on single labels before sending it to something that expects + * Unicode names. A label is an individual part of a domain name. Labels are usually + * separated by dots; e.g., "www.example.com" is composed of 3 labels + * "www","example", and "com". + * + * @param src The input string as StringBuffer to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points in the input + * are treated as normal Unicode code points. 
+ * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertToUnicode(StringBuffer src, int options) + throws ParseException, IOException{ + UCharacterIterator iter = UCharacterIterator.getInstance(src); + return convertToUnicode(iter,options); + } + + /** + * This function implements the ToUnicode operation as defined in the IDNA RFC. + * This operation is done on single labels before sending it to something that expects + * Unicode names. A label is an individual part of a domain name. Labels are usually + * separated by dots; e.g., "www.example.com" is composed of 3 labels + * "www","example", and "com". + * + * @param iter The input string as UCharacterIterator to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points in the input + * are treated as normal Unicode code points. 
+ * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertToUnicode(UCharacterIterator iter, int options) + throws ParseException, IOException{ + //load the data + loadInstance(); + + boolean[] caseFlags = null; + + // the source contains all ascii codepoints + boolean srcIsASCII = true; + // assume the source contains all LDH codepoints + boolean srcIsLDH = true; + + //get the options + boolean useSTD3ASCIIRules = (boolean)((options & USE_STD3_RULES) != 0); + + int failPos = -1; + int ch; + int saveIndex = iter.getIndex(); + // step 1: find out if all the codepoints in src are ASCII + while((ch=iter.next())!= UCharacterIterator.DONE){ + if(ch>0x7F){ + srcIsASCII = false; + } + if((srcIsLDH = isLDHChar(ch))==false){ + failPos = iter.getIndex(); + } + } + StringBuffer processOut; + + if(srcIsASCII == false){ + // step 2: process the string + iter.setIndex(saveIndex); + processOut = prep.prepare(iter,options); + + }else{ + //just point to source + processOut = new StringBuffer(iter.getText()); + } + // TODO: + // The RFC states that + // + // ToUnicode never fails. If any step fails, then the original input + // is returned immediately in that step. 
+ // + + //step 3: verify ACE Prefix + if(startsWithPrefix(processOut)){ + + //step 4: Remove the ACE Prefix + String temp = processOut.substring(ACE_PREFIX_LENGTH,processOut.length()); + + //step 5: Decode using punycode + StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp),caseFlags); + + //step 6:Apply toASCII + StringBuffer toASCIIOut = convertToASCII(decodeOut, options); + + //step 7: verify + if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){ + throw new ParseException("The verification step prescribed by the RFC 3491 failed", + ParseException.VERIFICATION_ERROR); + } + + //step 8: return output of step 5 + return decodeOut; + + }else{ + // verify that STD3 ASCII rules are satisfied + if(useSTD3ASCIIRules == true){ + if( srcIsLDH == false /* source contains some non-LDH characters */ + || processOut.charAt(0) == HYPHEN + || processOut.charAt(processOut.length()-1) == HYPHEN){ + + if(srcIsLDH==false){ + throw new ParseException("The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR,processOut.toString(), + (failPos>0) ? (failPos-1) : failPos); + }else if(processOut.charAt(0) == HYPHEN){ + throw new ParseException("The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR, + processOut.toString(),0); + + }else{ + throw new ParseException("The input does not conform to the STD 3 ASCII rules", + ParseException.STD3_ASCII_RULES_ERROR, + processOut.toString(), + processOut.length()); + + } + } + } + // just return the source + return new StringBuffer(iter.getText()); + } + } + + /** + * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. + * This operation is done on complete domain names, e.g: "www.example.com". + * + * Note: IDNA RFC specifies that a conformant application should divide a domain name + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. 
This function does not offer that level of granularity. The options once + * set will apply to all labels in the domain name + * + * @param src The input string as UCharacterIterator to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points are in the input + * are treated as normal Unicode code points. + * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertIDNToUnicode(UCharacterIterator iter, int options) + throws ParseException, IOException{ + return convertIDNToUnicode(iter.getText(), options); + } + + /** + * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. + * This operation is done on complete domain names, e.g: "www.example.com". + * + * Note: IDNA RFC specifies that a conformant application should divide a domain name + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. This function does not offer that level of granularity. The options once + * set will apply to all labels in the domain name + * + * @param src The input string as StringBuffer to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. 
+ * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points are in the input + * are treated as normal Unicode code points. + * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertIDNToUnicode(StringBuffer str, int options) + throws ParseException, IOException{ + return convertIDNToUnicode(str.toString(), options); + } + + /** + * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. + * This operation is done on complete domain names, e.g: "www.example.com". + * + * Note: IDNA RFC specifies that a conformant application should divide a domain name + * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, + * and then convert. This function does not offer that level of granularity. The options once + * set will apply to all labels in the domain name + * + * @param src The input string to be processed + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points are in the input + * are treated as normal Unicode code points. 
+ * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return StringBuffer the converted String + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + public static StringBuffer convertIDNToUnicode(String src, int options) + throws ParseException, IOException{ + + char[] srcArr = src.toCharArray(); + StringBuffer result = new StringBuffer(); + int sepIndex=0; + int oldSepIndex=0; + for(;;){ + sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); + UCharacterIterator iter = UCharacterIterator.getInstance(new String(srcArr,oldSepIndex,sepIndex-oldSepIndex)); + result.append(convertToUnicode(iter,options)); + if(sepIndex==srcArr.length){ + break; + } + // increment the sepIndex to skip past the separator + sepIndex++; + oldSepIndex =sepIndex; + result.append((char)FULL_STOP); + } + return result; + } + + /** + * Compare two IDN strings for equivalence. + * This function splits the domain names into labels and compares them. + * According to IDN RFC, whenever two labels are compared, they are + * considered equal if and only if their ASCII forms (obtained by + * applying toASCII) match using an case-insensitive ASCII comparison. + * Two domain names are considered a match if and only if all labels + * match regardless of whether label separators match. + * + * @param s1 First IDN string as StringBuffer + * @param s2 Second IDN string as StringBuffer + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. 
+ * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points are in the input + * are treated as normal Unicode code points. + * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2 + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + // TODO: optimize + public static int compare(StringBuffer s1, StringBuffer s2, int options) + throws ParseException, IOException{ + if(s1==null || s2 == null){ + throw new IllegalArgumentException("One of the source buffers is null"); + } + StringBuffer s1Out = convertIDNToASCII(s1.toString(),options); + StringBuffer s2Out = convertIDNToASCII(s2.toString(), options); + return compareCaseInsensitiveASCII(s1Out,s2Out); + } + + /** + * Compare two IDN strings for equivalence. + * This function splits the domain names into labels and compares them. + * According to IDN RFC, whenever two labels are compared, they are + * considered equal if and only if their ASCII forms (obtained by + * applying toASCII) match using an case-insensitive ASCII comparison. + * Two domain names are considered a match if and only if all labels + * match regardless of whether label separators match. + * + * @param s1 First IDN string + * @param s2 Second IDN string + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points are in the input + * are treated as normal Unicode code points. 
+ * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2 + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + // TODO: optimize + public static int compare(String s1, String s2, int options) + throws ParseException, IOException{ + if(s1==null || s2 == null){ + throw new IllegalArgumentException("One of the source buffers is null"); + } + StringBuffer s1Out = convertIDNToASCII(s1, options); + StringBuffer s2Out = convertIDNToASCII(s2, options); + return compareCaseInsensitiveASCII(s1Out,s2Out); + } + /** + * Compare two IDN strings for equivalence. + * This function splits the domain names into labels and compares them. + * According to IDN RFC, whenever two labels are compared, they are + * considered equal if and only if their ASCII forms (obtained by + * applying toASCII) match using an case-insensitive ASCII comparison. + * Two domain names are considered a match if and only if all labels + * match regardless of whether label separators match. + * + * @param s1 First IDN string as UCharacterIterator + * @param s2 Second IDN string as UCharacterIterator + * @param options A bit set of options: + * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points + * and do not use STD3 ASCII rules + * If unassigned code points are found the operation fails with + * ParseException. + * + * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations + * If this option is set, the unassigned code points are in the input + * are treated as normal Unicode code points. 
+ * + * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions + * If this option is set and the input does not satisfy STD3 rules, + * the operation will fail with ParseException + * @return 0 if the strings are equal, > 0 if i1 > i2 and < 0 if i1 < i2 + * @throws ParseException + * @throws IOException + * @draft ICU 2.8 + */ + // TODO: optimize + public static int compare(UCharacterIterator i1, UCharacterIterator i2, int options) + throws ParseException, IOException{ + if(i1==null || i2 == null){ + throw new IllegalArgumentException("One of the source buffers is null"); + } + StringBuffer s1Out = convertIDNToASCII(i1.getText(), options); + StringBuffer s2Out = convertIDNToASCII(i2.getText(), options); + return compareCaseInsensitiveASCII(s1Out,s2Out); + } +} diff --git a/icu4j/src/com/ibm/icu/stringprep/ParseException.java b/icu4j/src/com/ibm/icu/stringprep/ParseException.java new file mode 100644 index 00000000000..f8d741c9708 --- /dev/null +++ b/icu4j/src/com/ibm/icu/stringprep/ParseException.java @@ -0,0 +1,143 @@ +/* + ******************************************************************************* + * Copyright (C) 2003-2004, International Business Machines Corporation and * + * others. All Rights Reserved. 
/**
 * Exception carrying an integer error code together with the text found
 * immediately before and after the position at which the error was detected.
 *
 * Two instances are considered equal when they carry the same error code.
 *
 * @author ram
 */
public class ParseException extends Exception {

    public static final int INVALID_CHAR_FOUND      = 0;
    public static final int ILLEGAL_CHAR_FOUND      = 1;
    public static final int PROHIBITED_ERROR        = 2;
    public static final int UNASSIGNED_ERROR        = 3;
    public static final int CHECK_BIDI_ERROR        = 4;
    public static final int STD3_ASCII_RULES_ERROR  = 5;
    public static final int ACE_PREFIX_ERROR        = 6;
    public static final int VERIFICATION_ERROR      = 7;
    public static final int LABEL_TOO_LONG_ERROR    = 8;
    public static final int BUFFER_OVERFLOW_ERROR   = 9;

    /** Maximum number of characters kept as context before the error position. */
    public static final int PARSE_CONTEXT_LEN = 16;

    /** One of the error constants declared above. */
    private int error;

    /**
     * The line on which the error occurred. Stays 0 unless set through
     * setLineNumber.
     */
    private int line;

    /**
     * The character offset to the error. Stays 0 unless set through setOffset;
     * the constructors do not set it.
     */
    private int offset;

    /** Textual context before the error; empty when no context was supplied. */
    private StringBuffer preContext = new StringBuffer();

    /** Textual context from the error position onwards; empty when no context was supplied. */
    private StringBuffer postContext = new StringBuffer();

    /**
     * Creates an exception without context.
     *
     * @param message detail message
     * @param error   one of the error constants of this class
     */
    public ParseException(String message,int error){
        super(message);
        this.error = error;
    }

    /**
     * Creates an exception and records the context around the error.
     *
     * @param message detail message
     * @param error   one of the error constants of this class
     * @param rules   the text in which the error was found; may be null
     * @param pos     index into rules at which the error was detected; values
     *                outside the text are clamped to its bounds
     */
    public ParseException(String message,int error, String rules, int pos){
        super(message);
        this.error = error;
        setContext(rules,pos);
    }

    /**
     * @return the error code passed to the constructor
     */
    public int getError(){
        return error;
    }

    /**
     * @return true if other is a ParseException with the same error code
     */
    public boolean equals(Object other){
        if(!(other instanceof ParseException)){
            return false;
        }
        return ((ParseException)other).error == this.error;
    }

    /**
     * Hash code consistent with equals: derived from the error code only.
     */
    public int hashCode(){
        return error;
    }

    public String toString(){
        StringBuffer buf = new StringBuffer();
        buf.append(super.getMessage());
        buf.append(". preContext:  ");
        buf.append(preContext);
        buf.append(". postContext: ");
        buf.append(postContext);
        buf.append("\n");
        return buf.toString();
    }

    public void setOffset(int offset){
        this.offset = offset;
    }
    public int getOffset(){
        return offset;
    }
    public int getLineNumber(){
        return line;
    }

    /**
     * Stores the line number.
     *
     * @param lineNumber the line on which the error occurred
     * @return the value now stored
     */
    public int setLineNumber(int lineNumber){
        // the argument used to be ignored, so the stored line never changed
        line = lineNumber;
        return line;
    }
    public String getPreContext(){
        return preContext.toString();
    }
    public String getPostContext(){
        return postContext.toString();
    }

    public void setPreContext(String str, int pos){
        setPreContext(toChars(str),pos);
    }

    /**
     * Records up to PARSE_CONTEXT_LEN characters that precede pos.
     *
     * @param str the text in which the error was found; null is treated as empty
     * @param pos error position; clamped to [0, str.length]
     */
    public void setPreContext(char[] str, int pos){
        char[] text = (str == null) ? new char[0] : str;
        int end = clamp(pos, text.length);
        int start = Math.max(0, end - PARSE_CONTEXT_LEN);
        preContext.setLength(0);
        preContext.append(text,start,end - start);
    }

    public void setPostContext(String str, int pos){
        setPostContext(toChars(str),pos);
    }

    /**
     * Records the text from pos to the end of str.
     *
     * @param str the text in which the error was found; null is treated as empty
     * @param pos error position; clamped to [0, str.length]
     */
    public void setPostContext(char[] str, int pos){
        char[] text = (str == null) ? new char[0] : str;
        int start = clamp(pos, text.length);
        postContext.setLength(0);
        postContext.append(text,start,text.length - start);
    }

    public void setContext(char[]str,int pos){
        setPreContext(str,pos);
        setPostContext(str,pos);
    }
    public void setContext(String str,int pos){
        setPreContext(str,pos);
        setPostContext(str,pos);
    }

    /** Forces pos into the range [0, length]. */
    private static int clamp(int pos, int length){
        if(pos < 0){
            return 0;
        }
        return (pos > length) ? length : pos;
    }

    /** Converts a possibly null String to a char array. */
    private static char[] toChars(String str){
        return (str == null) ? new char[0] : str.toCharArray();
    }
}
+ */ +/* Package Private class */ +final class Punycode { + + /* Punycode parameters for Bootstring */ + private static final int BASE = 36; + private static final int TMIN = 1; + private static final int TMAX = 26; + private static final int SKEW = 38; + private static final int DAMP = 700; + private static final int INITIAL_BIAS = 72; + private static final int INITIAL_N = 0x80; + + /* "Basic" Unicode/ASCII code points */ + private static final int HYPHEN = 0x2d; + private static final int DELIMITER = HYPHEN; + + private static final int ZERO = 0x30; + private static final int NINE = 0x39; + + private static final int SMALL_A = 0x61; + private static final int SMALL_Z = 0x7a; + + private static final int CAPITAL_A = 0x41; + private static final int CAPITAL_Z = 0x5a; + private static final int MAX_CP_COUNT = 200; + private static final int UINT_MAGIC = 0x80000000; + private static final long ULONG_MAGIC = 0x8000000000000000L; + + private static int adaptBias(int delta, int length, boolean firstTime){ + if(firstTime){ + delta /=DAMP; + }else{ + delta /= 2; + } + delta += delta/length; + + int count=0; + for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { + delta/=(BASE-TMIN); + } + + return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); + } + + /** + * basicToDigit[] contains the numeric value of a basic code + * point (for use in representing integers) in the range 0 to + * BASE-1, or -1 if b is does not represent a value. 
+ */ + static final int[] basicToDigit= new int[]{ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + + + private static char asciiCaseMap(char b, boolean uppercase) { + if(uppercase) { + if(SMALL_A<=b && b<=SMALL_Z) { + b-=(SMALL_A-CAPITAL_A); + } + } else { + if(CAPITAL_A<=b && b<=CAPITAL_Z) { + b+=(SMALL_A-CAPITAL_A); + } + } + return b; + } + + /** + * digitToBasic() returns the basic code point whose value + * (when used for representing integers) is d, which must be in the + * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is + * nonzero, in which case the uppercase form is used. 
+ */ + private static char digitToBasic(int digit, boolean uppercase) { + /* 0..25 map to ASCII a..z or A..Z */ + /* 26..35 map to ASCII 0..9 */ + if(digit<26) { + if(uppercase) { + return (char)(CAPITAL_A+digit); + } else { + return (char)(SMALL_A+digit); + } + } else { + return (char)((ZERO-26)+digit); + } + } + + public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{ + + int[] cpBuffer = new int[MAX_CP_COUNT]; + int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount; + char c, c2; + int srcLength = src.length(); + int destCapacity = MAX_CP_COUNT; + char[] dest = new char[destCapacity]; + StringBuffer result = new StringBuffer(); + /* + * Handle the basic code points and + * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): + */ + srcCPCount=destLength=0; + + for(j=0; j0) { + if(destLength state to , but guard against overflow: + */ + if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) { + throw new RuntimeException("Internal program error"); + } + delta+=(m-n)*(handledCPCount+1); + n=m; + + /* Encode a sequence of same code points n */ + for(j=0; jTMAX) { + t=TMAX; + } + */ + + t=k-bias; + if(t=(bias+TMAX)) { + t=TMAX; + } + + if(q= CAPITAL_Z); + } + private static boolean isSurrogate(int ch){ + return (((ch)&0xfffff800)==0xd800); + } + public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) + throws ParseException{ + int srcLength = src.length(); + StringBuffer result = new StringBuffer(); + int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t, + destCPCount, firstSupplementaryIndex, cpLength; + char b; + int destCapacity = MAX_CP_COUNT; + char[] dest = new char[destCapacity]; + + /* + * Handle the basic code points: + * Let basicLength be the number of input code points + * before the last delimiter, or 0 if there is none, + * then copy the first basicLength code points to the output. 
+ * + * The two following loops iterate backward. + */ + for(j=srcLength; j>0;) { + if(src.charAt(--j)==DELIMITER) { + break; + } + } + destLength=basicLength=destCPCount=j; + + while(j>0) { + b=src.charAt(--j); + if(!isBasic(b)) { + throw new ParseException("Illegal char found", ParseException.INVALID_CHAR_FOUND); + } + + if(j0 ? basicLength+1 : 0; in=srcLength) { + throw new ParseException("Illegal char found", ParseException.ILLEGAL_CHAR_FOUND); + } + + digit=basicToDigit[(byte)src.charAt(in++)]; + if(digit<0) { + throw new ParseException("Invalid char found", ParseException.INVALID_CHAR_FOUND); + } + if(digit>(0x7fffffff-i)/w) { + /* integer overflow */ + throw new ParseException("Illegal char found", ParseException.ILLEGAL_CHAR_FOUND); + } + + i+=digit*w; + t=k-bias; + if(t=(bias+TMAX)) { + t=TMAX; + } + if(digit0x7fffffff/(BASE-t)) { + /* integer overflow */ + throw new ParseException("Illegal char found", ParseException.ILLEGAL_CHAR_FOUND); + } + w*=BASE-t; + } + + /* + * Modification from sample code: + * Increments destCPCount here, + * where needed instead of in for() loop tail. 
+ */ + ++destCPCount; + bias=adaptBias(i-oldi, destCPCount, (oldi==0)); + + /* + * i was supposed to wrap around from (incremented) destCPCount to 0, + * incrementing n each time, so we'll fix that now: + */ + if(i/destCPCount>(0x7fffffff-n)) { + /* integer overflow */ + throw new ParseException("Illegal char found", ParseException.ILLEGAL_CHAR_FOUND); + } + + n+=i/destCPCount; + i%=destCPCount; + /* not needed for Punycode: */ + /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ + + if(n>0x10ffff || isSurrogate(n)) { + /* Unicode code point overflow */ + throw new ParseException("Illegal char found", ParseException.ILLEGAL_CHAR_FOUND); + } + + /* Insert n at position i of the output: */ + cpLength=UTF16.getCharCount(n); + if((destLength+cpLength)1) { + firstSupplementaryIndex=codeUnitIndex; + } else { + ++firstSupplementaryIndex; + } + } else { + codeUnitIndex=firstSupplementaryIndex; + codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex); + } + + /* use the UChar index codeUnitIndex instead of the code point index i */ + if(codeUnitIndexPreferences>Java>Code Generation>Code and Comments + */ +public class StringPrep { + /** + * Option to prohibit processing of unassigned code points in the input + * + * @see usprep_prepare + * @draft ICU 2.8 + */ + public static final int NONE = 0x0000; + + /** + * Option to allow processing of unassigned code points in the input + * + * @see usprep_prepare + * @draft ICU 2.8 + */ + public static final int ALLOW_UNASSIGNED = 0x0001; + + private static final int UNASSIGNED = 0x0000; + private static final int MAP = 0x0001; + private static final int PROHIBITED = 0x0002; + private static final int LABEL_SEPARATOR = 0x0003; + private static final int DELETE = 0x0004; + private static final int TYPE_LIMIT = 0x0005; + + private static final int NORMALIZATION_ON = 0x0001; + private static final int CHECK_BIDI_ON = 0x0002; + + private static final int TYPE_THRESHOLD = 0xFFF0; + 
private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/ + private static final int MAX_INDEX_TOP_LENGTH = 0x0003; + + /* indexes[] value names */ + private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */ + private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */ + private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */ + private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */ + private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */ + private static final int THREE_UCHARS_MAPPING_INDEX_START = 5; + private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6; + private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */ + private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */ + + + /** + * Default buffer size of datafile + */ + private static final int DATA_BUFFER_SIZE = 25000; + + /* Wrappers for Trie implementations */ + private static final class StringPrepTrieImpl implements Trie.DataManipulate{ + static CharTrie sprepTrie = null; + /** + * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's + * data the index array offset of the indexes for that lead surrogate. 
+ * @param property data value for a surrogate from the trie, including + * the folding offset + * @return data offset or 0 if there is no data for the lead surrogate + */ + public int getFoldingOffset(int value){ + return value; + } + } + + private static StringPrepTrieImpl sprepTrieImpl; + private static int[] indexes; + private static char[] mappingData; + private static byte[] formatVersion; + + private char getCodePointValue(int ch){ + return StringPrepTrieImpl.sprepTrie.getCodePointValue(ch); + } + + //protected + private boolean doNFKC = false; + private boolean checkBiDi = false; + + private VersionInfo unicodeVersion; + private VersionInfo normVersion; + + + private static VersionInfo getVersionInfo(int comp){ + int micro = comp & 0xFF; + int milli =(comp >> 8) & 0xFF; + int minor =(comp >> 16) & 0xFF; + int major =(comp >> 24) & 0xFF; + return VersionInfo.getInstance(major,minor,milli,micro); + } + private static VersionInfo getVersionInfo(byte[] version){ + if(version.length != 4){ + return null; + } + return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]); + } + + private StringPrep(InputStream inputStream) throws IOException{ + + BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE); + + StringPrepDataReader reader = new StringPrepDataReader(b); + + // read the indexes + indexes = reader.readIndexes(INDEX_TOP); + + byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]]; + + sprepTrieImpl = new StringPrepTrieImpl(); + //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes + mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2]; + // load the rest of the data data and initialize the data members + reader.read(sprepBytes,mappingData); + + StringPrepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl ); + + // get the data format version + formatVersion = reader.getDataFormatVersion(); + + // get the options + doNFKC = 
((indexes[OPTIONS] & NORMALIZATION_ON) > 0); + checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0); + unicodeVersion = getVersionInfo(reader.getUnicodeVersion()); + normVersion = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]); + if(normVersion.compareTo(UCharacter.getUnicodeVersion())>0){ + throw new IOException("Normalization Correction version not supported"); + } + b.close(); + } + /** + * Returns the StringPrep instance created after reading the input stream. + * The object does not hold a reference to the input steam, so the stream can be + * closed after the method returns. + * + * @param inputStream The stream for reading the StringPrep profile binary + * @return StringPrep object created from the input stream + * @throws IOException + * @draft ICU 2.8 + */ + public static final StringPrep getInstance(InputStream inputStream) + throws IOException{ + + StringPrep prep = null; + // load the file and create the object + prep = new StringPrep(inputStream); + + return prep; + } + + private class Values{ + boolean isIndex; + int value; + int type; + } + + private static final void getValues(char trieWord,Values values){ + + if(trieWord == 0){ + /* + * Initial value stored in the mapping table + * just return USPREP_TYPE_LIMIT .. 
so that + * the source codepoint is copied to the destination + */ + values.type = TYPE_LIMIT; + }else if(trieWord >= TYPE_THRESHOLD){ + values.type = (trieWord - TYPE_THRESHOLD); + }else{ + /* get the type */ + values.type = MAP; + /* ascertain if the value is index or delta */ + if((trieWord & 0x02)>0){ + values.isIndex = true; + values.value = trieWord >> 2; //mask off the lower 2 bits and shift + + }else{ + values.isIndex = false; + values.value = ((int)(trieWord<<16))>>16; + values.value = (values.value >> 2); + + } + + if((trieWord>>2) == MAX_INDEX_VALUE){ + values.type = DELETE; + values.isIndex = false; + values.value = 0; + } + } + } + + + + private StringBuffer map( UCharacterIterator iter, int options) + throws ParseException{ + + Values val = new Values(); + char result = 0; + int ch = UCharacterIterator.DONE; + StringBuffer dest = new StringBuffer(); + boolean allowUnassigned = (boolean) ((options & ALLOW_UNASSIGNED)>0); + + while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ + + result = getCodePointValue(ch); + getValues(result,val); + + // check if the source codepoint is unassigned + if(val.type == UNASSIGNED && allowUnassigned == false){ + throw new ParseException("An unassigned code point was found in the input", + ParseException.UNASSIGNED_ERROR, + iter.getText(),iter.getIndex()); + }else if((val.type == MAP)){ + int index, length; + + if(val.isIndex){ + index = val.value; + if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] && + index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){ + length = 1; + }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] && + index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){ + length = 2; + }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] && + index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){ + length = 3; + }else{ + length = mappingData[index++]; + } + /* copy mapping to destination */ + dest.append(mappingData,index,length); + continue; + + }else{ + ch -= val.value; + } + }else if(val.type 
== DELETE){ + // just consume the codepoint and contine + continue; + } + //copy the source into destination + UTF16.append(dest,ch); + } + + return dest; + } + + + private StringBuffer normalize(StringBuffer src){ + return new StringBuffer(Normalizer.normalize(src.toString(),Normalizer.NFKC,Normalizer.UNICODE_3_2)); + } + + protected boolean isLabelSeparator(int ch){ + int result = getCodePointValue(ch); + if( (result & 0x07) == LABEL_SEPARATOR){ + return true; + } + return false; + } + + /* + 1) Map -- For each character in the input, check if it has a mapping + and, if so, replace it with its mapping. + + 2) Normalize -- Possibly normalize the result of step 1 using Unicode + normalization. + + 3) Prohibit -- Check for any characters that are not allowed in the + output. If any are found, return an error. + + 4) Check bidi -- Possibly check for right-to-left characters, and if + any are found, make sure that the whole string satisfies the + requirements for bidirectional strings. If the string does not + satisfy the requirements for bidirectional strings, return an + error. + [Unicode3.2] defines several bidirectional categories; each character + has one bidirectional category assigned to it. For the purposes of + the requirements below, an "RandALCat character" is a character that + has Unicode bidirectional categories "R" or "AL"; an "LCat character" + is a character that has Unicode bidirectional category "L". Note + + + that there are many characters which fall in neither of the above + definitions; Latin digits ( through ) are examples of + this because they have bidirectional category "EN". + + In any profile that specifies bidirectional character handling, all + three of the following requirements MUST be met: + + 1) The characters in section 5.8 MUST be prohibited. + + 2) If a string contains any RandALCat character, the string MUST NOT + contain any LCat character. 
3) If a string contains any RandALCat character, a RandALCat
          character MUST be the first character of the string, and a
          RandALCat character MUST be the last character of the string.
    */
    /**
     * Prepare the input buffer for use in applications with the given profile.
     * This operation maps, normalizes (NFKC), and checks for prohibited and
     * BiDi characters, in the order defined by RFC 3454, depending on the
     * options specified in the profile.
     *
     * @param src     A UCharacterIterator object containing the source string
     * @param options A bit set of options:
     *
     *  - StringPrep.NONE               Prohibit processing of unassigned code
     *                                  points in the input
     *
     *  - StringPrep.ALLOW_UNASSIGNED   Treat unassigned code points in the
     *                                  input as normal Unicode code points.
     *
     * @return StringBuffer A StringBuffer containing the output
     * @throws ParseException if the input contains an unassigned code point
     *         that is not allowed, a prohibited code point, or (when the
     *         profile checks BiDi) violates the bidirectional requirements.
     *         For prohibited and BiDi errors the reported offset refers to the
     *         mapped (and, if enabled, normalized) text.
     * @draft ICU 2.8
     */
    public StringBuffer prepare(UCharacterIterator src, int options)
            throws ParseException {

        // step 1: map
        StringBuffer mapOut = map(src, options);

        // step 2: normalize, only if the profile asks for it
        StringBuffer normOut = doNFKC ? normalize(mapOut) : mapOut;

        UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
        Values val = new Values();
        // CHAR_DIRECTION_COUNT serves as the "nothing seen yet" marker
        int direction = UCharacterDirection.CHAR_DIRECTION_COUNT;
        int firstCharDir = UCharacterDirection.CHAR_DIRECTION_COUNT;
        int rtlPos = -1;
        int ltrPos = -1;
        boolean rightToLeft = false;
        boolean leftToRight = false;
        int ch;

        while ((ch = iter.nextCodePoint()) != UCharacterIterator.DONE) {
            getValues(getCodePointValue(ch), val);

            // step 3: prohibit
            if (val.type == PROHIBITED) {
                // The iterator already sits past the offending code point, so
                // step back over it to report where it starts. (val.value is
                // not an offset and must not be used as the error position.)
                throw new ParseException("A prohibited code point was found in the input",
                                         ParseException.PROHIBITED_ERROR,
                                         iter.getText(),
                                         iter.getIndex() - UTF16.getCharCount(ch));
            }

            // The directional bookkeeping is only consumed by the BiDi check.
            if (checkBiDi) {
                direction = UCharacter.getDirection(ch);
                if (firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT) {
                    firstCharDir = direction;
                }
                if (direction == UCharacterDirection.LEFT_TO_RIGHT) {
                    leftToRight = true;
                    ltrPos = iter.getIndex() - 1;
                }
                if (isRandALCat(direction)) {
                    rightToLeft = true;
                    rtlPos = iter.getIndex() - 1;
                }
            }
        }

        // step 4: check bidi
        if (checkBiDi) {
            // requirement 2: RandALCat and LCat characters must not be mixed
            if (leftToRight && rightToLeft) {
                throw new ParseException("The input does not conform to the rules for BiDi code points.",
                                         ParseException.CHECK_BIDI_ERROR, iter.getText(),
                                         Math.max(rtlPos, ltrPos));
            }

            // requirement 3: if any RandALCat character is present, both the
            // first and the last character must be RandALCat; after the loop
            // 'direction' holds the direction of the last character
            if (rightToLeft && !(isRandALCat(firstCharDir) && isRandALCat(direction))) {
                throw new ParseException("The input does not conform to the rules for BiDi code points.",
                                         ParseException.CHECK_BIDI_ERROR, iter.getText(),
                                         Math.max(rtlPos, ltrPos));
            }
        }
        return normOut;
    }

    /** Returns true if the direction value is RIGHT_TO_LEFT or RIGHT_TO_LEFT_ARABIC. */
    private static boolean isRandALCat(int direction) {
        return direction == UCharacterDirection.RIGHT_TO_LEFT
                || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC;
    }
}