ICU-2093 Update grapheme cluster rules to latest Unicode TR

X-SVN-Rev: 11461
This commit is contained in:
Andy Heninger 2003-04-04 23:41:03 +00:00
parent 403e3c64a2
commit 9e3648ad6c
2 changed files with 19 additions and 42 deletions

View file

@ -17,51 +17,26 @@ $CR = \r;
$LF = \n;
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
# Note on $Extend: Earlier versions of TR29 included Mc characters.
# To avoid test breakage, Mc is still included for the time being.
# $Extend = [[:Mn:] [:Me:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
$Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
$Extend = [[:Grapheme_Extend = TRUE:]];
#
# Korean Syllable Definitions
#
$L = [\u1100-\u115f];
$V = [\u1160-\u11a2];
$T = [\u11a8-\u11f9];
$L = [:Hangul_Syllable_Type = L:];
$V = [:Hangul_Syllable_Type = V:];
$T = [:Hangul_Syllable_Type = T:];
$LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \uad18 \uad34 \uad50 \uad6c \uad88 \uada4
\uadc0 \uaddc \uadf8 \uae14 \uae30 \uae4c \uae68 \uae84 \uaea0 \uaebc \uaed8 \uaef4 \uaf10 \uaf2c \uaf48 \uaf64
\uaf80 \uaf9c \uafb8 \uafd4 \uaff0 \ub00c \ub028 \ub044 \ub060 \ub07c \ub098 \ub0b4 \ub0d0 \ub0ec \ub108 \ub124
\ub140 \ub15c \ub178 \ub194 \ub1b0 \ub1cc \ub1e8 \ub204 \ub220 \ub23c \ub258 \ub274 \ub290 \ub2ac \ub2c8 \ub2e4
\ub300 \ub31c \ub338 \ub354 \ub370 \ub38c \ub3a8 \ub3c4 \ub3e0 \ub3fc \ub418 \ub434 \ub450 \ub46c \ub488 \ub4a4
\ub4c0 \ub4dc \ub4f8 \ub514 \ub530 \ub54c \ub568 \ub584 \ub5a0 \ub5bc \ub5d8 \ub5f4 \ub610 \ub62c \ub648 \ub664
\ub680 \ub69c \ub6b8 \ub6d4 \ub6f0 \ub70c \ub728 \ub744 \ub760 \ub77c \ub798 \ub7b4 \ub7d0 \ub7ec \ub808 \ub824
\ub840 \ub85c \ub878 \ub894 \ub8b0 \ub8cc \ub8e8 \ub904 \ub920 \ub93c \ub958 \ub974 \ub990 \ub9ac \ub9c8 \ub9e4
\uba00 \uba1c \uba38 \uba54 \uba70 \uba8c \ubaa8 \ubac4 \ubae0 \ubafc \ubb18 \ubb34 \ubb50 \ubb6c \ubb88 \ubba4
\ubbc0 \ubbdc \ubbf8 \ubc14 \ubc30 \ubc4c \ubc68 \ubc84 \ubca0 \ubcbc \ubcd8 \ubcf4 \ubd10 \ubd2c \ubd48 \ubd64
\ubd80 \ubd9c \ubdb8 \ubdd4 \ubdf0 \ube0c \ube28 \ube44 \ube60 \ube7c \ube98 \ubeb4 \ubed0 \ubeec \ubf08 \ubf24
\ubf40 \ubf5c \ubf78 \ubf94 \ubfb0 \ubfcc \ubfe8 \uc004 \uc020 \uc03c \uc058 \uc074 \uc090 \uc0ac \uc0c8 \uc0e4
\uc100 \uc11c \uc138 \uc154 \uc170 \uc18c \uc1a8 \uc1c4 \uc1e0 \uc1fc \uc218 \uc234 \uc250 \uc26c \uc288 \uc2a4
\uc2c0 \uc2dc \uc2f8 \uc314 \uc330 \uc34c \uc368 \uc384 \uc3a0 \uc3bc \uc3d8 \uc3f4 \uc410 \uc42c \uc448 \uc464
\uc480 \uc49c \uc4b8 \uc4d4 \uc4f0 \uc50c \uc528 \uc544 \uc560 \uc57c \uc598 \uc5b4 \uc5d0 \uc5ec \uc608 \uc624
\uc640 \uc65c \uc678 \uc694 \uc6b0 \uc6cc \uc6e8 \uc704 \uc720 \uc73c \uc758 \uc774 \uc790 \uc7ac \uc7c8 \uc7e4
\uc800 \uc81c \uc838 \uc854 \uc870 \uc88c \uc8a8 \uc8c4 \uc8e0 \uc8fc \uc918 \uc934 \uc950 \uc96c \uc988 \uc9a4
\uc9c0 \uc9dc \uc9f8 \uca14 \uca30 \uca4c \uca68 \uca84 \ucaa0 \ucabc \ucad8 \ucaf4 \ucb10 \ucb2c \ucb48 \ucb64
\ucb80 \ucb9c \ucbb8 \ucbd4 \ucbf0 \ucc0c \ucc28 \ucc44 \ucc60 \ucc7c \ucc98 \uccb4 \uccd0 \uccec \ucd08 \ucd24
\ucd40 \ucd5c \ucd78 \ucd94 \ucdb0 \ucdcc \ucde8 \uce04 \uce20 \uce3c \uce58 \uce74 \uce90 \uceac \ucec8 \ucee4
\ucf00 \ucf1c \ucf38 \ucf54 \ucf70 \ucf8c \ucfa8 \ucfc4 \ucfe0 \ucffc \ud018 \ud034 \ud050 \ud06c \ud088 \ud0a4
\ud0c0 \ud0dc \ud0f8 \ud114 \ud130 \ud14c \ud168 \ud184 \ud1a0 \ud1bc \ud1d8 \ud1f4 \ud210 \ud22c \ud248 \ud264
\ud280 \ud29c \ud2b8 \ud2d4 \ud2f0 \ud30c \ud328 \ud344 \ud360 \ud37c \ud398 \ud3b4 \ud3d0 \ud3ec \ud408 \ud424
\ud440 \ud45c \ud478 \ud494 \ud4b0 \ud4cc \ud4e8 \ud504 \ud520 \ud53c \ud558 \ud574 \ud590 \ud5ac \ud5c8 \ud5e4
\ud600 \ud61c \ud638 \ud654 \ud670 \ud68c \ud6a8 \ud6c4 \ud6e0 \ud6fc \ud718 \ud734 \ud750 \ud76c \ud788 ];
$LVT = [[\uac00-\ud7a3] - $LV];
$LV = [:Hangul_Syllable_Type = LV:];
$LVT = [:Hangul_Syllable_Type = LVT:];
$HungulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
$Control;
$CR $LF;
([^$Control] | $L+ | $T+ | $L* ($LV? $V+ | $LV | $LVT) $T*) $Extend*;
([^$Control] | $HungulSyllable) $Extend*;
.;
#
# Reverse Rule, back up to the beginning of some preceding grapheme cluster.
#
! ($Extend | $V | $T )* ($LF $CR | ($LV | $LVT)*$L* | .)?;
! ($Extend | $V | $T )* ($LF $CR | ($LV | $LVT)?$L* | .);

View file

@ -270,13 +270,15 @@ void RBBITest::TestDefaultRuleBasedCharacterIteration()
ADD_DATACHUNK(chardata, "\\u0906", 0, status); //devanagiri AA
//ADD_DATACHUNK(chardata, "\\u093e\\u0901", 0); //devanagari vowelsign AA + chandrabindu
ADD_DATACHUNK(chardata, "\\u0906\\u0901", 0, status); // Devanagari AA + chandrabindu
ADD_DATACHUNK(chardata, "\\u0915\\u093e\\u0901", 0, status); // Devanagari KA + AA vowelsign + chandrabindu
ADD_DATACHUNK(chardata, "\\u0915", 0, status); // Devanagari KA + AA vowelsign + chandrabindu
ADD_DATACHUNK(chardata, "\\u093e\\u0901", 0, status); // Devanagari KA + AA vowelsign + chandrabindu
ADD_DATACHUNK(chardata, "\\u0916\\u0947", 0, status); //devanagiri KHA+vowelsign E
ADD_DATACHUNK(chardata, "\\u0938\\u0941\\u0902", 0, status); //devanagiri SA+vowelsign U + anusvara(bindu)
ADD_DATACHUNK(chardata, "\\u0926", 0, status); //devanagiri consonant DA
ADD_DATACHUNK(chardata, "\\u0930", 0, status); //devanagiri consonant RA
ADD_DATACHUNK(chardata, "\\u0939\\u094c", 0, status); //devanagiri HA+vowel sign AI
ADD_DATACHUNK(chardata, "\\u0939", 0, status); //devanagiri HA+vowel sign AI
ADD_DATACHUNK(chardata, "\\u094c", 0, status); //devanagiri HA+vowel sign AI
ADD_DATACHUNK(chardata, "\\u0964", 0, status); //devanagiri danda
//end hindi characters
ADD_DATACHUNK(chardata, "A\\u0302", 0, status); //circumflexA
@ -948,20 +950,20 @@ void RBBITest::TestHindiCharacterBreak()
// Hindi character break should make sure that it
// doesn't break between a vowel sign and a chandrabindu.
// TODO: Rules need some fixing. As currently written, they'll correctly recognize this combination
// as part of a legitimate character, but not when it appears stand-alone.
ADD_DATACHUNK(hindicharData, "\\u000a", 0, status); // Force break so following can appear stand-alone.
ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status); //devanagari vowelsign AA+ chandrabindu
ADD_DATACHUNK(hindicharData, "\\u0906\\u0901", 0, status); // Devanagari AA + chandrabindu
ADD_DATACHUNK(hindicharData, "\\u0915\\u093e\\u0901", 0, status); // Devanagari KA + AA vowelsign + chandrabindu
ADD_DATACHUNK(hindicharData, "\\u0915", 0, status); // Devanagari KA
ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status); // Devanagari AA vowelsign + chandrabindu
ADD_DATACHUNK(hindicharData, "\\u0916\\u0947", 0, status); //devanagari KHA+vowelsign E
ADD_DATACHUNK(hindicharData, "\\u0938\\u0941\\u0902", 0, status); //devanagari SA+vowelsign U + anusvara(bindu)
ADD_DATACHUNK(hindicharData, "\\u0926", 0, status); //devanagari consonant DA
ADD_DATACHUNK(hindicharData, "\\u0930", 0, status); //devanagari consonant RA
ADD_DATACHUNK(hindicharData, "\\u0939\\u094c", 0, status); //devanagari consonant HA+dependent vowel sign AI
ADD_DATACHUNK(hindicharData, "\\u0939", 0, status); //devanagari consonant HA+
ADD_DATACHUNK(hindicharData, "\\u094c", 0, status); // +dependent vowel sign AI
ADD_DATACHUNK(hindicharData, "\\u0964", 0, status); //devanagari danda
ADD_DATACHUNK(hindicharData, "\\u0950", 0, status); //devanagari OM
ADD_DATACHUNK(hindicharData, "\\u0915\\u0943", 0, status); //devanagari KA+dependent vowel RI->KRI