mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-20 20:19:32 +00:00
ICU-2093 Update grapheme cluster rules to latest Unicode TR
X-SVN-Rev: 11461
This commit is contained in:
parent
403e3c64a2
commit
9e3648ad6c
2 changed files with 19 additions and 42 deletions
|
@ -17,51 +17,26 @@ $CR = \r;
|
|||
$LF = \n;
|
||||
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
|
||||
|
||||
# Note on $Extend: Earlier versions of TR29 included Mc characters.
|
||||
# To avoid test breakage, Mc is still included for the time being.
|
||||
# $Extend = [[:Mn:] [:Me:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
|
||||
$Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
#
|
||||
# Korean Syllable Definitions
|
||||
#
|
||||
$L = [\u1100-\u115f];
|
||||
$V = [\u1160-\u11a2];
|
||||
$T = [\u11a8-\u11f9];
|
||||
$L = [:Hangul_Syllable_Type = L:];
|
||||
$V = [:Hangul_Syllable_Type = V:];
|
||||
$T = [:Hangul_Syllable_Type = T:];
|
||||
|
||||
$LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \uad18 \uad34 \uad50 \uad6c \uad88 \uada4
|
||||
\uadc0 \uaddc \uadf8 \uae14 \uae30 \uae4c \uae68 \uae84 \uaea0 \uaebc \uaed8 \uaef4 \uaf10 \uaf2c \uaf48 \uaf64
|
||||
\uaf80 \uaf9c \uafb8 \uafd4 \uaff0 \ub00c \ub028 \ub044 \ub060 \ub07c \ub098 \ub0b4 \ub0d0 \ub0ec \ub108 \ub124
|
||||
\ub140 \ub15c \ub178 \ub194 \ub1b0 \ub1cc \ub1e8 \ub204 \ub220 \ub23c \ub258 \ub274 \ub290 \ub2ac \ub2c8 \ub2e4
|
||||
\ub300 \ub31c \ub338 \ub354 \ub370 \ub38c \ub3a8 \ub3c4 \ub3e0 \ub3fc \ub418 \ub434 \ub450 \ub46c \ub488 \ub4a4
|
||||
\ub4c0 \ub4dc \ub4f8 \ub514 \ub530 \ub54c \ub568 \ub584 \ub5a0 \ub5bc \ub5d8 \ub5f4 \ub610 \ub62c \ub648 \ub664
|
||||
\ub680 \ub69c \ub6b8 \ub6d4 \ub6f0 \ub70c \ub728 \ub744 \ub760 \ub77c \ub798 \ub7b4 \ub7d0 \ub7ec \ub808 \ub824
|
||||
\ub840 \ub85c \ub878 \ub894 \ub8b0 \ub8cc \ub8e8 \ub904 \ub920 \ub93c \ub958 \ub974 \ub990 \ub9ac \ub9c8 \ub9e4
|
||||
\uba00 \uba1c \uba38 \uba54 \uba70 \uba8c \ubaa8 \ubac4 \ubae0 \ubafc \ubb18 \ubb34 \ubb50 \ubb6c \ubb88 \ubba4
|
||||
\ubbc0 \ubbdc \ubbf8 \ubc14 \ubc30 \ubc4c \ubc68 \ubc84 \ubca0 \ubcbc \ubcd8 \ubcf4 \ubd10 \ubd2c \ubd48 \ubd64
|
||||
\ubd80 \ubd9c \ubdb8 \ubdd4 \ubdf0 \ube0c \ube28 \ube44 \ube60 \ube7c \ube98 \ubeb4 \ubed0 \ubeec \ubf08 \ubf24
|
||||
\ubf40 \ubf5c \ubf78 \ubf94 \ubfb0 \ubfcc \ubfe8 \uc004 \uc020 \uc03c \uc058 \uc074 \uc090 \uc0ac \uc0c8 \uc0e4
|
||||
\uc100 \uc11c \uc138 \uc154 \uc170 \uc18c \uc1a8 \uc1c4 \uc1e0 \uc1fc \uc218 \uc234 \uc250 \uc26c \uc288 \uc2a4
|
||||
\uc2c0 \uc2dc \uc2f8 \uc314 \uc330 \uc34c \uc368 \uc384 \uc3a0 \uc3bc \uc3d8 \uc3f4 \uc410 \uc42c \uc448 \uc464
|
||||
\uc480 \uc49c \uc4b8 \uc4d4 \uc4f0 \uc50c \uc528 \uc544 \uc560 \uc57c \uc598 \uc5b4 \uc5d0 \uc5ec \uc608 \uc624
|
||||
\uc640 \uc65c \uc678 \uc694 \uc6b0 \uc6cc \uc6e8 \uc704 \uc720 \uc73c \uc758 \uc774 \uc790 \uc7ac \uc7c8 \uc7e4
|
||||
\uc800 \uc81c \uc838 \uc854 \uc870 \uc88c \uc8a8 \uc8c4 \uc8e0 \uc8fc \uc918 \uc934 \uc950 \uc96c \uc988 \uc9a4
|
||||
\uc9c0 \uc9dc \uc9f8 \uca14 \uca30 \uca4c \uca68 \uca84 \ucaa0 \ucabc \ucad8 \ucaf4 \ucb10 \ucb2c \ucb48 \ucb64
|
||||
\ucb80 \ucb9c \ucbb8 \ucbd4 \ucbf0 \ucc0c \ucc28 \ucc44 \ucc60 \ucc7c \ucc98 \uccb4 \uccd0 \uccec \ucd08 \ucd24
|
||||
\ucd40 \ucd5c \ucd78 \ucd94 \ucdb0 \ucdcc \ucde8 \uce04 \uce20 \uce3c \uce58 \uce74 \uce90 \uceac \ucec8 \ucee4
|
||||
\ucf00 \ucf1c \ucf38 \ucf54 \ucf70 \ucf8c \ucfa8 \ucfc4 \ucfe0 \ucffc \ud018 \ud034 \ud050 \ud06c \ud088 \ud0a4
|
||||
\ud0c0 \ud0dc \ud0f8 \ud114 \ud130 \ud14c \ud168 \ud184 \ud1a0 \ud1bc \ud1d8 \ud1f4 \ud210 \ud22c \ud248 \ud264
|
||||
\ud280 \ud29c \ud2b8 \ud2d4 \ud2f0 \ud30c \ud328 \ud344 \ud360 \ud37c \ud398 \ud3b4 \ud3d0 \ud3ec \ud408 \ud424
|
||||
\ud440 \ud45c \ud478 \ud494 \ud4b0 \ud4cc \ud4e8 \ud504 \ud520 \ud53c \ud558 \ud574 \ud590 \ud5ac \ud5c8 \ud5e4
|
||||
\ud600 \ud61c \ud638 \ud654 \ud670 \ud68c \ud6a8 \ud6c4 \ud6e0 \ud6fc \ud718 \ud734 \ud750 \ud76c \ud788 ];
|
||||
$LVT = [[\uac00-\ud7a3] - $LV];
|
||||
$LV = [:Hangul_Syllable_Type = LV:];
|
||||
$LVT = [:Hangul_Syllable_Type = LVT:];
|
||||
|
||||
$HungulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
|
||||
|
||||
$Control;
|
||||
$CR $LF;
|
||||
([^$Control] | $L+ | $T+ | $L* ($LV? $V+ | $LV | $LVT) $T*) $Extend*;
|
||||
([^$Control] | $HungulSyllable) $Extend*;
|
||||
.;
|
||||
|
||||
|
||||
#
|
||||
# Reverse Rule, back up to the beginning of some preceding grapheme cluster.
|
||||
#
|
||||
! ($Extend | $V | $T )* ($LF $CR | ($LV | $LVT)*$L* | .)?;
|
||||
! ($Extend | $V | $T )* ($LF $CR | ($LV | $LVT)?$L* | .);
|
||||
|
|
|
@ -270,13 +270,15 @@ void RBBITest::TestDefaultRuleBasedCharacterIteration()
|
|||
ADD_DATACHUNK(chardata, "\\u0906", 0, status); //devanagiri AA
|
||||
//ADD_DATACHUNK(chardata, "\\u093e\\u0901", 0); //devanagiri vowelsign AA+ chandrabindhu
|
||||
ADD_DATACHUNK(chardata, "\\u0906\\u0901", 0, status); // Devanagari AA + chandrabindu
|
||||
ADD_DATACHUNK(chardata, "\\u0915\\u093e\\u0901", 0, status); // Devanagari KA + AA vowelsign + chandrabindu
|
||||
ADD_DATACHUNK(chardata, "\\u0915", 0, status); // Devanagari KA
|
||||
ADD_DATACHUNK(chardata, "\\u093e\\u0901", 0, status); // Devanagari AA vowelsign + chandrabindu
|
||||
|
||||
ADD_DATACHUNK(chardata, "\\u0916\\u0947", 0, status); //devanagiri KHA+vowelsign E
|
||||
ADD_DATACHUNK(chardata, "\\u0938\\u0941\\u0902", 0, status); //devanagiri SA+vowelsign U + anusvara(bindu)
|
||||
ADD_DATACHUNK(chardata, "\\u0926", 0, status); //devanagiri consonant DA
|
||||
ADD_DATACHUNK(chardata, "\\u0930", 0, status); //devanagiri consonant RA
|
||||
ADD_DATACHUNK(chardata, "\\u0939\\u094c", 0, status); //devanagiri HA+vowel sign AI
|
||||
ADD_DATACHUNK(chardata, "\\u0939", 0, status); // Devanagari consonant HA
|
||||
ADD_DATACHUNK(chardata, "\\u094c", 0, status); // Devanagari dependent vowel sign AU (U+094C), standing alone
|
||||
ADD_DATACHUNK(chardata, "\\u0964", 0, status); //devanagiri danda
|
||||
//end hindi characters
|
||||
ADD_DATACHUNK(chardata, "A\\u0302", 0, status); //circumflexA
|
||||
|
@ -948,20 +950,20 @@ void RBBITest::TestHindiCharacterBreak()
|
|||
|
||||
//hindi character break should make sure that it
|
||||
// doesn't break in-between a vowelsign and a chandrabindu
|
||||
// TODO: Rules need some fixing. As currently written, they'll correctly recognize this combination
|
||||
// as part of a legit character, but not standalone.
|
||||
|
||||
ADD_DATACHUNK(hindicharData, "\\u000a", 0, status); // Force break so following can appear stand-alone.
|
||||
ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status); //devanagari vowelsign AA+ chandrabindu
|
||||
ADD_DATACHUNK(hindicharData, "\\u0906\\u0901", 0, status); // Devanagari AA + chandrabindu
|
||||
ADD_DATACHUNK(hindicharData, "\\u0915\\u093e\\u0901", 0, status); // Devanagari KA + AA vowelsign + chandrabindu
|
||||
ADD_DATACHUNK(hindicharData, "\\u0915", 0, status); // Devanagari KA
|
||||
ADD_DATACHUNK(hindicharData, "\\u093e\\u0901", 0, status); // Devanagari AA vowelsign + chandrabindu
|
||||
|
||||
|
||||
ADD_DATACHUNK(hindicharData, "\\u0916\\u0947", 0, status); //devanagari KHA+vowelsign E
|
||||
ADD_DATACHUNK(hindicharData, "\\u0938\\u0941\\u0902", 0, status); //devanagari SA+vowelsign U + anusvara(bindu)
|
||||
ADD_DATACHUNK(hindicharData, "\\u0926", 0, status); //devanagari consonant DA
|
||||
ADD_DATACHUNK(hindicharData, "\\u0930", 0, status); //devanagari consonant RA
|
||||
ADD_DATACHUNK(hindicharData, "\\u0939\\u094c", 0, status); //devanagari consonant HA+dependent vowel sign AI
|
||||
ADD_DATACHUNK(hindicharData, "\\u0939", 0, status); //devanagari consonant HA
|
||||
ADD_DATACHUNK(hindicharData, "\\u094c", 0, status); //devanagari dependent vowel sign AU (U+094C), standing alone
|
||||
ADD_DATACHUNK(hindicharData, "\\u0964", 0, status); //devanagari danda
|
||||
ADD_DATACHUNK(hindicharData, "\\u0950", 0, status); //devanagari OM
|
||||
ADD_DATACHUNK(hindicharData, "\\u0915\\u0943", 0, status); //devanagari KA+dependent vowel RI->KRI
|
||||
|
|
Loading…
Add table
Reference in a new issue