mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-04 21:15:35 +00:00
ICU-22956 Use InCB for grapheme cluster segmentation
This commit is contained in:
parent
700c5e36a1
commit
0b9eb9ca71
8 changed files with 60 additions and 88 deletions
|
@ -24,13 +24,9 @@ $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
|
|||
$Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
|
||||
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
|
||||
|
||||
#
|
||||
# From cldr/common/properties/segments/
|
||||
# and issue CLDR-10994
|
||||
#
|
||||
$Virama = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Virama}];
|
||||
$LinkingConsonant = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Consonant}];
|
||||
$ExtCccZwj = [[\p{gcb=Extend}-\p{ccc=0}] \p{gcb=ZWJ}];
|
||||
$InCBConsonant = [\p{InCB=Consonant}];
|
||||
$InCBExtend = [\p{InCB=Extend}];
|
||||
$InCBLinker = [\p{InCB=Linker}];
|
||||
|
||||
# Korean Syllable Definitions
|
||||
#
|
||||
|
@ -64,8 +60,8 @@ $L ($L | $V | $LV | $LVT);
|
|||
# GB 9b
|
||||
$Prepend [^$Control $CR $LF];
|
||||
|
||||
# GB 9.3, from CLDR-10994
|
||||
$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant;
|
||||
# GB 9c
|
||||
$InCBConsonant [ $InCBExtend $InCBLinker ]* $InCBLinker [ $InCBExtend $InCBLinker ]* $InCBConsonant;
|
||||
|
||||
# GB 11 Do not break within emoji modifier sequences or emoji zwj sequences.
|
||||
$Extended_Pict $Extend* $ZWJ $Extended_Pict;
|
||||
|
|
|
@ -1655,9 +1655,9 @@ private:
|
|||
UnicodeSet *fLVTSet;
|
||||
UnicodeSet *fHangulSet;
|
||||
UnicodeSet *fExtendedPictSet;
|
||||
UnicodeSet *fViramaSet;
|
||||
UnicodeSet *fLinkingConsonantSet;
|
||||
UnicodeSet *fExtCccZwjSet;
|
||||
UnicodeSet *fInCBLinkerSet;
|
||||
UnicodeSet *fInCBConsonantSet;
|
||||
UnicodeSet *fInCBExtendSet;
|
||||
UnicodeSet *fAnySet;
|
||||
|
||||
const UnicodeString *fText;
|
||||
|
@ -1690,11 +1690,9 @@ RBBICharMonkey::RBBICharMonkey() {
|
|||
fHangulSet->addAll(*fLVTSet);
|
||||
|
||||
fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
|
||||
fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
|
||||
"\\p{Indic_Syllabic_Category=Virama}]", status);
|
||||
fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
|
||||
"\\p{Indic_Syllabic_Category=Consonant}]", status);
|
||||
fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
|
||||
fInCBLinkerSet = new UnicodeSet(u"[\\p{InCB=Linker}]", status);
|
||||
fInCBConsonantSet = new UnicodeSet(u"[\\p{InCB=Consonant}]", status);
|
||||
fInCBExtendSet = new UnicodeSet(u"[\\p{InCB=Extend}]", status);
|
||||
fAnySet = new UnicodeSet(0, 0x10ffff);
|
||||
|
||||
// Create sets of characters, and add the names of the above character sets.
|
||||
|
@ -1713,9 +1711,9 @@ RBBICharMonkey::RBBICharMonkey() {
|
|||
sets.emplace_back(*fHangulSet); classNames.emplace_back("Hangul");
|
||||
sets.emplace_back(*fZWJSet); classNames.emplace_back("ZWJ");
|
||||
sets.emplace_back(*fExtendedPictSet); classNames.emplace_back("ExtendedPict");
|
||||
sets.emplace_back(*fViramaSet); classNames.emplace_back("Virama");
|
||||
sets.emplace_back(*fLinkingConsonantSet); classNames.emplace_back("LinkingConsonant");
|
||||
sets.emplace_back(*fExtCccZwjSet); classNames.emplace_back("ExtCcccZwj");
|
||||
sets.emplace_back(*fInCBLinkerSet); classNames.emplace_back("InCB=Linker");
|
||||
sets.emplace_back(*fInCBConsonantSet); classNames.emplace_back("InCB=Consonant");
|
||||
sets.emplace_back(*fInCBExtendSet); classNames.emplace_back("InCB=Extend");
|
||||
sets.emplace_back(*fAnySet); classNames.emplace_back("Any");
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
|
@ -1838,19 +1836,20 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
|
|||
continue;
|
||||
}
|
||||
|
||||
// Note: Linkers (viramas) are consumed by the same backward scan as the Extend characters; sawVirama records that one was seen.
|
||||
if (fLinkingConsonantSet->contains(c2)) {
|
||||
if (fInCBConsonantSet->contains(c2)) {
|
||||
int pi = p1;
|
||||
bool sawVirama = false;
|
||||
while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
|
||||
if (fViramaSet->contains(fText->char32At(pi))) {
|
||||
while (pi > 0 && (fInCBExtendSet->contains(fText->char32At(pi)) ||
|
||||
fInCBLinkerSet->contains(fText->char32At(pi)))) {
|
||||
if (fInCBLinkerSet->contains(fText->char32At(pi))) {
|
||||
sawVirama = true;
|
||||
}
|
||||
pi = fText->moveIndex32(pi, -1);
|
||||
}
|
||||
if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
|
||||
setAppliedRule(p2, "GB9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* x LinkingConsonant");
|
||||
continue;
|
||||
if (sawVirama && fInCBConsonantSet->contains(fText->char32At(pi))) {
|
||||
setAppliedRule(
|
||||
p2, R"(GB9c \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* x \p{InCB=Consonant})");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1903,9 +1902,9 @@ RBBICharMonkey::~RBBICharMonkey() {
|
|||
delete fAnySet;
|
||||
delete fZWJSet;
|
||||
delete fExtendedPictSet;
|
||||
delete fViramaSet;
|
||||
delete fLinkingConsonantSet;
|
||||
delete fExtCccZwjSet;
|
||||
delete fInCBLinkerSet;
|
||||
delete fInCBConsonantSet;
|
||||
delete fInCBExtendSet;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
|
|
|
@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}];
|
|||
LF = [\p{Grapheme_Cluster_Break = LF}];
|
||||
|
||||
Control = [[\p{Grapheme_Cluster_Break = Control}]];
|
||||
Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
|
||||
Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]];
|
||||
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
|
||||
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
|
||||
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
|
||||
|
@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
|
|||
Extended_Pict = [:ExtPict:];
|
||||
|
||||
# Indic Sequences
|
||||
Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];
|
||||
|
||||
LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];
|
||||
|
||||
ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ];
|
||||
InCBLinker = [\p{InCB=Linker}];
|
||||
InCBConsonant = [\p{InCB=Consonant}];
|
||||
InCBExtend = [\p{InCB=Extend}];
|
||||
|
||||
GB3: CR LF;
|
||||
GB4: (Control | CR | LF) ÷;
|
||||
|
@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT);
|
|||
GB7: (LV | V) (V | T);
|
||||
GB8: (LVT | T) T;
|
||||
|
||||
GB11: Extended_Pict Extend* ZWJ Extended_Pict;
|
||||
GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
|
||||
GB9: . (Extend | ZWJ);
|
||||
GB11: Extended_Pict Extend_* ZWJ Extended_Pict;
|
||||
GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant;
|
||||
GB9: . (Extend_ | ZWJ);
|
||||
|
||||
GB9a: . SpacingMark;
|
||||
GB9b: Prepend .;
|
||||
|
|
15
icu4c/source/test/testdata/rbbitst.txt
vendored
15
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -169,18 +169,9 @@
|
|||
#
|
||||
#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>
|
||||
|
||||
#
|
||||
# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras
|
||||
# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant
|
||||
# Sample Chars: LinkingConsonant: \u0915
|
||||
# Virama: \u094d [also Extend]
|
||||
# ExtCccZWJ: \u0308
|
||||
# Extend but not ExtCCCZWJ \u093A
|
||||
<char>
|
||||
<data>•\u0915\u094d\u0915•</data>
|
||||
<data>•\u0915\u0308\u0308\u094d\u0308\u0308\u0915•</data>
|
||||
<data>•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041•</data>
|
||||
<data>•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915•</data>
|
||||
# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31.
|
||||
# This test would have caught ICU-22956.
|
||||
<data>•સૻ્સૻ•</data>
|
||||
|
||||
#
|
||||
# From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt
|
||||
|
|
Binary file not shown.
|
@ -145,9 +145,9 @@ public class RBBITestMonkey extends CoreTestFmwk {
|
|||
UnicodeSet fHangulSet;
|
||||
UnicodeSet fZWJSet;
|
||||
UnicodeSet fExtendedPictSet;
|
||||
UnicodeSet fViramaSet;
|
||||
UnicodeSet fLinkingConsonantSet;
|
||||
UnicodeSet fExtCccZwjSet;
|
||||
UnicodeSet fInCBLinkerSet;
|
||||
UnicodeSet fInCBConsonantSet;
|
||||
UnicodeSet fInCBExtendSet;
|
||||
UnicodeSet fAnySet;
|
||||
|
||||
|
||||
|
@ -176,11 +176,9 @@ public class RBBITestMonkey extends CoreTestFmwk {
|
|||
fHangulSet.addAll(fLVTSet);
|
||||
|
||||
fExtendedPictSet = new UnicodeSet("[:Extended_Pictographic:]");
|
||||
fViramaSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
|
||||
+ "\\p{Indic_Syllabic_Category=Virama}]");
|
||||
fLinkingConsonantSet = new UnicodeSet("[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
|
||||
+ "\\p{Indic_Syllabic_Category=Consonant}]");
|
||||
fExtCccZwjSet = new UnicodeSet("[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]");
|
||||
fInCBLinkerSet = new UnicodeSet("[\\p{InCB=Linker}]");
|
||||
fInCBConsonantSet = new UnicodeSet("[\\p{InCB=Consonant}]");
|
||||
fInCBExtendSet = new UnicodeSet("[\\p{InCB=Extend}]");
|
||||
fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]");
|
||||
|
||||
|
||||
|
@ -196,9 +194,9 @@ public class RBBITestMonkey extends CoreTestFmwk {
|
|||
fSets.add(fAnySet); fClassNames.add("Any");
|
||||
fSets.add(fZWJSet); fClassNames.add("ZWJ");
|
||||
fSets.add(fExtendedPictSet); fClassNames.add("ExtendedPict");
|
||||
fSets.add(fViramaSet); fClassNames.add("Virama");
|
||||
fSets.add(fLinkingConsonantSet); fClassNames.add("LinkingConsonant");
|
||||
fSets.add(fExtCccZwjSet); fClassNames.add("ExtCccZwj");
|
||||
fSets.add(fInCBLinkerSet); fClassNames.add("InCB=Linker");
|
||||
fSets.add(fInCBConsonantSet); fClassNames.add("InCB=Consonant");
|
||||
fSets.add(fInCBExtendSet); fClassNames.add("InCB=Extend");
|
||||
}
|
||||
|
||||
|
||||
|
@ -315,17 +313,18 @@ public class RBBITestMonkey extends CoreTestFmwk {
|
|||
}
|
||||
|
||||
// Note: Linkers (viramas) are consumed by the same backward scan as the Extend characters; sawVirama records that one was seen.
|
||||
if (fLinkingConsonantSet.contains(c2)) {
|
||||
if (fInCBConsonantSet.contains(c2)) {
|
||||
int pi = p1;
|
||||
boolean sawVirama = false;
|
||||
while (pi > 0 && fExtCccZwjSet.contains(fText.codePointAt(pi))) {
|
||||
if (fViramaSet.contains(fText.codePointAt(pi))) {
|
||||
while (pi > 0 && (fInCBExtendSet.contains(fText.codePointAt(pi)) ||
|
||||
fInCBLinkerSet.contains(fText.codePointAt(pi)))) {
|
||||
if (fInCBLinkerSet.contains(fText.codePointAt(pi))) {
|
||||
sawVirama = true;
|
||||
}
|
||||
pi = fText.offsetByCodePoints(pi, -1);
|
||||
}
|
||||
if (sawVirama && fLinkingConsonantSet.contains(fText.codePointAt(pi))) {
|
||||
setAppliedRule(p2, "GB 9.3 LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
|
||||
if (sawVirama && fInCBConsonantSet.contains(fText.codePointAt(pi))) {
|
||||
setAppliedRule(p2, "GB9c \\p{InCB=Consonant} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* \\p{InCB=Linker} [ \\p{InCB=Extend} \\p{InCB=Linker} ]* × \\p{InCB=Consonant})");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,7 @@ CR = [\p{Grapheme_Cluster_Break = CR}];
|
|||
LF = [\p{Grapheme_Cluster_Break = LF}];
|
||||
|
||||
Control = [[\p{Grapheme_Cluster_Break = Control}]];
|
||||
Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
|
||||
Extend_ = [[\p{Grapheme_Cluster_Break = Extend}]];
|
||||
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
|
||||
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
|
||||
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
|
||||
|
@ -38,11 +38,9 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
|
|||
Extended_Pict = [:ExtPict:];
|
||||
|
||||
# Indic Sequences
|
||||
Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];
|
||||
|
||||
LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];
|
||||
|
||||
ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ];
|
||||
InCBLinker = [\p{InCB=Linker}];
|
||||
InCBConsonant = [\p{InCB=Consonant}];
|
||||
InCBExtend = [\p{InCB=Extend}];
|
||||
|
||||
GB3: CR LF;
|
||||
GB4: (Control | CR | LF) ÷;
|
||||
|
@ -52,9 +50,9 @@ GB6: L (L | V | LV | LVT);
|
|||
GB7: (LV | V) (V | T);
|
||||
GB8: (LVT | T) T;
|
||||
|
||||
GB11: Extended_Pict Extend* ZWJ Extended_Pict;
|
||||
GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
|
||||
GB9: . (Extend | ZWJ);
|
||||
GB11: Extended_Pict Extend_* ZWJ Extended_Pict;
|
||||
GB9c: InCBConsonant ( InCBExtend | InCBLinker )* InCBLinker ( InCBExtend | InCBLinker )* InCBConsonant;
|
||||
GB9: . (Extend_ | ZWJ);
|
||||
|
||||
GB9a: . SpacingMark;
|
||||
GB9b: Prepend .;
|
||||
|
|
|
@ -169,18 +169,9 @@
|
|||
#
|
||||
#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>
|
||||
|
||||
#
|
||||
# ICU-13637 and CLDR-10994 - Indic Grapheme Cluster Boundary changes to support aksaras
|
||||
# New rule: LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant
|
||||
# Sample Chars: LinkingConsonant: \u0915
|
||||
# Virama: \u094d [also Extend]
|
||||
# ExtCccZWJ: \u0308
|
||||
# Extend but not ExtCCCZWJ \u093A
|
||||
<char>
|
||||
<data>•\u0915\u094d\u0915•</data>
|
||||
<data>•\u0915\u0308\u0308\u094d\u0308\u0308\u0915•</data>
|
||||
<data>•\u0915\u0308\u0308\u094d\u0308\u0308•\u0041•</data>
|
||||
<data>•\u0915\u0308\u0308\u094d\u093A\u093A•\u0915•</data>
|
||||
# From L2/14-131, §3.2; made into a single EGC by UTC-179-C31.
|
||||
# This test would have caught ICU-22956.
|
||||
<data>•સૻ્સૻ•</data>
|
||||
|
||||
#
|
||||
# From cldr/common/testData/segmentation/graphemeCluster/TestSegmenter-Bengali.txt
|
||||
|
|
Loading…
Add table
Reference in a new issue