diff --git a/icu4c/source/data/brkitr/char.txt b/icu4c/source/data/brkitr/char.txt index 04272b32768..e48b99f1d35 100644 --- a/icu4c/source/data/brkitr/char.txt +++ b/icu4c/source/data/brkitr/char.txt @@ -15,9 +15,9 @@ # $CR = \r; $LF = \n; -$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]]; +$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - [:Grapheme_Extend = TRUE:]]; -$Extend = [[:Grapheme_Extend = TRUE:] - [$Control]]; +$Extend = [[:Grapheme_Extend = TRUE:]]; # # Korean Syllable Definitions diff --git a/icu4c/source/data/brkitr/sent.txt b/icu4c/source/data/brkitr/sent.txt index 0656be96c49..7ae215caaa6 100644 --- a/icu4c/source/data/brkitr/sent.txt +++ b/icu4c/source/data/brkitr/sent.txt @@ -14,7 +14,7 @@ # Character categories as defined in TR 29 # $Sep = [\u000a \u000d \u0085 \u2028 \u2029]; -$Format = [[:Format:]]; +$Format = [[:Format:] - [:Grapheme_Extend:]]; $Sp = [[:Whitespace:] - $Sep]; $Lower = [[:Lowercase:]]; $Upper = [[:TitleCase_Letter:] [:Uppercase:]]; diff --git a/icu4c/source/data/brkitr/word.txt b/icu4c/source/data/brkitr/word.txt index aaea0506db0..21de6ed71fa 100644 --- a/icu4c/source/data/brkitr/word.txt +++ b/icu4c/source/data/brkitr/word.txt @@ -54,8 +54,8 @@ $Numeric = [:LineBreak = Numeric:]; $CR = \u000d; $LF = \u000a; $Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]]; -$Extend = [[:Grapheme_Extend = TRUE:] - $Control]; -$Format = [[:Cf:]]; +$Extend = [[:Grapheme_Extend = TRUE:]]; +$Format = [[:Cf:] - $Extend]; $Hiragana = [:Hiragana:]; $Ideographic = [:IDEOGRAPHIC:]; diff --git a/icu4c/source/i18n/regexst.cpp b/icu4c/source/i18n/regexst.cpp index 4a14f975517..1aa55126ec4 100644 --- a/icu4c/source/i18n/regexst.cpp +++ b/icu4c/source/i18n/regexst.cpp @@ -108,14 +108,18 @@ static const UChar gIsWordPattern[] = { static const UChar gGC_ControlPattern[] = { // [ [ : Z l : ] [ : Z p : ] 0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d, -// [ : C c : ] [ : C f : ] ] - 0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x5d, 0}; +// [ : C c : ] [ : C f : ] - + 0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x2d, +// [ : G r a p h e m e _ + 0x5b, 0x3a, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f, +// E x t e n d : ] ] + 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x3a, 0x5d, 0x5d, 0}; static const UChar gGC_ExtendPattern[] = { // [ \ p { G r a p h e m e _ 0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f, -// E x t e n d } - \ p { C f } ] - 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x2d, 0x5c, 0x70, 0x7b, 0x43, 0x66, 0x7d, 0x5d, 0}; +// E x t e n d } ] + 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0}; static const UChar gGC_LPattern[] = { // [ \ p { H a n g u l _ S y l diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index a39556e80fd..811a1e308d3 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -2091,7 +2091,7 @@ RBBICharMonkey::RBBICharMonkey() { fMatcher = new RegexMatcher("\\X", 0, status); // Pattern to match a grampheme cluster fCRLFSet = new UnicodeSet("[\\r\\n]", status); - fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]]", status); + fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status); fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status); fHangulSet = new UnicodeSet( "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}" @@ -2200,7 +2200,7 @@ RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0), fMidNumLetSet = new UnicodeSet("[\\u002e\\u003a]", status); fMidNumSet = new UnicodeSet("[\\p{Line_Break=Infix_Numeric}]", status); fNumericSet = new UnicodeSet("[\\p{Line_Break=Numeric}]", status); - fFormatSet = new UnicodeSet("[\\p{Format}]", status); + fFormatSet = new UnicodeSet("[\\p{Format}-\\p{Grapheme_Extend}]", status); fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status); fOtherSet = new UnicodeSet(); if(U_FAILURE(status)) {