mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 05:55:35 +00:00
ICU-12079 Full width digits become numeric for word break.
This commit is contained in:
parent
0a258170f9
commit
58786f5cbe
11 changed files with 12 additions and 13 deletions
|
@ -44,7 +44,7 @@ $Double_Quote = [\p{Word_Break = Double_Quote}];
|
|||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidLetter = [\p{Word_Break = MidLetter}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
$Extended_Pict = [:ExtPict:];
|
||||
|
|
|
@ -44,7 +44,7 @@ $Double_Quote = [\p{Word_Break = Double_Quote}];
|
|||
$MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
|
||||
$MidLetter = [\p{Word_Break = MidLetter} - [\:]];
|
||||
$MidNum = [\p{Word_Break = MidNum} [.]];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
$Extended_Pict = [:ExtPict:];
|
||||
|
|
|
@ -1872,7 +1872,7 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
|
||||
fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]", status);
|
||||
fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
|
||||
fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
|
||||
fNumericSet = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status);
|
||||
fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
|
||||
fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
|
||||
fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status);
|
||||
|
|
|
@ -30,7 +30,7 @@ Double_Quote = [\p{Word_Break = Double_Quote}];
|
|||
MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
MidLetter = [\p{Word_Break = MidLetter}];
|
||||
MidNum = [\p{Word_Break = MidNum}];
|
||||
Numeric = [\p{Word_Break = Numeric}];
|
||||
Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079;
|
||||
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
Extended_Pict = [:ExtPict:];
|
||||
|
|
|
@ -29,7 +29,7 @@ Double_Quote = [\p{Word_Break = Double_Quote}];
|
|||
MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
|
||||
MidLetter = [\p{Word_Break = MidLetter} - [\:]];
|
||||
MidNum = [\p{Word_Break = MidNum} [.]];
|
||||
Numeric = [\p{Word_Break = Numeric}];
|
||||
Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079;
|
||||
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
Extended_Pict = [:ExtPict:];
|
||||
|
|
3
icu4c/source/test/testdata/rbbitst.txt
vendored
3
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -240,8 +240,7 @@
|
|||
<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
|
||||
|
||||
# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts
|
||||
# <data>•ISN'T<200> •19<100>日<400></data>
|
||||
# why was this added with the dbbi stuff?
|
||||
<data>•ISN'T<200> •19<100>日<400></data>
|
||||
|
||||
# to test for bug #4098467
|
||||
# What follows is a string of Korean characters (I found it in the Yellow Pages
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:70f6389c315809aac32ac3145b5fe813e59de7b8ac13f4a8ead60f58c443cca5
|
||||
size 12697817
|
||||
oid sha256:1b8bb0208f9fd791029d55f17dd9722d7b4062f5478e55c28722cc7188435507
|
||||
size 12690372
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f810eeeeb46325f0b314b2ea0d971ee7c3c850e6160660651aff0be8aaaa27f7
|
||||
oid sha256:99a35b2f985a8a281b8474cc074195d23bdc6757ea561d082b5fb94a2d749cb2
|
||||
size 92787
|
||||
|
|
|
@ -330,7 +330,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
|
||||
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
|
||||
fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]");
|
||||
fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]");
|
||||
fNumericSet = new UnicodeSet("[[\\p{Word_Break = Numeric}][\\uFF10-\\uff19]]");
|
||||
fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
|
||||
fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
|
||||
fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]");
|
||||
|
|
|
@ -30,7 +30,7 @@ Double_Quote = [\p{Word_Break = Double_Quote}];
|
|||
MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
MidLetter = [\p{Word_Break = MidLetter}];
|
||||
MidNum = [\p{Word_Break = MidNum}];
|
||||
Numeric = [\p{Word_Break = Numeric}];
|
||||
Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079;
|
||||
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
Extended_Pict = [:ExtPict:];
|
||||
|
|
|
@ -29,7 +29,7 @@ Double_Quote = [\p{Word_Break = Double_Quote}];
|
|||
MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
|
||||
MidLetter = [\p{Word_Break = MidLetter} - [\:]];
|
||||
MidNum = [\p{Word_Break = MidNum} [.]];
|
||||
Numeric = [\p{Word_Break = Numeric}];
|
||||
Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079;
|
||||
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
Extended_Pict = [:ExtPict:];
|
||||
|
|
Loading…
Add table
Reference in a new issue