ICU-12079 Treat fullwidth digits (U+FF10..U+FF19) as Numeric for word break.

This commit is contained in:
Andy Heninger 2018-12-03 17:34:38 -08:00
parent 0a258170f9
commit 58786f5cbe
11 changed files with 12 additions and 13 deletions

View file

@ -44,7 +44,7 @@ $Double_Quote = [\p{Word_Break = Double_Quote}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$WSegSpace = [\p{Word_Break = WSegSpace}];
$Extended_Pict = [:ExtPict:];

View file

@ -44,7 +44,7 @@ $Double_Quote = [\p{Word_Break = Double_Quote}];
$MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
$MidLetter = [\p{Word_Break = MidLetter} - [\:]];
$MidNum = [\p{Word_Break = MidNum} [.]];
$Numeric = [\p{Word_Break = Numeric}];
$Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$WSegSpace = [\p{Word_Break = WSegSpace}];
$Extended_Pict = [:ExtPict:];

View file

@ -1872,7 +1872,7 @@ RBBIWordMonkey::RBBIWordMonkey()
fMidNumLetSet = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]", status);
fMidLetterSet = new UnicodeSet(u"[\\p{Word_Break = MidLetter}]", status);
fMidNumSet = new UnicodeSet(u"[\\p{Word_Break = MidNum}]", status);
fNumericSet = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
fNumericSet = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status);
fFormatSet = new UnicodeSet(u"[\\p{Word_Break = Format}]", status);
fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status);

View file

@ -30,7 +30,7 @@ Double_Quote = [\p{Word_Break = Double_Quote}];
MidNumLet = [\p{Word_Break = MidNumLet}];
MidLetter = [\p{Word_Break = MidLetter}];
MidNum = [\p{Word_Break = MidNum}];
Numeric = [\p{Word_Break = Numeric}];
Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
WSegSpace = [\p{Word_Break = WSegSpace}];
Extended_Pict = [:ExtPict:];

View file

@ -29,7 +29,7 @@ Double_Quote = [\p{Word_Break = Double_Quote}];
MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
MidLetter = [\p{Word_Break = MidLetter} - [\:]];
MidNum = [\p{Word_Break = MidNum} [.]];
Numeric = [\p{Word_Break = Numeric}];
Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
WSegSpace = [\p{Word_Break = WSegSpace}];
Extended_Pict = [:ExtPict:];

View file

@ -240,8 +240,7 @@
<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts
# <data>•ISN'<200> •19<100>日<400></data>
# why was this added with the dbbi stuff?
<data>•ISN'<200> •19<100>日<400></data>
# to test for bug #4098467
# What follows is a string of Korean characters (I found it in the Yellow Pages

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:70f6389c315809aac32ac3145b5fe813e59de7b8ac13f4a8ead60f58c443cca5
size 12697817
oid sha256:1b8bb0208f9fd791029d55f17dd9722d7b4062f5478e55c28722cc7188435507
size 12690372

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f810eeeeb46325f0b314b2ea0d971ee7c3c850e6160660651aff0be8aaaa27f7
oid sha256:99a35b2f985a8a281b8474cc074195d23bdc6757ea561d082b5fb94a2d749cb2
size 92787

View file

@ -330,7 +330,7 @@ public class RBBITestMonkey extends TestFmwk {
fMidNumLetSet = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]");
fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]");
fNumericSet = new UnicodeSet("[[\\p{Word_Break = Numeric}][\\uFF10-\\uff19]]");
fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
fExtendSet = new UnicodeSet("[\\p{Word_Break = Extend}]");

View file

@ -30,7 +30,7 @@ Double_Quote = [\p{Word_Break = Double_Quote}];
MidNumLet = [\p{Word_Break = MidNumLet}];
MidLetter = [\p{Word_Break = MidLetter}];
MidNum = [\p{Word_Break = MidNum}];
Numeric = [\p{Word_Break = Numeric}];
Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
WSegSpace = [\p{Word_Break = WSegSpace}];
Extended_Pict = [:ExtPict:];

View file

@ -29,7 +29,7 @@ Double_Quote = [\p{Word_Break = Double_Quote}];
MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
MidLetter = [\p{Word_Break = MidLetter} - [\:]];
MidNum = [\p{Word_Break = MidNum} [.]];
Numeric = [\p{Word_Break = Numeric}];
Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
WSegSpace = [\p{Word_Break = WSegSpace}];
Extended_Pict = [:ExtPict:];