From 839ec55eab09f6193994b2026d34bd313067510e Mon Sep 17 00:00:00 2001 From: Peter Edberg Date: Fri, 30 Aug 2013 06:39:01 +0000 Subject: [PATCH] ICU-10176 No line break in $SY $HL; update tests, including old missing updates for RBBITestMonkey (J) X-SVN-Rev: 34143 --- icu4j/main/shared/data/icudata.jar | 4 +- icu4j/main/shared/data/testdata.jar | 2 +- .../ibm/icu/dev/test/rbbi/RBBITestMonkey.java | 38 ++++++++++++++----- .../src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | 23 ++++++++++- 4 files changed, 53 insertions(+), 14 deletions(-) diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 87eb3b7ec3c..8c6f95b248d 100755 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec461f02d26e5167a56539e4bf2aabb81c038cec0498eb2bbbc44378977b719b -size 10966492 +oid sha256:0281eb436d3f76c50252cc66bbe357ba00aeb6db06839224cfefa18d386f3338 +size 10966706 diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar index 18a33e3b107..099c6e2cb12 100755 --- a/icu4j/main/shared/data/testdata.jar +++ b/icu4j/main/shared/data/testdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d08aec73aa20b1669db6e144039aae4bd6357e39e8445986c02be057c169e7f1 +oid sha256:45587463e8dcef07be8a580b224bfbc9ccafd9e9f14d435f401c7b08294c36d9 size 725607 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index b4ab8b353e5..25e2d067890 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -557,6 +557,7 @@ public class RBBITestMonkey extends TestFmwk { UnicodeSet fSY; UnicodeSet fAI; UnicodeSet fAL; + UnicodeSet fHL; UnicodeSet fID; UnicodeSet fSA; UnicodeSet fJL; @@ -605,6 +606,7 @@ public class 
RBBITestMonkey extends TestFmwk { fSY = new UnicodeSet("[\\p{Line_break=SY}]"); fAI = new UnicodeSet("[\\p{Line_break=AI}]"); fAL = new UnicodeSet("[\\p{Line_break=AL}]"); + fHL = new UnicodeSet("[\\p{Line_break=HL}]"); fID = new UnicodeSet("[\\p{Line_break=ID}]"); fSA = new UnicodeSet("[\\p{Line_break=SA}]"); fJL = new UnicodeSet("[\\p{Line_break=JL}]"); @@ -657,6 +659,7 @@ public class RBBITestMonkey extends TestFmwk { fSets.add(fSY); fSets.add(fAI); fSets.add(fAL); + fSets.add(fHL); fSets.add(fID); fSets.add(fWJ); fSets.add(fSA); @@ -679,6 +682,7 @@ public class RBBITestMonkey extends TestFmwk { int prevChar; // Character at above position. Note that prevChar // and thisChar may not be adjacent because combining // characters between them will be ignored. + int prevCharX2; // Character before prevChar, more context for LB 21a int nextPos; // Index of the next character following pos. // Usually skips over combining marks. @@ -695,7 +699,7 @@ public class RBBITestMonkey extends TestFmwk { // while the invalid values shift out and the "this" and // "prev" positions are filled in with good values. pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. - thisChar = prevChar = 0; + thisChar = prevChar = prevCharX2 = 0; nextPos = startPos; @@ -706,6 +710,7 @@ public class RBBITestMonkey extends TestFmwk { // "prevPos" can be arbitrarily far before "pos". for (;;) { // Advance to the next position to be tested. 
+ prevCharX2 = prevChar; prevPos = pos; prevChar = thisChar; pos = nextPos; @@ -920,8 +925,19 @@ public class RBBITestMonkey extends TestFmwk { continue; } - // LB 22 + // LB 21a, HL (HY | BA) x + if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) { + continue; + } + + // LB 21b, SY x HL + if (fSY.contains(prevChar) && fHL.contains(thisChar)) { + continue; + } + + // LB 22 if (fAL.contains(prevChar) && fIN.contains(thisChar) || + fHL.contains(prevChar) && fIN.contains(thisChar) || fID.contains(prevChar) && fIN.contains(thisChar) || fIN.contains(prevChar) && fIN.contains(thisChar) || fNU.contains(prevChar) && fIN.contains(thisChar) ) { @@ -934,8 +950,10 @@ public class RBBITestMonkey extends TestFmwk { // NU x AL if (fID.contains(prevChar) && fPO.contains(thisChar) || fAL.contains(prevChar) && fNU.contains(thisChar) || - fNU.contains(prevChar) && fAL.contains(thisChar) ) { - continue; + fHL.contains(prevChar) && fNU.contains(thisChar) || + fNU.contains(prevChar) && fAL.contains(thisChar) || + fNU.contains(prevChar) && fHL.contains(thisChar) ) { + continue; } // LB 24 Do not break between prefix and letters or ideographs. 
@@ -943,8 +961,8 @@ public class RBBITestMonkey extends TestFmwk { // PR x AL // PO x AL if (fPR.contains(prevChar) && fID.contains(thisChar) || - fPR.contains(prevChar) && fAL.contains(thisChar) || - fPO.contains(prevChar) && fAL.contains(thisChar)) { + fPR.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)) || + fPO.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { continue; } @@ -1011,22 +1029,22 @@ public class RBBITestMonkey extends TestFmwk { // LB 28 Do not break between alphabetics - if (fAL.contains(prevChar) && fAL.contains(thisChar)) { + if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { continue; } // LB 29 Do not break between numeric punctuation and alphabetics - if (fIS.contains(prevChar) && fAL.contains(thisChar)) { + if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { continue; } // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. // (AL | NU) x OP // CP x (AL | NU) - if ((fAL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) { + if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) { continue; } - if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fNU.contains(thisChar))) { + if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) { continue; } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 0b8d0758e82..e5307124945 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -550,7 +550,14 @@ What is the proper use of the abbreviation pp.? 
•Yes, I am definatelly 12" tal •\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb• •\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0fd0\u000a<100>\u20a3• •\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0085<100>\u6cc4\u2024\u202f\ufffc• - + +# Test for #10176 (in root) + +•abc/•s •def• +•abc/\u05D9 •def• +•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC• +•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA• + ######################################################################################## # @@ -696,6 +703,13 @@ Bangkok)• •私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。• +# Test for #10176 (in ja) + +•abc/•s •def• +•abc/\u05D9 •def• +•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC• +•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA• + •私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈<400>々<400>は<400>ワ<400>ー<400>ドで<400>あ<400>る<400>。• @@ -772,3 +786,10 @@ Bangkok)• •abc •- •def •abc •-def •abc- •def • # With ASCII hyphen •abc •‐ •def •abc •‐def •abc‐ •def • # With Unicode u2010 hyphen + +# Test for #10176 (in fi) + +•abc/•s •def• +•abc/\u05D9 •def• +•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC• +•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•