From 8b6db596947b88143b00ba9cc9e0581a7f5b9a85 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 20 May 2004 21:27:44 +0000 Subject: [PATCH] ICU-3295 rbbi rt port to Java, bugfixes ported over from C++ X-SVN-Rev: 15441 --- .../icu/dev/test/rbbi/BreakIteratorTest.java | 9 +- .../com/ibm/icu/dev/test/rbbi/RBBITest.java | 4 +- .../icu/dev/test/rbbi/RBBITestExtended.java | 33 ------- .../ibm/icu/dev/test/rbbi/RBBITestMonkey.java | 91 +++++++++++++++++-- .../src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | 4 +- icu4j/src/com/ibm/icu/impl/data/icudata.jar | 4 +- 6 files changed, 90 insertions(+), 55 deletions(-) diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java b/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java index adc9b80eb56..08061daa9f7 100755 --- a/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java +++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/BreakIteratorTest.java @@ -800,9 +800,7 @@ public class BreakIteratorTest extends TestFmwk public void TestBug4097920() { Vector lineSelectionData = new Vector(); - lineSelectionData.addElement("dog,"); - lineSelectionData.addElement("cat,"); - lineSelectionData.addElement("mouse "); + lineSelectionData.addElement("dog,cat,mouse "); lineSelectionData.addElement("(one)"); lineSelectionData.addElement("(two)\n"); generalIteratorTest(lineBreak, lineSelectionData); @@ -964,8 +962,9 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\ lineSelectionData.addElement("\u0e40\u0e03\u0e35\u0e22\u0e07"); lineSelectionData.addElement("\u0e43\u0e2b\u0e21\u0e48"); - generalIteratorTest(BreakIterator.getLineInstance(new Locale("th", "", "")), - lineSelectionData); + Locale loc = new Locale("th", "", ""); + BreakIterator bi = BreakIterator.getLineInstance(loc); + generalIteratorTest(bi, lineSelectionData); } /** diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java index f868b60229e..d6a346e286b 100755 --- a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -510,9 +510,7 @@ public class RBBITest extends TestFmwk // linedata.addElement("foo\ufeffbar"); // to test for bug #4097920 - linedata.addElement("dog,"); - linedata.addElement("cat,"); - linedata.addElement("mouse "); + linedata.addElement("dog,cat,mouse "); linedata.addElement("(one)"); linedata.addElement("(two)\n"); diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java index e067dd21f2b..46a378c916a 100644 --- a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java +++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java @@ -379,8 +379,6 @@ void executeTest(TestParams t) { // and this one. for (i=prevBP+1; i 0) { - int k; - for (k = 0; k < expectedcount; k ++) { - if (j == expected[k]) { - System.out.println("------------------------------------------------ " + j); - } - } - } - int c = UTF16.charAt(ustr, j); - if (c > 0xffff) { - j ++; - } - name = UCharacter.getName(c); - System.out.println( UCharacter.isUAlphabetic(c) + " " + - UCharacter.hasBinaryProperty(c, UProperty.GRAPHEME_EXTEND) + " " + - UCharacter.isLetterOrDigit(c) + " " + - UCharacter.getPropertyValueName(UProperty.LINE_BREAK, - UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK), - UProperty.NameChoice.SHORT) - ); - } -} } \ No newline at end of file diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index 9b70cf64c12..7dd8872b92c 100644 --- a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -1,8 +1,10 @@ /* - * Created on Apr 23, 2004 - * + ******************************************************************************* + * Copyright (C) 2003-2004 International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* */ -package com.ibm.icu.dev.test.rbbi; + package com.ibm.icu.dev.test.rbbi; // Monkey testing of RuleBasedBreakIterator @@ -22,6 +24,10 @@ import java.util.Locale; * Monkey tests for RBBI. These tests have independent implementations of * the Unicode TR boundary rules, and compare results between these and ICU's * implementation, using random data. + * + * Tests cover Grapheme Cluster (char), Word and Line breaks + * + * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp * */ public class RBBITestMonkey extends TestFmwk { @@ -685,7 +691,7 @@ public class RBBITestMonkey extends TestFmwk { if (!fNU.contains(prevChar) && fCL.contains(thisChar) || fEX.contains(thisChar) || !fNU.contains(prevChar) && fIS.contains(thisChar) || - fSY.contains(thisChar)) { + !fNU.contains(prevChar) && fSY.contains(thisChar)) { continue; } @@ -793,7 +799,12 @@ public class RBBITestMonkey extends TestFmwk { continue; } } - + if (fPR.contains(prevChar) && fAL.contains(thisChar)) { + continue; + } + if (fPR.contains(prevChar) && fID.contains(thisChar)) { + continue; + } // LB 18b if (fHY.contains(prevChar) || fBB.contains(thisChar)) { break; @@ -803,6 +814,11 @@ public class RBBITestMonkey extends TestFmwk { if (fAL.contains(prevChar) && fAL.contains(thisChar)) { continue; } + + // LB 19b + if (fIS.contains(prevChar) && fAL.contains(thisChar)) { + continue; + } // LB 20 Break everywhere else break; @@ -927,8 +943,8 @@ public class RBBITestMonkey extends TestFmwk { // Match the following regular expression in the input text. - // (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS) CM*) * (CL CM*)? (PO CM*)? - // 0 1 3 3 3 7 7 7 7 9 9 11 11 (match states) + // (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PO CM*)? + // 0 1 3 3 3 7 7 7 7 7 9 9 11 11 (match states) // retVals array [0] index of the start of the match, or -1 if no match // [1] index of first char following the match. private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) { @@ -992,8 +1008,8 @@ public class RBBITestMonkey extends TestFmwk { break; } break matchLoop; /* No Match */ - // (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS) CM*) * (CL CM*)? (PO CM*)? - // 0 1 3 3 4 7 7 7 7 7 9 9 11 11 (match states) + // (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PO CM*)? + // 0 1 3 3 4 7 7 7 7 7 7 9 9 11 11 (match states) case 7: if (cLBType == UCharacter.LineBreak.COMBINING_MARK) { @@ -1008,6 +1024,10 @@ public class RBBITestMonkey extends TestFmwk { matchState = 7; break; } + if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) { + matchState = 7; + break; + } if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) { matchState = 9; break; @@ -1121,7 +1141,9 @@ public class RBBITestMonkey extends TestFmwk { } - + // + // The following UnicodeSets are used in matching a Grapheme Cluster + // private static UnicodeSet GC_Control = new UnicodeSet("[[:Zl:][:Zp:][:Cc:][:Cf:]-[\\u000d\\u000a]-[:Grapheme_Extend:]]"); @@ -1308,6 +1330,8 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1]; boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1]; boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1]; + boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; + boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; int i; int loopCount = 0; boolean printTestData = false; @@ -1388,6 +1412,8 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int Arrays.fill(forwardBreaks, false); Arrays.fill(reverseBreaks, false); Arrays.fill(isBoundaryBreaks, false); + Arrays.fill(followingBreaks, false); + Arrays.fill(precedingBreaks, false); // Calculate the expected results for this test string. mk.setText(testText); @@ -1446,6 +1472,47 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int isBoundaryBreaks[i] = bi.isBoundary(i); } + // Find the break positions using the following() function. + lastBreakPos = 0; + followingBreaks[0] = true; + for (i=0; i testText.length() || + breakPos > lastBreakPos && lastBreakPos > i ) { + errln(name + " break monkey test: " + + "Out of range value returned by BreakIterator::following().\n" + + "index=" + i + "following returned=" + breakPos + + "lastBreak=" + lastBreakPos); + precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. + } else { + followingBreaks[breakPos] = true; + lastBreakPos = breakPos; + } + } + + // Find the break positions using the preceding() function. + lastBreakPos = testText.length(); + precedingBreaks[testText.length()] = true; + for (i=testText.length(); i>0; i--) { + breakPos = bi.preceding(i); + if (breakPos >= i || + breakPos > lastBreakPos || + breakPos < 0 || + breakPos < lastBreakPos && lastBreakPos < i ) { + errln(name + " break monkey test: " + + "Out of range value returned by BreakIterator::preceding().\n" + + "index=" + i + "preceding returned=" + breakPos + + "lastBreak=" + lastBreakPos); + precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. + } else { + precedingBreaks[breakPos] = true; + lastBreakPos = breakPos; + } + } + + // Compare the expected and actual results. for (i=0; i<=testText.length(); i++) { @@ -1456,6 +1523,10 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int errorType = "previous()"; } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { errorType = "isBoundary()"; + } else if (followingBreaks[i] != expectedBreaks[i]) { + errorType = "following()"; + } else if (precedingBreaks[i] != expectedBreaks[i]) { + errorType = "preceding()"; } diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index dfdaed67fd7..26f83b8c6d5 100644 --- a/icu4j/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2001-2003 International Business Machines +# Copyright (c) 2001-2004 International Business Machines # Corporation and others. All Rights Reserved. # # RBBI Test Data @@ -445,7 +445,7 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12" •foo\u00a0bar• # to test for bug #4097920 -•dog,•cat,•mouse •(one)•(two)\n<100> +•dog,cat,mouse •(one)•(two)\n<100> # to test for bug #4035266 •The •balance •is •$-23,456.78, •not •-•$32,456.78!\n<100> diff --git a/icu4j/src/com/ibm/icu/impl/data/icudata.jar b/icu4j/src/com/ibm/icu/impl/data/icudata.jar index dce9a32bbc6..50f07d86ab6 100644 --- a/icu4j/src/com/ibm/icu/impl/data/icudata.jar +++ b/icu4j/src/com/ibm/icu/impl/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9bdc90082678b551c99d1eaf505af262fc39509442d2ab71874cdf5f70bb2c83 -size 1495983 +oid sha256:60dbf83606b385c96bff52ccfddb4dc4cbfc7ef062aa8ee9ef37ede6b4e7375c +size 1572633