mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-3295 rbbi rt port to Java, bugfixes ported over from C++
X-SVN-Rev: 15441
This commit is contained in:
parent
4b51dcf477
commit
8b6db59694
6 changed files with 90 additions and 55 deletions
|
@ -800,9 +800,7 @@ public class BreakIteratorTest extends TestFmwk
|
|||
public void TestBug4097920() {
|
||||
Vector lineSelectionData = new Vector();
|
||||
|
||||
lineSelectionData.addElement("dog,");
|
||||
lineSelectionData.addElement("cat,");
|
||||
lineSelectionData.addElement("mouse ");
|
||||
lineSelectionData.addElement("dog,cat,mouse ");
|
||||
lineSelectionData.addElement("(one)");
|
||||
lineSelectionData.addElement("(two)\n");
|
||||
generalIteratorTest(lineBreak, lineSelectionData);
|
||||
|
@ -964,8 +962,9 @@ lineSelectionData.addElement("(\u0e1b\u0e23\u0e30\u0e40\u0e17\u0e28\u0e44\u0e17\
|
|||
lineSelectionData.addElement("\u0e40\u0e03\u0e35\u0e22\u0e07");
|
||||
lineSelectionData.addElement("\u0e43\u0e2b\u0e21\u0e48");
|
||||
|
||||
generalIteratorTest(BreakIterator.getLineInstance(new Locale("th", "", "")),
|
||||
lineSelectionData);
|
||||
Locale loc = new Locale("th", "", "");
|
||||
BreakIterator bi = BreakIterator.getLineInstance(loc);
|
||||
generalIteratorTest(bi, lineSelectionData);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -510,9 +510,7 @@ public class RBBITest extends TestFmwk
|
|||
// linedata.addElement("foo\ufeffbar");
|
||||
|
||||
// to test for bug #4097920
|
||||
linedata.addElement("dog,");
|
||||
linedata.addElement("cat,");
|
||||
linedata.addElement("mouse ");
|
||||
linedata.addElement("dog,cat,mouse ");
|
||||
linedata.addElement("(one)");
|
||||
linedata.addElement("(two)\n");
|
||||
|
||||
|
|
|
@ -379,8 +379,6 @@ void executeTest(TestParams t) {
|
|||
// and this one.
|
||||
for (i=prevBP+1; i<bp; i++) {
|
||||
if (t.expectedBreaks[i] != 0) {
|
||||
int expected[] = {0, i};
|
||||
printStringBreaks(t.dataToBreak, expected, 2);
|
||||
errln("Forward Iteration, break expected, but not found. Pos=" + i +
|
||||
" File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
|
||||
}
|
||||
|
@ -388,8 +386,6 @@ void executeTest(TestParams t) {
|
|||
|
||||
// Check that the break we did find was expected
|
||||
if (t.expectedBreaks[bp] == 0) {
|
||||
int expected[] = {0, bp};
|
||||
printStringBreaks(t.dataToBreak, expected, 2);
|
||||
errln("Forward Iteration, break found, but not expected. Pos=" + bp +
|
||||
" File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
|
||||
} else {
|
||||
|
@ -472,35 +468,6 @@ void executeTest(TestParams t) {
|
|||
}
|
||||
|
||||
|
||||
void printStringBreaks(StringBuffer ustr, int expected[],
|
||||
int expectedcount)
|
||||
{
|
||||
String name;
|
||||
System.out.println("code alpha extend alphanum type line name");
|
||||
int j;
|
||||
for (j = 0; j < ustr.length(); j ++) {
|
||||
if (expectedcount > 0) {
|
||||
int k;
|
||||
for (k = 0; k < expectedcount; k ++) {
|
||||
if (j == expected[k]) {
|
||||
System.out.println("------------------------------------------------ " + j);
|
||||
}
|
||||
}
|
||||
}
|
||||
int c = UTF16.charAt(ustr, j);
|
||||
if (c > 0xffff) {
|
||||
j ++;
|
||||
}
|
||||
name = UCharacter.getName(c);
|
||||
System.out.println( UCharacter.isUAlphabetic(c) + " " +
|
||||
UCharacter.hasBinaryProperty(c, UProperty.GRAPHEME_EXTEND) + " " +
|
||||
UCharacter.isLetterOrDigit(c) + " " +
|
||||
UCharacter.getPropertyValueName(UProperty.LINE_BREAK,
|
||||
UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK),
|
||||
UProperty.NameChoice.SHORT)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -1,8 +1,10 @@
|
|||
/*
|
||||
* Created on Apr 23, 2004
|
||||
*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2004 International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.dev.test.rbbi;
|
||||
package com.ibm.icu.dev.test.rbbi;
|
||||
|
||||
|
||||
// Monkey testing of RuleBasedBreakIterator
|
||||
|
@ -22,6 +24,10 @@ import java.util.Locale;
|
|||
* Monkey tests for RBBI. These tests have independent implementations of
|
||||
* the Unicode TR boundary rules, and compare results between these and ICU's
|
||||
* implementation, using random data.
|
||||
*
|
||||
* Tests cover Grapheme Cluster (char), Word and Line breaks
|
||||
*
|
||||
* Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
|
||||
*
|
||||
*/
|
||||
public class RBBITestMonkey extends TestFmwk {
|
||||
|
@ -685,7 +691,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
|
||||
fEX.contains(thisChar) ||
|
||||
!fNU.contains(prevChar) && fIS.contains(thisChar) ||
|
||||
fSY.contains(thisChar)) {
|
||||
!fNU.contains(prevChar) && fSY.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -793,7 +799,12 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (fPR.contains(prevChar) && fAL.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
if (fPR.contains(prevChar) && fID.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
// LB 18b
|
||||
if (fHY.contains(prevChar) || fBB.contains(thisChar)) {
|
||||
break;
|
||||
|
@ -803,6 +814,11 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
if (fAL.contains(prevChar) && fAL.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 19b
|
||||
if (fIS.contains(prevChar) && fAL.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 20 Break everywhere else
|
||||
break;
|
||||
|
@ -927,8 +943,8 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
|
||||
|
||||
// Match the following regular expression in the input text.
|
||||
// (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS) CM*) * (CL CM*)? (PO CM*)?
|
||||
// 0 1 3 3 3 7 7 7 7 9 9 11 11 (match states)
|
||||
// (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PO CM*)?
|
||||
// 0 1 3 3 3 7 7 7 7 7 9 9 11 11 (match states)
|
||||
// retVals array [0] index of the start of the match, or -1 if no match
|
||||
// [1] index of first char following the match.
|
||||
private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
|
||||
|
@ -992,8 +1008,8 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
break;
|
||||
}
|
||||
break matchLoop; /* No Match */
|
||||
// (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS) CM*) * (CL CM*)? (PO CM*)?
|
||||
// 0 1 3 3 4 7 7 7 7 7 9 9 11 11 (match states)
|
||||
// (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PO CM*)?
|
||||
// 0 1 3 3 4 7 7 7 7 7 7 9 9 11 11 (match states)
|
||||
|
||||
case 7:
|
||||
if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
|
||||
|
@ -1008,6 +1024,10 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
matchState = 7;
|
||||
break;
|
||||
}
|
||||
if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
|
||||
matchState = 7;
|
||||
break;
|
||||
}
|
||||
if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
|
||||
matchState = 9;
|
||||
break;
|
||||
|
@ -1121,7 +1141,9 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// The following UnicodeSets are used in matching a Grapheme Cluster
|
||||
//
|
||||
private static UnicodeSet GC_Control =
|
||||
new UnicodeSet("[[:Zl:][:Zp:][:Cc:][:Cf:]-[\\u000d\\u000a]-[:Grapheme_Extend:]]");
|
||||
|
||||
|
@ -1308,6 +1330,8 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int
|
|||
boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1];
|
||||
boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1];
|
||||
boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
|
||||
boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
|
||||
boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1];
|
||||
int i;
|
||||
int loopCount = 0;
|
||||
boolean printTestData = false;
|
||||
|
@ -1388,6 +1412,8 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int
|
|||
Arrays.fill(forwardBreaks, false);
|
||||
Arrays.fill(reverseBreaks, false);
|
||||
Arrays.fill(isBoundaryBreaks, false);
|
||||
Arrays.fill(followingBreaks, false);
|
||||
Arrays.fill(precedingBreaks, false);
|
||||
|
||||
// Calculate the expected results for this test string.
|
||||
mk.setText(testText);
|
||||
|
@ -1446,6 +1472,47 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int
|
|||
isBoundaryBreaks[i] = bi.isBoundary(i);
|
||||
}
|
||||
|
||||
// Find the break positions using the following() function.
|
||||
lastBreakPos = 0;
|
||||
followingBreaks[0] = true;
|
||||
for (i=0; i<testText.length(); i++) {
|
||||
breakPos = bi.following(i);
|
||||
if (breakPos <= i ||
|
||||
breakPos < lastBreakPos ||
|
||||
breakPos > testText.length() ||
|
||||
breakPos > lastBreakPos && lastBreakPos > i ) {
|
||||
errln(name + " break monkey test: " +
|
||||
"Out of range value returned by BreakIterator::following().\n" +
|
||||
"index=" + i + "following returned=" + breakPos +
|
||||
"lastBreak=" + lastBreakPos);
|
||||
precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
|
||||
} else {
|
||||
followingBreaks[breakPos] = true;
|
||||
lastBreakPos = breakPos;
|
||||
}
|
||||
}
|
||||
|
||||
// Find the break positions using the preceding() function.
|
||||
lastBreakPos = testText.length();
|
||||
precedingBreaks[testText.length()] = true;
|
||||
for (i=testText.length(); i>0; i--) {
|
||||
breakPos = bi.preceding(i);
|
||||
if (breakPos >= i ||
|
||||
breakPos > lastBreakPos ||
|
||||
breakPos < 0 ||
|
||||
breakPos < lastBreakPos && lastBreakPos < i ) {
|
||||
errln(name + " break monkey test: " +
|
||||
"Out of range value returned by BreakIterator::preceding().\n" +
|
||||
"index=" + i + "preceding returned=" + breakPos +
|
||||
"lastBreak=" + lastBreakPos);
|
||||
precedingBreaks[i] = !expectedBreaks[i]; // Forces an error.
|
||||
} else {
|
||||
precedingBreaks[breakPos] = true;
|
||||
lastBreakPos = breakPos;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Compare the expected and actual results.
|
||||
for (i=0; i<=testText.length(); i++) {
|
||||
|
@ -1456,6 +1523,10 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int
|
|||
errorType = "previous()";
|
||||
} else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
|
||||
errorType = "isBoundary()";
|
||||
} else if (followingBreaks[i] != expectedBreaks[i]) {
|
||||
errorType = "following()";
|
||||
} else if (precedingBreaks[i] != expectedBreaks[i]) {
|
||||
errorType = "preceding()";
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (c) 2001-2003 International Business Machines
|
||||
# Copyright (c) 2001-2004 International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# RBBI Test Data
|
||||
|
@ -445,7 +445,7 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"
|
|||
<data>•foo\u00a0bar•</data>
|
||||
|
||||
# to test for bug #4097920
|
||||
<data>•dog,•cat,•mouse •(one)•(two)\n<100></data>
|
||||
<data>•dog,cat,mouse •(one)•(two)\n<100></data>
|
||||
|
||||
# to test for bug #4035266
|
||||
<data>•The •balance •is •$-23,456.78, •not •-•$32,456.78!\n<100></data>
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9bdc90082678b551c99d1eaf505af262fc39509442d2ab71874cdf5f70bb2c83
|
||||
size 1495983
|
||||
oid sha256:60dbf83606b385c96bff52ccfddb4dc4cbfc7ef062aa8ee9ef37ede6b4e7375c
|
||||
size 1572633
|
||||
|
|
Loading…
Add table
Reference in a new issue