mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-5359 rbbi, port of C changes to Java, in progress.
X-SVN-Rev: 20230
This commit is contained in:
parent
16c599e73d
commit
d7167854cb
1 changed files with 180 additions and 132 deletions
|
@ -135,14 +135,17 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
RBBIWordMonkey() {
|
||||
fSets = new ArrayList();
|
||||
|
||||
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]");
|
||||
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]");
|
||||
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" +
|
||||
"[\\p{Line_Break = Complex_Context}" +
|
||||
"-\\p{Grapheme_Cluster_Break = Extend}" +
|
||||
"-\\p{Grapheme_Cluster_Break = Control}]]");
|
||||
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}-[\\uff9e\\uff9f]]");
|
||||
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
|
||||
fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]");
|
||||
fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]");
|
||||
fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]");
|
||||
fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
|
||||
fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
|
||||
fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}\\uff9e\\uff9f]");
|
||||
fOtherSet = new UnicodeSet();
|
||||
|
||||
fOtherSet.complement();
|
||||
|
@ -200,21 +203,16 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
p1 = p2; c1 = c2;
|
||||
p2 = p3; c2 = c3;
|
||||
|
||||
// Advancd p3 by (GC Format*) Rules 3, 4
|
||||
p3 = nextGC(fText, p3);
|
||||
if (p3 == -1 || p3 >= fText.length()) {
|
||||
p3 = fText.length();
|
||||
c3 = 0;
|
||||
} else {
|
||||
c3 = UTF16.charAt(fText, p3);
|
||||
while (fFormatSet.contains(c3)) {
|
||||
p3 = moveIndex32(fText, p3, 1);
|
||||
c3 = 0;
|
||||
if (p3 < fText.length()) {
|
||||
c3 = UTF16.charAt(fText, p3);
|
||||
}
|
||||
// Advancd p3 by X(Extend | Format)* Rule 4
|
||||
do {
|
||||
p3 = moveIndex32(fText, p3, 1);
|
||||
c3 = -1;
|
||||
if (p3>=fText.length()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
c3 = UTF16.charAt(fText, p3);
|
||||
}
|
||||
while (fFormatSet.contains(c3) || fExtendSet.contains(c3));
|
||||
|
||||
if (p1 == p2) {
|
||||
// Still warming up the loop. (won't work with zero length strings, but we don't care)
|
||||
|
@ -225,6 +223,14 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
break;
|
||||
}
|
||||
|
||||
// Rule (3) CR x LF
|
||||
// No Extend or Format characters may appear between the CR and LF,
|
||||
// which requires the additional check for p2 immediately following p1.
|
||||
//
|
||||
if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule (5). ALetter x ALetter
|
||||
if (fALetterSet.contains(c1) &&
|
||||
fALetterSet.contains(c2)) {
|
||||
|
@ -417,9 +423,14 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
fSets.add(fBA);
|
||||
fSets.add(fBB);
|
||||
fSets.add(fHY);
|
||||
fSets.add(fH2);
|
||||
fSets.add(fH3);
|
||||
fSets.add(fCL);
|
||||
fSets.add(fEX);
|
||||
fSets.add(fIN);
|
||||
fSets.add(fJL);
|
||||
fSets.add(fJT);
|
||||
fSets.add(fJV);
|
||||
fSets.add(fNS);
|
||||
fSets.add(fOP);
|
||||
fSets.add(fQU);
|
||||
|
@ -433,6 +444,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
fSets.add(fID);
|
||||
fSets.add(fWJ);
|
||||
fSets.add(fSA);
|
||||
fSets.add(fSG);
|
||||
|
||||
}
|
||||
|
||||
|
@ -489,14 +501,14 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
break;
|
||||
}
|
||||
|
||||
// Rule LB 7 - adjust for combining sequences.
|
||||
// Rule LB 9 - adjust for combining sequences.
|
||||
// We do this rule out-of-order because the adjustment does
|
||||
// not effect the way that rules LB 3 through LB 6 match,
|
||||
// and doing it here rather than after LB 6 is substantially
|
||||
// simpler when combining sequences do occur.
|
||||
|
||||
|
||||
// LB 7b Keep combining sequences together.
|
||||
// LB 9 Keep combining sequences together.
|
||||
// advance over any CM class chars at "pos",
|
||||
// result is "nextPos" for the following loop iteration.
|
||||
thisChar = UTF16.charAt(fText, pos);
|
||||
|
@ -514,7 +526,10 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
// LB 7c Treat unattached combining chars as AL
|
||||
// LB 9 Treat X CM* as if it were X
|
||||
// No explicit action required.
|
||||
|
||||
// LB 10 Treat any remaining combining mark as AL
|
||||
if (fCM.contains(thisChar)) {
|
||||
thisChar = 'A';
|
||||
}
|
||||
|
@ -527,12 +542,12 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
|
||||
// LB 3a Always break after hard line breaks,
|
||||
// LB 4 Always break after hard line breaks,
|
||||
if (fBK.contains(prevChar)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// LB 3b Break after CR, LF, NL, but not inside CR LF
|
||||
// LB 5 Break after CR, LF, NL, but not inside CR LF
|
||||
if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
@ -542,14 +557,14 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
break;
|
||||
}
|
||||
|
||||
// LB 3c Don't break before hard line breaks
|
||||
// LB 6 Don't break before hard line breaks
|
||||
if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
|
||||
fLF.contains(thisChar) || fNL.contains(thisChar) ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// LB 4 Don't break before spaces or zero-width space.
|
||||
// LB 7 Don't break before spaces or zero-width space.
|
||||
if (fSP.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
@ -558,16 +573,33 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
|
||||
// LB 5 Break after zero width space
|
||||
// LB 8 Break after zero width space
|
||||
if (fZW.contains(prevChar)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// LB 7 Already done, at top of loop.
|
||||
// LB 9, 10 Already done, at top of loop.
|
||||
//
|
||||
|
||||
|
||||
// LB 8 Don't break before closings.
|
||||
// LB 11
|
||||
// x WJ
|
||||
// WJ x
|
||||
if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// LB 12
|
||||
// (!SP) x GL
|
||||
// GL x
|
||||
if ((!fSP.contains(prevChar)) && fGL.contains(thisChar) ||
|
||||
fGL.contains(prevChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// LB 13 Don't break before closings.
|
||||
// NU x CL and NU x IS are not matched here so that they will
|
||||
// fall into LB 17 and the more general number regular expression.
|
||||
//
|
||||
|
@ -578,13 +610,15 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
|
||||
// LB 9 Don't break after OP SP*
|
||||
// LB 14 Don't break after OP SP*
|
||||
// Scan backwards, checking for this sequence.
|
||||
// The OP char could include combining marks, so we acually check for
|
||||
// OP CM* SP* x
|
||||
tPos = prevPos;
|
||||
while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
|
||||
tPos=moveIndex32(fText, tPos, -1);
|
||||
if (fSP.contains(prevChar)) {
|
||||
while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
|
||||
tPos=moveIndex32(fText, tPos, -1);
|
||||
}
|
||||
}
|
||||
while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
|
||||
tPos=moveIndex32(fText, tPos, -1);
|
||||
|
@ -593,9 +627,10 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
|
||||
// LB 10 Do not break withing "[
|
||||
// LB 15 Do not break withing "[
|
||||
// QU CM* SP* x OP
|
||||
if (fOP.contains(thisChar)) {
|
||||
// Scan backwards from prevChar to see if it is preceded by QU CM* SP*
|
||||
tPos = prevPos;
|
||||
while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
|
||||
tPos = moveIndex32(fText, tPos, -1);
|
||||
|
@ -608,7 +643,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
// LB 11 CL SP* x NS
|
||||
// LB 16 CL SP* x NS
|
||||
if (fNS.contains(thisChar)) {
|
||||
tPos = prevPos;
|
||||
while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
|
||||
|
@ -623,7 +658,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
}
|
||||
|
||||
|
||||
// LB 11a B2 SP* x B2
|
||||
// LB 17 B2 SP* x B2
|
||||
if (fB2.contains(thisChar)) {
|
||||
tPos = prevPos;
|
||||
while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
|
||||
|
@ -637,39 +672,24 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
// LB 11b
|
||||
// x WJ
|
||||
// WJ x
|
||||
if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 12 break after space
|
||||
// LB 18 break after space
|
||||
if (fSP.contains(prevChar)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// LB 13
|
||||
// x GL
|
||||
// GL x
|
||||
if (fGL.contains(thisChar) || fGL.contains(prevChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// LB 14
|
||||
// LB 19
|
||||
// x QU
|
||||
// QU x
|
||||
if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 14a Break around a CB
|
||||
// LB 20 Break around a CB
|
||||
if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// LB 15
|
||||
// LB 21
|
||||
if (fBA.contains(thisChar) ||
|
||||
fHY.contains(thisChar) ||
|
||||
fNS.contains(thisChar) ||
|
||||
|
@ -677,7 +697,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
|
||||
// LB 16
|
||||
// LB 22
|
||||
if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
|
||||
fID.contains(prevChar) && fIN.contains(thisChar) ||
|
||||
fIN.contains(prevChar) && fIN.contains(thisChar) ||
|
||||
|
@ -686,7 +706,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
}
|
||||
|
||||
|
||||
// LB 17 ID x PO (Note: Leading CM behaves like ID)
|
||||
// LB 23 ID x PO (Note: Leading CM behaves like ID)
|
||||
// AL x NU
|
||||
// NU x AL
|
||||
if (fID.contains(prevChar) && fPO.contains(thisChar) ||
|
||||
|
@ -695,7 +715,18 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
|
||||
// LB 18 Numbers
|
||||
// LB 24 Do not break between prefix and letters or ideographs.
|
||||
// PR x ID
|
||||
// PR x AL
|
||||
// PO x AL
|
||||
if (fPR.contains(prevChar) && fID.contains(thisChar) ||
|
||||
fPR.contains(prevChar) && fAL.contains(thisChar) ||
|
||||
fPO.contains(prevChar) && fAL.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// LB 25 Numbers
|
||||
matchVals = LBNumberCheck(fText, prevPos, matchVals);
|
||||
if (matchVals[0] != -1) {
|
||||
// Matched a number. But could have been just a single digit, which would
|
||||
|
@ -718,15 +749,9 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
}
|
||||
if (fPR.contains(prevChar) && fAL.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
if (fPR.contains(prevChar) && fID.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// LB 18b Do not break Korean Syllables
|
||||
// LB 26 Do not break Korean Syllables
|
||||
if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
|
||||
fJV.contains(thisChar) ||
|
||||
fH2.contains(thisChar) ||
|
||||
|
@ -744,7 +769,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
|
||||
// LB 18c more Korean
|
||||
// LB 27 Treat a Korean Syllable Block the same as ID
|
||||
if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
|
||||
fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
|
||||
fIN.contains(thisChar)) {
|
||||
|
@ -762,19 +787,31 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
|
||||
|
||||
|
||||
// LB 19
|
||||
// LB 28 Do not break between alphabetics
|
||||
if (fAL.contains(prevChar) && fAL.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 19b
|
||||
// LB 29 Do not break between numeric punctuation and alphabetics
|
||||
if (fIS.contains(prevChar) && fAL.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 20 Break everywhere else
|
||||
break;
|
||||
|
||||
// LB 30 Do not break between letters, numbers or oridnary symbols and
|
||||
// opening or closing punctuation.
|
||||
// (AL | NU) x OP
|
||||
// CL x (AL | NU)
|
||||
if ((fAL.contains(prevChar) || fNU.contains(prevChar)) &&
|
||||
fOP.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
if (fCL.contains(prevChar) &&
|
||||
(fAL.contains(thisChar) || fNU.contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 31 Break everywhere else
|
||||
break;
|
||||
}
|
||||
|
||||
return pos;
|
||||
|
@ -783,8 +820,8 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
|
||||
|
||||
// Match the following regular expression in the input text.
|
||||
// (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PO CM*)?
|
||||
// 0 1 3 3 3 7 7 7 7 7 9 9 11 11 (match states)
|
||||
// ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PR | PO) CM*)?
|
||||
// 0 0 1 3 3 4 7 7 7 7 9 9 11 11 (match states)
|
||||
// retVals array [0] index of the start of the match, or -1 if no match
|
||||
// [1] index of first char following the match.
|
||||
// Can not use Java regex because need supplementary character support,
|
||||
|
@ -803,7 +840,8 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
|
||||
switch (matchState) {
|
||||
case 0:
|
||||
if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
|
||||
if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
|
||||
cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
|
||||
matchState = 1;
|
||||
break;
|
||||
}
|
||||
|
@ -851,9 +889,9 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
break;
|
||||
}
|
||||
break matchLoop; /* No Match */
|
||||
// (PR CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PO CM*)?
|
||||
// 0 1 3 3 4 7 7 7 7 7 7 9 9 11 11 (match states)
|
||||
|
||||
// ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PR | PO) CM*)?
|
||||
// 0 0 1 3 3 4 7 7 7 7 9 9 11 11 (match states)
|
||||
|
||||
case 7:
|
||||
if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
|
||||
matchState = 7;
|
||||
|
@ -879,6 +917,11 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
matchState = 11;
|
||||
break;
|
||||
}
|
||||
if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
|
||||
matchState = 11;
|
||||
break;
|
||||
}
|
||||
|
||||
break matchLoop; // Match Complete.
|
||||
case 9:
|
||||
if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
|
||||
|
@ -889,6 +932,10 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
matchState = 11;
|
||||
break;
|
||||
}
|
||||
if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
|
||||
matchState = 11;
|
||||
break;
|
||||
}
|
||||
break matchLoop; // Match Complete.
|
||||
case 11:
|
||||
if (cLBType == UCharacter.LineBreak.COMBINING_MARK) {
|
||||
|
@ -937,6 +984,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
UnicodeSet fSTermSet;
|
||||
UnicodeSet fCloseSet;
|
||||
UnicodeSet fOtherSet;
|
||||
UnicodeSet fExtendSet;
|
||||
|
||||
|
||||
|
||||
|
@ -948,11 +996,12 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
fSpSet = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
|
||||
fLowerSet = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
|
||||
fUpperSet = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
|
||||
fOLetterSet = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
|
||||
fOLetterSet = new UnicodeSet("[\\p{Sentence_Break = OLetter}-[\\uff9e\\uff9f]]");
|
||||
fNumericSet = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
|
||||
fATermSet = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
|
||||
fSTermSet = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
|
||||
fCloseSet = new UnicodeSet("[\\p{Sentence_Break = Close}]");
|
||||
fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}\\uff9e\\uff9f]");
|
||||
fOtherSet = new UnicodeSet();
|
||||
|
||||
|
||||
|
@ -967,6 +1016,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
fOtherSet.removeAll(fATermSet);
|
||||
fOtherSet.removeAll(fSTermSet);
|
||||
fOtherSet.removeAll(fCloseSet);
|
||||
fOtherSet.removeAll(fExtendSet);
|
||||
|
||||
fSets.add(fSepSet);
|
||||
fSets.add(fFormatSet);
|
||||
|
@ -980,6 +1030,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
fSets.add(fSTermSet);
|
||||
fSets.add(fCloseSet);
|
||||
fSets.add(fOtherSet);
|
||||
fSets.add(fExtendSet);
|
||||
}
|
||||
|
||||
|
||||
|
@ -991,64 +1042,40 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
fText = s;
|
||||
}
|
||||
|
||||
/*
|
||||
//
|
||||
// moveIndex32. Utility to move an index, needed to avoid
|
||||
// onewanted exceptions, and to simplify porting from C.
|
||||
//
|
||||
static int moveIndex32(StringBuffer s, int from, int delta) {
|
||||
int result;
|
||||
try {
|
||||
result = UTF16.moveCodePointOffset(s, from, delta);
|
||||
}
|
||||
catch(StringIndexOutOfBoundsException e) {
|
||||
result = delta < 0? 0: s.length();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
*/
|
||||
|
||||
// moveBack() Find the "significant" code point preceding the index i.
|
||||
// Skips over format chars, and 2nd-nth chars of grapheme clusters.
|
||||
// The incoming parameter i must be on a boundary already.
|
||||
// Skips over ($Extend | $Format)*
|
||||
//
|
||||
private int moveBack(int i) {
|
||||
int testPos;
|
||||
|
||||
if (i <= 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// We are looking for the index of the first chunk that immediately
|
||||
// precedes the incoming index.
|
||||
testPos = i;
|
||||
for (;;) {
|
||||
testPos = moveIndex32(fText, testPos, -1);
|
||||
int endPos = moveForward(testPos);
|
||||
if (endPos < i) {
|
||||
return endPos;
|
||||
}
|
||||
if (testPos == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int c;
|
||||
int j = i;
|
||||
do {
|
||||
j = moveIndex32(fText, j, -1);
|
||||
c = UTF16.charAt(fText, j);
|
||||
}
|
||||
while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
|
||||
return j;
|
||||
}
|
||||
|
||||
|
||||
int moveForward(int i) {
|
||||
int result = fText.length();
|
||||
if (i < fText.length()) {
|
||||
result = nextGC(fText, i);
|
||||
if (i < 0) {
|
||||
i = fText.length();
|
||||
} else {
|
||||
if (!fSepSet.contains(cAt(i))) {
|
||||
while (result<fText.length() && fFormatSet.contains(cAt(result))) {
|
||||
result = moveIndex32(fText, result, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (i>=fText.length()) {
|
||||
return fText.length();
|
||||
}
|
||||
return result;
|
||||
int c;
|
||||
int j = i;
|
||||
do {
|
||||
j = moveIndex32(fText, j, 1);
|
||||
c = cAt(j);
|
||||
}
|
||||
while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
|
||||
return j;
|
||||
|
||||
}
|
||||
|
||||
int cAt(int pos) {
|
||||
|
@ -1081,9 +1108,21 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
p0 = p1; c0 = c1;
|
||||
p1 = p2; c1 = c2;
|
||||
p2 = p3; c2 = c3;
|
||||
// Advancd p3 by a grapheme cluster. Rules 3, 4
|
||||
|
||||
// Advancd p3 by X(Extend | Format)* Rule 4
|
||||
p3 = moveForward(p3);
|
||||
c3 = cAt(p3);
|
||||
|
||||
// Rule (3) CR x LF
|
||||
if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rule (4) Sep <break>
|
||||
if (fSepSet.contains(c1)) {
|
||||
p2 = p1+1; // Separators don't combine with Extend or Format
|
||||
break;
|
||||
}
|
||||
|
||||
if (p2 >= fText.length()) {
|
||||
// Reached end of string. Always a break position.
|
||||
|
@ -1094,10 +1133,6 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
// Still warming up the loop. (won't work with zero length strings, but we don't care)
|
||||
continue;
|
||||
}
|
||||
// Rule (3). Sep <break>
|
||||
if (fSepSet.contains(c1)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Rule (6). ATerm x Numeric
|
||||
if (fATermSet.contains(c1) && fNumericSet.contains(c2)) {
|
||||
|
@ -1110,6 +1145,8 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
}
|
||||
|
||||
// Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep))* Lower
|
||||
// Note: Sterm | ATerm are added to the negated part of the expression by a
|
||||
// note to the Unicode 5.0 documents.
|
||||
int p8 = p1;
|
||||
while (p8>0 && fSpSet.contains(cAt(p8))) {
|
||||
p8 = moveBack(p8);
|
||||
|
@ -1123,9 +1160,8 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
c = cAt(p8);
|
||||
if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
|
||||
fLowerSet.contains(c) || fSepSet.contains(c) ||
|
||||
fATermSet.contains(c) || fSTermSet.contains(c)) // This last line deviates from
|
||||
// the TR. The TR is wacky.
|
||||
{
|
||||
fATermSet.contains(c) || fSTermSet.contains(c))
|
||||
{
|
||||
break;
|
||||
}
|
||||
p8 = moveForward(p8);
|
||||
|
@ -1134,6 +1170,21 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Rule 8a (STerm | ATerm) Close* Sp* x (Sterm | ATerm)
|
||||
if (fSTermSet.contains(c2) || fATermSet.contains(c2)) {
|
||||
p8 = p1;
|
||||
while (fSpSet.contains(cAt(p8))) {
|
||||
p8 = moveBack(p8);
|
||||
}
|
||||
while (fCloseSet.contains(cAt(p8))) {
|
||||
p8 = moveBack(p8);
|
||||
}
|
||||
c = cAt(p8);
|
||||
if (fSTermSet.contains(c) || fATermSet.contains(c)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep)
|
||||
|
@ -1186,9 +1237,6 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Move an index into a string by n code points.
|
||||
* Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
|
||||
|
@ -1496,7 +1544,7 @@ void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int
|
|||
//--------------------------------------------------------------------------------------------
|
||||
|
||||
int dotsOnLine = 0;
|
||||
while (loopCount < numIterations || numIterations == -1) {
|
||||
while (loopCount < numIterations || numIterations == -1) {
|
||||
if (numIterations == -1 && loopCount % 10 == 0) {
|
||||
// If test is running in an infinite loop, display a periodic tic so
|
||||
// we can tell that it is making progress.
|
||||
|
|
Loading…
Add table
Reference in a new issue