ICU-2924 RBBI, line break monkey test, better conformance to spec

X-SVN-Rev: 13391
This commit is contained in:
Andy Heninger 2003-10-10 18:57:42 +00:00
parent 58880d85c6
commit 42281a6605
2 changed files with 38 additions and 53 deletions

View file

@ -2509,19 +2509,33 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
fNumberMatcher->reset(s);
}
//
// rule67Adjust
// Line Break TR rules 6 and 7 implementation.
// This deals with combining marks, Hangul Syllables, and other sequences that
// must be treated as if they were something other than what they actually are.
//
// This is factored out into a separate function because it must be applied twice for
// each potential break, once to the chars before the position being checked, then
// again to the text following the possible break.
//
void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
if (pos == -1) {
// Invalid initial position. Happens during the warmup iteration of the
// main loop in next().
return;
}
int32_t nPos = *nextPos;
// LB 6 Treat Korean Syllables as a single unit
int32_t hangultype = u_getIntPropertyValue(*posChar, UCHAR_HANGUL_SYLLABLE_TYPE);
if (hangultype != U_HST_NOT_APPLICABLE) {
nPos = fCharBI->following(pos); // Advance by grapheme cluster, which
// contains the logic to locate Hangul syllables.
// contains the logic to locate Hangul syllables.
}
// LB 7b Keep combining sequences together. Here we just locate the end of "thisChar".
// (except for Hangul, which we did above.)
// LB 7b Keep combining sequences together.
if (hangultype == U_HST_NOT_APPLICABLE) {
// advance over any CM class chars
for (;;) {
@ -2536,7 +2550,7 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
// LB 7a In a SP CM* sequence, treat the SP as an ID
if (nPos != *nextPos && fSP->contains(*posChar)) {
*posChar = 0x3400; // 0x3400 is a CJK Ideograph, linebreak type is ID.
*posChar = 0x4e00; // 0x4e00 is a CJK Ideograph, linebreak type is ID.
}
// LB 7b Treat X CM* as if it were x.
@ -2546,6 +2560,12 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
if (fCM->contains(*posChar)) {
*posChar = 0x41; // thisChar = 'A';
}
// Push the updated nextPos and nextChar back to our caller.
// This only makes a difference if posChar got bigger, by slurping up a
// combining sequence or Hangul syllable.
*nextPos = nPos;
*nextChar = fText->char32At(nPos);
}
@ -2630,46 +2650,11 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
break;
}
// LB 6, LB 7
rule67Adjust(prevPos, &prevChar, &pos, &thisChar);
UChar32 c = fText->char32At(nextPos);
rule67Adjust(pos, &thisChar, &nextPos, &c);
// TODO: move this code into rule67Adjust
// LB 6 Treat Korean Syllables as a single unit
int32_t hangultype = u_getIntPropertyValue(thisChar, UCHAR_HANGUL_SYLLABLE_TYPE);
if (hangultype != U_HST_NOT_APPLICABLE) {
nextPos = fCharBI->following(pos); // Advance by grapheme cluster, which
// contains the logic to locate Hangul syllables.
}
// LB 7b Keep combining sequences together. Here we just locate the end of "thisChar".
// (except for Hangul, which we did above.)
if (hangultype == U_HST_NOT_APPLICABLE) {
// advance over any CM class chars
for (;;) {
UChar32 c = fText->char32At(nextPos);
if (!fCM->contains(c)) {
break;
}
nextPos = fText->moveIndex32(nextPos, 1);
}
}
// LB 7a In a SP CM* sequence, treat the SP as an ID
if (nextCPPos != nextPos && fSP->contains(thisChar)) {
thisChar = 0x3400; // 0x3400 is a CJK Ideograph, linebreak type is ID.
}
// LB 7b Treat X CM* as if it were x.
// No explicit action required.
// LB 7c Treat any remaining combining mark as AL
if (fCM->contains(thisChar)) {
thisChar = 0x41; // thisChar = 'A';
}
// All adjustment of character values and positions is complete.
// If the loop is still warming up - if we haven't shifted the initial
// -1 positions out of prevPos yet - loop back to advance the
// position in the input without any further looking for breaks.
@ -2799,18 +2784,19 @@ fall_through_11:
fNumberMatcher->reset(subStr);
if (fNumberMatcher->lookingAt(status)) {
// TODO: Check status codes
int32_t numEndIdx = prevPos + fNumberMatcher->end(status);
// Matched a number. But could have been just a single digit, which would
// not represent a "no break here" between prevChar and thisChar
int32_t numEndIdx = prevPos + fNumberMatcher->end(status); // idx of first char following num
if (numEndIdx > pos) {
// We got a match on a number of more than one char.
// Need to move "pos" and "nextPos" to reflect the end
// of the number before continuing.
UChar32 lastCharInNumber;
nextPos = numEndIdx;
pos = numEndIdx;
do {
pos = fText->moveIndex32(pos, -1);
lastCharInNumber = fText->char32At(pos);
} while (fCM->contains(lastCharInNumber));
// Number match includes at least our two chars being checked
if (numEndIdx > nextPos) {
// Number match includes additional chars. Update pos and nextPos
// so that next loop iteration will continue at the end of the number,
// checking for breaks between last char in number & whatever follows.
nextPos = numEndIdx;
pos = fCharBI->preceding(numEndIdx);
thisChar = fText->char32At(pos);
}
continue;
}
}

View file

@ -24,7 +24,6 @@
# Temp debugging tests
<line>
<data>•\U00011efa\u275d\u0085<100>\u0c56•</data>
<data>•a\u275d\u0085<100>\u0c56•</data>
########################################################################################