ICU-2924 RBBI, line break rules, monkey test, a few more fixes

X-SVN-Rev: 13402
This commit is contained in:
Andy Heninger 2003-10-13 22:01:53 +00:00
parent e1776d90c6
commit 94a9e101e7
2 changed files with 50 additions and 33 deletions

View file

@ -46,6 +46,18 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
#
# Korean Syllable Definitions
#
$L = [:Hangul_Syllable_Type = L:];
$V = [:Hangul_Syllable_Type = V:];
$T = [:Hangul_Syllable_Type = T:];
$LV = [:Hangul_Syllable_Type = LV:];
$LVT = [:Hangul_Syllable_Type = LVT:];
$HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
#
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
@ -66,7 +78,7 @@ $CLcm = $CL $CM*;
$EXcm = $EX $CM*;
$GLcm = $GL $CM*;
$HYcm = $HY $CM*;
$IDcm = $ID $CM*;
# An ID "character" is either a single $ID code point or a complete Hangul
# syllable, followed by any combining marks.
$IDcm = ($ID | $HangulSyllable) $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$NScm = $NS $CM*;
@ -199,10 +211,10 @@ $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
#
# Reverse Rules.
#
# Back up to a hard break or a space that will cause a boundary.
# Not all spaces cause line breaks. $SpaceGlue represents a sequence
# containing a space that may inhibit a break from occurring.
# TODO: Something more efficient. These rules just back up to hard breaks
# Note that the initial .. is to back over both halves of a CR/LF sequence
# at the current position.
#
!.*;
!. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
#!.*;

View file

@ -2553,12 +2553,14 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
// Advance over any CM class chars. (Line Break CM class is different from
// grapheme cluster CM, so we need to do this even for HangulSyllables.
// Line Break may eat additional stuff as combining, beyond what grapheme cluster did.)
for (;;) {
*nextChar = fText->char32At(nPos);
if (!fCM->contains(*nextChar)) {
break;
if (!(fBK->contains(*posChar) || *posChar==0x0a || *posChar==0x0d || *posChar==0x85)) {
for (;;) {
*nextChar = fText->char32At(nPos);
if (!fCM->contains(*nextChar)) {
break;
}
nPos = fText->moveIndex32(nPos, 1);
}
nPos = fText->moveIndex32(nPos, 1);
}
@ -2652,27 +2654,31 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
}
// LB 10 QU SP* x OP
UnicodeString subStr10(*fText, prevPos);
fLB10Matcher->reset(subStr10);
status = U_ZERO_ERROR;
if (fLB10Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
// TODO: Check status codes
pos = prevPos + fLB10Matcher->start(1, status);
nextPos = prevPos + fLB10Matcher->end(0, status);
thisChar = fText->char32At(pos);
continue;
if (prevPos >= 0) {
UnicodeString subStr10(*fText, prevPos);
fLB10Matcher->reset(subStr10);
status = U_ZERO_ERROR;
if (fLB10Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
// TODO: Check status codes
pos = prevPos + fLB10Matcher->start(1, status);
nextPos = prevPos + fLB10Matcher->end(0, status);
thisChar = fText->char32At(pos);
continue;
}
}
// LB 11 CL SP* x NS
UnicodeString subStr11(*fText, prevPos);
fLB11Matcher->reset(subStr11);
status = U_ZERO_ERROR;
if (fLB11Matcher->lookingAt(status)) { // LB 11 analogue of the LB 10 pattern, presumably /CL CM* SP* (NS) CM*/ - confirm against the matcher's regex
// TODO: Check status codes
pos = prevPos + fLB11Matcher->start(1, status);
nextPos = prevPos + fLB11Matcher->end(0, status);
thisChar = fText->char32At(pos);
continue;
if (prevPos >= 0) {
UnicodeString subStr11(*fText, prevPos);
fLB11Matcher->reset(subStr11);
status = U_ZERO_ERROR;
if (fLB11Matcher->lookingAt(status)) { // LB 11 analogue of the LB 10 pattern, presumably /CL CM* SP* (NS) CM*/ - confirm against the matcher's regex
// TODO: Check status codes
pos = prevPos + fLB11Matcher->start(1, status);
nextPos = prevPos + fLB11Matcher->end(0, status);
thisChar = fText->char32At(pos);
continue;
}
}
// LB 4 Don't break before spaces or zero-width space.
@ -2984,8 +2990,7 @@ void RBBITest::TestMonkey(char *params) {
RBBILineMonkey m;
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
if (params == NULL) {
// TODO: Resolve rule ambiguities, unpin loop count.
loopCount = 2;
loopCount = 50;
}
RunMonkey(bi, m, "line", seed, loopCount);
delete bi;
@ -3041,8 +3046,8 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
}
}
while (loopCount <= numIterations || numIterations == -1) {
if (numIterations == -1 && loopCount % 500 == 0) {
while (loopCount < numIterations || numIterations == -1) {
if (numIterations == -1 && loopCount % 10 == 0) {
// If test is running in an infinite loop, display a periodic tick so
// we can tell that it is making progress.
fprintf(stderr, ".");