mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-2924 RBBI, line break rules, monkey test, a few more fixes
X-SVN-Rev: 13402
This commit is contained in:
parent
e1776d90c6
commit
94a9e101e7
2 changed files with 50 additions and 33 deletions
|
@ -46,6 +46,18 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
|
||||
|
||||
#
|
||||
# Korean Syllable Definitions
|
||||
#
|
||||
$L = [:Hangul_Syllable_Type = L:];
|
||||
$V = [:Hangul_Syllable_Type = V:];
|
||||
$T = [:Hangul_Syllable_Type = T:];
|
||||
|
||||
$LV = [:Hangul_Syllable_Type = LV:];
|
||||
$LVT = [:Hangul_Syllable_Type = LVT:];
|
||||
|
||||
$HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
|
||||
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
|
||||
|
@ -66,7 +78,7 @@ $CLcm = $CL $CM*;
|
|||
$EXcm = $EX $CM*;
|
||||
$GLcm = $GL $CM*;
|
||||
$HYcm = $HY $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$IDcm = ($ID | HangulSyllable) $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$NScm = $NS $CM*;
|
||||
|
@ -199,10 +211,10 @@ $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
|
|||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
# Back up to a hard break or a space that will cause a boundary.
|
||||
# Not all spaces cause line breaks. $SpaceGlue represents a sequence
|
||||
# containing a space that may inhibit a break from occurring.
|
||||
# TODO: Something more efficient. These rules just back up to hard breaks
|
||||
# Note that the initial .. is to back over both halves of a CR/LF sequence
|
||||
# at the current position.
|
||||
#
|
||||
|
||||
!.*;
|
||||
!. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
|
||||
#!.*;
|
||||
|
|
|
@ -2553,12 +2553,14 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
|
|||
// advance over any CM class chars. (Line Break CM class is different from
|
||||
// grapheme cluster CM, so we need to do this even for HangulSyllables.
|
||||
// Line Break may eat additional stuff as combining, beyond what grapheme cluster did.)
|
||||
for (;;) {
|
||||
*nextChar = fText->char32At(nPos);
|
||||
if (!fCM->contains(*nextChar)) {
|
||||
break;
|
||||
if (!(fBK->contains(*posChar) || *posChar==0x0a || *posChar==0x0d || *posChar==0x85)) {
|
||||
for (;;) {
|
||||
*nextChar = fText->char32At(nPos);
|
||||
if (!fCM->contains(*nextChar)) {
|
||||
break;
|
||||
}
|
||||
nPos = fText->moveIndex32(nPos, 1);
|
||||
}
|
||||
nPos = fText->moveIndex32(nPos, 1);
|
||||
}
|
||||
|
||||
|
||||
|
@ -2652,27 +2654,31 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
}
|
||||
|
||||
// LB 10 QU SP* x OP
|
||||
UnicodeString subStr10(*fText, prevPos);
|
||||
fLB10Matcher->reset(subStr10);
|
||||
status = U_ZERO_ERROR;
|
||||
if (fLB10Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
|
||||
// TODO: Check status codes
|
||||
pos = prevPos + fLB10Matcher->start(1, status);
|
||||
nextPos = prevPos + fLB10Matcher->end(0, status);
|
||||
thisChar = fText->char32At(pos);
|
||||
continue;
|
||||
if (prevPos >= 0) {
|
||||
UnicodeString subStr10(*fText, prevPos);
|
||||
fLB10Matcher->reset(subStr10);
|
||||
status = U_ZERO_ERROR;
|
||||
if (fLB10Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
|
||||
// TODO: Check status codes
|
||||
pos = prevPos + fLB10Matcher->start(1, status);
|
||||
nextPos = prevPos + fLB10Matcher->end(0, status);
|
||||
thisChar = fText->char32At(pos);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// LB 11 CL SP* x NS
|
||||
UnicodeString subStr11(*fText, prevPos);
|
||||
fLB11Matcher->reset(subStr11);
|
||||
status = U_ZERO_ERROR;
|
||||
if (fLB11Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
|
||||
// TODO: Check status codes
|
||||
pos = prevPos + fLB11Matcher->start(1, status);
|
||||
nextPos = prevPos + fLB11Matcher->end(0, status);
|
||||
thisChar = fText->char32At(pos);
|
||||
continue;
|
||||
if (prevPos >= 0) {
|
||||
UnicodeString subStr11(*fText, prevPos);
|
||||
fLB11Matcher->reset(subStr11);
|
||||
status = U_ZERO_ERROR;
|
||||
if (fLB11Matcher->lookingAt(status)) { // /CL CM* SP* (NS) CM*/;
|
||||
// TODO: Check status codes
|
||||
pos = prevPos + fLB11Matcher->start(1, status);
|
||||
nextPos = prevPos + fLB11Matcher->end(0, status);
|
||||
thisChar = fText->char32At(pos);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// LB 4 Don't break before spaces or zero-width space.
|
||||
|
@ -2984,8 +2990,7 @@ void RBBITest::TestMonkey(char *params) {
|
|||
RBBILineMonkey m;
|
||||
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
|
||||
if (params == NULL) {
|
||||
// TODO: Resolve rule ambiguities, unpin loop count.
|
||||
loopCount = 2;
|
||||
loopCount = 50;
|
||||
}
|
||||
RunMonkey(bi, m, "line", seed, loopCount);
|
||||
delete bi;
|
||||
|
@ -3041,8 +3046,8 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
|
|||
}
|
||||
}
|
||||
|
||||
while (loopCount <= numIterations || numIterations == -1) {
|
||||
if (numIterations == -1 && loopCount % 500 == 0) {
|
||||
while (loopCount < numIterations || numIterations == -1) {
|
||||
if (numIterations == -1 && loopCount % 10 == 0) {
|
||||
// If test is running in an infinite loop, display a periodic tic so
|
||||
// we can tell that it is making progress.
|
||||
fprintf(stderr, ".");
|
||||
|
|
Loading…
Add table
Reference in a new issue