ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes

X-SVN-Rev: 17376
This commit is contained in:
Andy Heninger 2005-03-23 02:13:53 +00:00
parent 475c03442f
commit cd85b65d35
9 changed files with 608 additions and 463 deletions

View file

@ -982,12 +982,8 @@ continueOn:
//
// handlePrevious()
//
// This method backs the iterator back up to a "safe position" in the text.
// This is a position that we know, without any context, may be any position
// not more than 2 breaks away. Occasionally, the position may be less than
// one break away.
// The various calling methods then iterate forward from this safe position to
// the appropriate position to return.
// Iterate backwards, according to the logic of the reverse rules.
// This version handles the exact style backwards rules.
//
// The logic of this function is very similar to handleNext(), above.
//
@ -1005,14 +1001,12 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
int32_t state = START_STATE;
int32_t category;
int32_t lastCategory = 0;
UBool hasPassedStartText = !fText->hasPrevious();
UChar32 c = fText->previous32();
// previous character
int32_t result = fText->getIndex();
int32_t lookaheadStatus = 0;
int32_t lookaheadResult = 0;
int32_t lookaheadTagIdx = 0;
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
RBBIStateTableRow *row;
@ -1031,20 +1025,14 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
// loop until we reach the beginning of the text or transition to state 0
for (;;) {
// if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
if (hasPassedStartText) {
// if we have already considered the start of the text
if (row->fLookAhead != 0 && lookaheadResult == 0) {
result = 0;
}
break;
// end of input is hardwired by rule builder as category #1.
category = 1;
} else {
// look up the current character's category
UTRIE_GET16(&fData->fTrie, c, category);
}
// save the last character's category and look up the current
// character's category
lastCategory = category;
UTRIE_GET16(&fData->fTrie, c, category);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators.
//
@ -1073,8 +1061,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
if (row->fAccepting == -1) {
// Match found, common case, could have lookahead so we move on to check it
result = fText->getIndex();
/// added
fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) value.
}
if (row->fLookAhead != 0) {
@ -1083,7 +1069,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
/// i think we have to back up to read the lookahead character again
/// fText->setIndex(lookaheadResult);
@ -1100,7 +1085,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
fText->setIndex(result);
return result;
}
category = lastCategory;
fText->setIndex(result);
goto continueOn;
@ -1109,7 +1093,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
int32_t r = fText->getIndex();
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
fLastRuleStatusIndex = row->fTagIdx;
goto continueOn;
}
@ -1119,21 +1102,33 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
goto continueOn;
}
lookaheadStatus = 0; // clear out any pending look-ahead matches.
// This is a plain (non-look-ahead) accepting state
if (!lookAheadHardBreak) {
lookaheadStatus = 0; // clear out any pending look-ahead matches.
// But only if not doing the lookAheadHardBreak option,
// which needs to force a break no matter what is going
// on with the rest of the match, i.e. we can't abandon
// a partially completed look-ahead match because some
// other rule matched further than the '/' position
// in the look-ahead match.
}
continueOn:
if (state == STOP_STATE) {
break;
}
// then advance one character backwards
if (hasPassedStartText) {
break;
}
// Advance one character backwards
hasPassedStartText = !fText->hasPrevious();
c = fText->previous32();
}
// Note: the result position isn't what is returned to the user by previous(),
// but where the implementation of previous() turns around and
// starts iterating forward again.
fText->setIndex(result);
return result;

View file

@ -3,7 +3,7 @@
//
/*
***************************************************************************
* Copyright (C) 2002-2004 International Business Machines Corporation *
* Copyright (C) 2002-2005 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
@ -147,7 +147,7 @@ void RBBISetBuilder::build() {
// Find the set of non-overlapping ranges of characters
//
int ni;
for (ni=0; ; ni++) {
for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
if (usetNode==NULL) {
break;
@ -222,6 +222,10 @@ void RBBISetBuilder::build() {
// The groups are numbered, and these group numbers are the set of
// input symbols recognized by the run-time state machine.
//
// Numbering: # 0 (state table column 0) is unused.
// # 1 is reserved - table column 1 is for end-of-input
// # 2 is the first range list.
//
RangeDescriptor *rlSearchRange;
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
@ -232,12 +236,32 @@ void RBBISetBuilder::build() {
}
if (rlRange->fNum == 0) {
fGroupCount ++;
rlRange->fNum = fGroupCount;
rlRange->fNum = fGroupCount+1;
rlRange->setDictionaryFlag();
addValToSets(rlRange->fIncludesSets, fGroupCount);
addValToSets(rlRange->fIncludesSets, fGroupCount+1);
}
}
// Handle input sets that contain the special string {eof}.
// Column 1 of the state table is reserved for EOF on input.
// Add this column value (1) to the equivalent expression
// subtree for each UnicodeSet that contains the string {eof}
// Because EOF is not a character in the normal sense, it doesn't
// affect the computation of ranges or TRIE.
static UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
UnicodeString eofString(eofUString);
for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
if (usetNode==NULL) {
break;
}
UnicodeSet *inputSet = usetNode->fInputSet;
if (inputSet->contains(eofString)) {
addValToSet(usetNode, 1);
}
}
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
@ -296,7 +320,7 @@ void RBBISetBuilder::serializeTrie(uint8_t *where) {
//------------------------------------------------------------------------
//
// addValToSets Add a runtime-mapped input value to each uset from a
// list of uset nodes.
// list of uset nodes. (val corresponds to a state table column.)
// For each of the original Unicode sets - which correspond
// directly to uset nodes - a logically equivalent expression
// is constructed in terms of the remapped runtime input
@ -312,35 +336,38 @@ void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
for (ix=0; ix<sets->size(); ix++) {
RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
leafNode->fVal = (unsigned short)val;
if (usetNode->fLeftChild == NULL) {
usetNode->fLeftChild = leafNode;
leafNode->fParent = usetNode;
} else {
// There are already input symbols present for this set.
// Set up an OR node, with the previous stuff as the left child
// and the new value as the right child.
RBBINode *orNode = new RBBINode(RBBINode::opOr);
orNode->fLeftChild = usetNode->fLeftChild;
orNode->fRightChild = leafNode;
orNode->fLeftChild->fParent = orNode;
orNode->fRightChild->fParent = orNode;
usetNode->fLeftChild = orNode;
orNode->fParent = usetNode;
}
addValToSet(usetNode, val);
}
}
//
//  addValToSet     Add one runtime-mapped input value (a state table column
//                  number) to the expression subtree of a single uset node.
//                  The value is stored in a new leafChar node; if the uset node
//                  already has a subtree, the new leaf is OR-ed onto it.
//
void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
    RBBINode *valNode = new RBBINode(RBBINode::leafChar);
    valNode->fVal = (unsigned short)val;

    RBBINode *existing = usetNode->fLeftChild;
    if (existing != NULL) {
        // Input symbols were already recorded for this set.
        // Build an OR node: earlier subtree on the left, new value on the right,
        // and hang the OR node where the earlier subtree used to be.
        RBBINode *combined = new RBBINode(RBBINode::opOr);
        combined->fLeftChild  = existing;
        combined->fRightChild = valNode;
        existing->fParent     = combined;
        valNode->fParent      = combined;
        usetNode->fLeftChild  = combined;
        combined->fParent     = usetNode;
        return;
    }

    // First value for this set: the leaf becomes the set's only child.
    usetNode->fLeftChild = valNode;
    valNode->fParent     = usetNode;
}
//------------------------------------------------------------------------
//
// getNumOutputSets
// getNumCharCategories
//
//------------------------------------------------------------------------
int32_t RBBISetBuilder::getNumCharCategories() const {
return fGroupCount + 1;
return fGroupCount + 2;
}

View file

@ -2,7 +2,7 @@
// rbbisetb.h
/*
**********************************************************************
* Copyright (c) 2001-2004, International Business Machines
* Copyright (c) 2001-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -79,7 +79,8 @@ public:
~RBBISetBuilder();
void build();
void addValToSets(UVector *sets, uint32_t val);
void addValToSets(UVector *sets, uint32_t val);
void addValToSet (RBBINode *usetNode, uint32_t val);
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
// runtime state machine, which are the same as
// columns in the DFA state table
@ -110,8 +111,9 @@ private:
// Groups correspond to character categories -
// groups of ranges that are in the same original UnicodeSets.
// fGroupCount is the index of the last used group.
// The value is also the number of columns in the RBBI state table being compiled.
// Index 0 is not used. Funny counting.
// fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
// State table column 0 is not used. Column 1 is for end-of-input.
// column 2 is for group 0. Funny counting.
int32_t fGroupCount;
RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class

View file

@ -4,7 +4,7 @@
/*
**********************************************************************
* Copyright (c) 2002-2004, International Business Machines
* Copyright (c) 2002-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -343,7 +343,7 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
// get a list of all endmarker nodes.
tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
// get a list all leaf nodes
// get a list all leaf nodes
tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
if (U_FAILURE(*fStatus)) {
return;
@ -383,10 +383,12 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
// into the rule file.
if (fRB->fLBCMNoChain) {
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
U_ASSERT(c != -1);
ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
if (cLBProp == U_LB_COMBINING_MARK) {
continue;
if (c != -1) {
// c == -1 occurs with sets containing only the {eof} marker string.
ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
if (cLBProp == U_LB_COMBINING_MARK) {
continue;
}
}
}
@ -572,14 +574,29 @@ void RBBITableBuilder::flagAcceptingStates() {
// Any non-zero value for fAccepting means this is an accepting node.
// The value is what will be returned to the user as the break status.
// If no other value was specified, force it to -1.
sd->fAccepting = endMarker->fVal;
if (sd->fAccepting == 0) {
sd->fAccepting = -1;
if (sd->fAccepting==0) {
// State hasn't been marked as accepting yet. Do it now.
sd->fAccepting = endMarker->fVal;
if (sd->fAccepting == 0) {
sd->fAccepting = -1;
}
}
if (sd->fAccepting==-1 && endMarker->fVal != 0) {
// Both lookahead and non-lookahead accepting for this state.
// Favor the look-ahead. Expedient for line break.
// TODO: need a more elegant resolution for conflicting rules.
sd->fAccepting = endMarker->fVal;
}
// implicit else:
// if sd->fAccepting already had a value other than 0 or -1, leave it be.
// If the end marker node is from a look-ahead rule, set
// the fLookAhead field for this state also.
if (endMarker->fLookAheadEnd) {
// TODO: don't change value if already set?
// TODO: allow for more than one active look-ahead rule in engine.
// Make value here an index to a side array in engine?
sd->fLookAhead = sd->fAccepting;
}
}
@ -644,7 +661,7 @@ void RBBITableBuilder::flagTaggedStates() {
}
for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
tagNode = (RBBINode *)tagNodes.elementAt(i);
for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
@ -686,9 +703,9 @@ void RBBITableBuilder::mergeRuleStatusVals() {
fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
}
// For each state
for (n=0; n<fDStates->size(); n++) {
// For each state
for (n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
UVector *thisStatesTagValues = sd->fTagVals;
if (thisStatesTagValues == NULL) {
@ -704,7 +721,7 @@ void RBBITableBuilder::mergeRuleStatusVals() {
sd->fTagsIdx = -1;
int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
int32_t nextTagGroupStart = 0;
// Loop runs once per group of tags in the global list
while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
thisTagGroupStart = nextTagGroupStart;
@ -718,21 +735,21 @@ void RBBITableBuilder::mergeRuleStatusVals() {
// The lengths match, go ahead and compare the actual tag values
// between this state and the group from the global list.
for (i=0; i<thisStatesTagValues->size(); i++) {
if (thisStatesTagValues->elementAti(i) !=
if (thisStatesTagValues->elementAti(i) !=
fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
// Mismatch.
// Mismatch.
break;
}
}
if (i == thisStatesTagValues->size()) {
// We found a set of tag values in the global list that match
// those for this state. Use them.
sd->fTagsIdx = thisTagGroupStart;
break;
break;
}
}
if (sd->fTagsIdx == -1) {
// No suitable entry in the global tag list already. Add one
sd->fTagsIdx = fRB->fRuleStatusVals->size();
@ -1027,7 +1044,7 @@ void RBBITableBuilder::printRuleStatusTable() {
RBBIDebugPrintf("index | tags \n");
RBBIDebugPrintf("-------------------\n");
while (nextRecord < tbl->size()) {
thisRecord = nextRecord;
nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
@ -1057,7 +1074,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
fTagVals = NULL;
fPositions = NULL;
fDtran = NULL;
fDtran = new UVector(lastInputSymbol+1, *fStatus);
if (U_FAILURE(*fStatus)) {
return;

View file

@ -14,7 +14,43 @@
!!chain;
!!LBCMNoChain;
!!lookAheadHardBreak;
#
# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
# and only used for the line break rules.
#
# It is used in the implementation of the incredibly annoying rule LB 7c
# which says to treat any combining mark that is not attached to a base
# character as if it were of class AL (alphabetic).
#
# The problem occurs in the reverse rules.
#
# Consider a sequence like the following, with correct breaks as shown:
# LF ID CM AL AL
# ^ ^ ^
# Then consider the sequence without the initial ID (ideographic)
# LF CM AL AL
# ^ ^
# Our CM, which in the first example was attached to the ideograph,
# is now unattached, becomes an alpha, and joins in with the other
# alphas.
#
# When iterating forwards, these sequences do not present any problems.
# When iterating backwards, we need to look ahead when encountering
# a CM to see whether it attaches to something further on or not.
# (Look-ahead in a reverse rule is looking towards the start)
#
# If the CM is unattached, we need to force a break.
#
# !!lookAheadHardBreak forces the run time state machine to
# stop immediately when a look ahead rule ( '/' operator) matches,
# and set the match position to that of the look-ahead operator,
# no matter what other rules may be in play at the time.
#
# See rule LB 19 for an example.
#
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
@ -60,7 +96,7 @@ $ZW = [:LineBreak = ZWSpace:];
# XX (Unknown, unassigned)
# as $AL (Alphabetic)
#
$ALPlus = $AL | $AI | $SA | $XX;
$ALPlus = [$AL $AI $SA $XX];
#
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
@ -87,7 +123,6 @@ $OPcm = $OP $CM*;
$POcm = $PO $CM*;
$PRcm = $PR $CM*;
$QUcm = $QU $CM*;
$SPcm = $SP $CM*;
$SYcm = $SY $CM*;
$WJcm = $WJ $CM*;
@ -120,45 +155,87 @@ $OP $CM+;
$PO $CM+;
$PR $CM+;
$QU $CM+;
$SP $CM+;
$SY $CM+;
$WJ $CM+;
#
# CAN_CM is the set of characters that may combine with CM combining chars.
# Note that Linebreak UAX 14's concept of a combining char and the rules
# for what they can combine with are _very_ different from the rest of Unicode.
#
# Note that $CM itself is left out of this set. If CM is needed as a base
# it must be listed separately in the rule.
#
$CAN_CM = [^$BK $CR $LF $NL $ZW $SP $CM]; # Bases that can take CMs
$CANT_CM = [ $BK $CR $LF $NL $ZW $SP $CM]; # Bases that can't take CMs
#
# Rule LB 3
$LB3Breaks = [$BK $CR $LF $NL];
$LB3NonBreaks = [^$BK $CR $LF $NL];
$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
# AL_FOLLOW set of chars that can unconditionally follow an AL
# Needed in rules where stand-alone $CM s are treated as AL.
# Chaining is disabled with CM because it causes other failures,
# so for this one case we need to manually list out longer sequences.
#
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
$AL_FOLLOW_CM = [$CL $EX $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus];
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
$LB3NonBreaks? $LB3Breaks {100};
$LB5NonBreaks $CM* $LB3Breaks {100};
#
# Rule LB 3 Mandatory (Hard) breaks.
#
$LB3Breaks = [$BK $CR $LF $NL];
$LB3NonBreaks = [^$BK $CR $LF $NL];
$LB3NonBreaks? $LB3Breaks {100}; # LB 3c do not break before hard breaks.
$CAN_CM $CM* $LB3Breaks {100};
$CM+ $LB3Breaks {100};
$CR $LF {100};
# LB 4 x SP
# x ZW
$ZW [$SP $ZW];
$LB5NonBreaks $CM* [$SP $ZW];
$LB3NonBreaks [$SP $ZW];
$CAN_CM $CM* [$SP $ZW];
$CM+ [$SP $ZW];
# LB 5 Break after zero width space
$LB5Breaks = [$LB3Breaks $ZW];
$LB5Breaks = [$LB3Breaks $ZW];
$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
# LB 7 Combining marks. $SP $CM needs to behave like $ID.
# X $CM needs to behave like X, where X is not $SP.
# LB 7 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
#
$LB5NonBreaks $CM+; # Stick together any combining sequences that don't match other rules.
# See definition of $CAN_CM.
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
$CM+;
#
# LB 8
$LB5NonBreaks $CM* $CL;
$LB5NonBreaks $CM* $EX;
$LB5NonBreaks $CM* $IS;
$LB5NonBreaks $CM* $SY;
#
$LB5NonBreaks $CL;
$CAN_CM $CM* $CL;
$CM+ $CL; # by rule 7c, stand-alone CM behaves as AL
$LB5NonBreaks $EX;
$CAN_CM $CM* $EX;
$CM+ $EX; # by rule 7c, stand-alone CM behaves as AL
$LB5NonBreaks $IS;
$CAN_CM $CM* $IS;
$CM+ $IS; # by rule 7c, stand-alone CM behaves as AL
$LB5NonBreaks $SY;
$CAN_CM $CM* $SY;
$CM+ $SY; # by rule 7c, stand-alone CM behaves as AL
#
# LB 9
$OPcm $SP* .?;
$OPcm $SP* $LB5NonBreaks $CM*;
#
$OPcm $SP* $CAN_CM $CM*;
$OPcm $SP* $CANT_CM;
$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 7c, stand-alone CM behaves as AL
# LB 10
$QUcm $SP* $OPcm;
@ -167,58 +244,71 @@ $QUcm $SP* $OPcm;
$CLcm $SP* $NScm;
# LB 11a
($B2cm)+;
$B2cm $SP* $B2cm;
# LB 11b Word Joiner
$LB5NonBreaks $CM* $WJcm;
$WJcm .?;
#
$CAN_CM $CM* $WJcm;
$LB5NonBreaks $WJcm;
$CM+ $WJcm;
$WJcm [^$CAN_CM];
$WJcm $CAN_CM $CM*;
# LB 12
$LB12NonBreaks = [$LB5NonBreaks - $SP];
$LB12NonBreaks = [$LB5NonBreaks - [$SP]];
$LB12Breaks = [$LB5Breaks $SP];
# LB 13
# x GL
$LB12NonBreaks $CM* $GLcm;
$SP $CM+ $GLcm; # LB7a SP CM+ behaves as ID
$CM+ $GLcm;
# GL x
$GLcm .?;
#
$GLcm $LB12Breaks;
$GLcm $LB12NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
$GLcm $SP $CM+; # SP CM+ behaves as ID
# TODO: I don't think we need this rule.
# All but $CM will chain off of preceding rule.
# $GLcm will pick up the CM case by itself.
# LB 14
# x QU
$LB12NonBreaks $CM* $QUcm;
$SP $CM+ $QUcm; # LB7a SP CM+ behaves as ID
$CM+ $QUcm;
# QU x
$QUcm .?;
$QUcm $LB12NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
$QUcm $SP $CM+; # SP CM+ behaves as ID
# TODO: I don't think this rule is needed.
# LB 14a
# <break> $CB
# $CB <break>
$LB14NonBreaks = [$LB12NonBreaks - $CB];
$LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;
# LB 15
$LB14CanBreakAfter ($BAcm | $HYcm | $NScm);
$BBcm [^$CB];
$BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*;
$LB14NonBreaks $CM* ($BAcm | $HYcm | $NScm);
$BBcm [^$CB]; # $BB x
$BBcm $LB14NonBreaks $CM*;
# LB 16
$ALcm $INcm;
$CM+ $INcm; # by rule 7c, any otherwise unattached CM behaves as AL
$IDcm $INcm;
$SP $CM+ $INcm; # by rule 7a, $SP $CM behaves like ID
$INcm $INcm;
$NUcm $INcm;
# $LB 17
($IDcm | $SP $CM+) $POcm;
$ALcm+ $NUcm; # includes $LB19
$CM+ $NUcm; # Rule 7c
$NUcm $ALcm+;
$IDcm $POcm;
$ALcm $NUcm; # includes $LB19
$CM+ $NUcm; # Rule 7c, any otherwise unattached CM behaves as AL
$NUcm $ALcm;
# LB 18
$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* $CLcm? $POcm?;
@ -237,7 +327,10 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
# LB 19
$CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
$ALcm $ALcm;
$CM+ $ALcm; # The $CM+ is from rule 7C, and unattached CM is treated as AL
# LB 19b
$IScm $ALcm;
#
@ -269,39 +362,86 @@ $CM+ $OP;
$CM+ $PO;
$CM+ $PR;
$CM+ $QU;
$CM+ $SP;
$CM+ $SY;
$CM+ $WJ;
$CM+;
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] [whatever]
# The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
[$BK $CR $LF $NL $ZW {eof}] |
$SP+ $CM+ $SP |
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB9 will match, need to suppress this break.
# LB9 says OP SP* x .
# becomes OP SP* x AL
# becomes OP SP* x CM+ AL_FOLLOW
#
# Further note: the $AL in [$AL {eof}] is only to work around
# a rule compiler bug which complains about
# empty sets otherwise.
#
# Sequences of the form (shown forwards)
# [CANT_CM] <break> [CM] <break> [PR]
# The CM needs to behave as an AL
# This rule is concerned about getting the second of the two <breaks> in place.
#
[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
# LB 3
$LB3Breaks $LB3NonBreaks;
$LB3Breaks $CM* $LB5NonBreaks;
$LB3Breaks [$LB3NonBreaks-$CM];
$LB3Breaks $CM+ $CAN_CM;
$LF $CR;
# LB 4 x SP
# x ZW
[$SP $ZW] $LB3NonBreaks;
[$SP $ZW] $CM* $LB5NonBreaks;
[$SP $ZW] [$LB3NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;
# LB 5 Break after zero width space
# LB 7 Combining marks.
# $SP $CM needs to behave like $ID.
# X $CM needs to behave like X, where X is not $SP.
# X $CM needs to behave like X, where X is not $SP or controls.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $LB5NonBreaks;
$CM+ $CAN_CM;
# LB 8
$CL $CM* $LB5NonBreaks;
$EX $CM* $LB5NonBreaks;
$IS $CM* $LB5NonBreaks;
$SY $CM* $LB5NonBreaks;
$CL $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;
$CL [$LB5NonBreaks-$CM];
$EX [$LB5NonBreaks-$CM];
$IS [$LB5NonBreaks-$CM];
$SY [$LB5NonBreaks-$CM];
# Rule 9 & 8 together.
# This really wants to chain at the $CM+ (which is acting as an $AL)
# except for $CM chaining being disabled.
[$CL $EX $IS $SY] $CM+ $SP+ $CM* $OP;
# LB 9 OP SP* x
#
$CM* $CAN_CM $SP* $CM* $OP;
$CANT_CM $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 7, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
$SY $CM $SP+ $OP; # Experiment. Remove.
# LB 9
$LB5NonBreaks $SP* $CM* $OP;
# LB 10
$CM* $OP $SP* $CM* $QU;
@ -310,48 +450,57 @@ $CM* $OP $SP* $CM* $QU;
$CM* $NS $SP* $CM* $CL;
# LB 11a
($CM* $B2)+;
$CM* $B2 $SP* $CM* $B2;
# LB 11b
$CM* $WJ $CM* $LB5NonBreaks;
$CM* $LB5NonBreaks $CM* $WJ;
. $CM* $WJ;
$CM* $WJ $CM* $CAN_CM;
$CM* $WJ [$LB5NonBreaks-$CM];
$CANT_CM $CM* $WJ;
$CM* $CAN_CM $CM* $WJ;
# LB 12
# LB 14
$CM* $GL $CM* $LB12NonBreaks;
$CM* $GL $CM+ $SP;
$CM* $LB5NonBreaks $CM* $GL;
# LB 13
# x GL
#
$CM* $GL $CM* [$LB12NonBreaks-$CM];
# LB 14
$CM* $QU $CM* $LB12NonBreaks;
$CM* $QU $CM+ $SP; # CM+ SP behaves as ID
$CM* $LB5NonBreaks $CM* $QU;
#
# GL x
#
$CANT_CM $CM* $GL;
$CM* $CAN_CM $CM* $GL;
# LB 14a
$BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP);
#
# LB 14
#
$CM* $QU $CM* $CAN_CM; # . x QU
$CM* $QU $LB12NonBreaks;
$CM* $CAN_CM $CM* $QU; # QU x .
$CANT_CM $CM* $QU;
# LB 15
$CM* ($BA | $HY | $NS) $BackLB14CanBreakAfter;
($CM* ($BA | $HY | $NS))+ $CM+ / $LB5Breaks;
[$CR $LF $BK $NL $ZW] $CM* $BB;
$CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;
$CM* ($BA | $HY | $NS) $CM* [$LB14NonBreaks-$CM]; # . x (BA | HY | NS)
$CM* [$LB14NonBreaks-$CM] $CM* $BB; # BB x .
[^$CB] $CM* $BB; #
# LB 16
$CM* $IN $CM* $ALPlus;
# by rule 7c, any otherwise unattached CM behaves as AL
$CM* $IN $CM+ / $LB5Breaks;
$CM* $IN $CM* ($ID | $CM $SP);
$CM* $IN $CM* $ID;
$CM* $IN $CM* $IN;
$CM* $IN $CM* $NU;
# $LB 17
$CM* $PO $CM* ($ID | $CM $SP);
$CM* $NU ($CM* $ALPlus)+; # includes $LB19
$CM* $NU $CM+ / $LB5Breaks; # Rule 7c
$CM* $PO $CM* $ID;
$CM* $NU $CM* $ALPlus;
$CM* $ALPlus $CM* $NU;
# LB 18
@ -371,13 +520,13 @@ $CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
# LB 19
$CM* $ALPlus $CM* $ALPlus;
# The $CM* is from rule 7C, and unattached CM is treated as AL
$CM* $ALPlus $CM* $IS;
$CM* $ALPlus $CM+ / $LB5Breaks;
## problem state table can't handle lookahead when it is at the
## start of the string, currently handled in the rbbi code
## todo fix this
# LB 19b
$CM* $ALPlus $CM* $IS;
## -------------------------------------------------
@ -395,6 +544,7 @@ $SP+ $CM* $QU;
# LB 11
$SP+ $CM* $CL;
$SP+ $CM* $B2;
# LB 18
($CM* ($IS | $SY))+ $CM* $NU;
@ -404,18 +554,14 @@ $CL $CM* ($NU | $IS | $SY);
!!safe_forward;
# LB 7
[^$BK $CR $LF $NL $ZW $SP] $CM+;
$SP $CM+ / [^$CM];
# Skip forward over all character classes that are involved in
# rules containing patterns with possibly more than one char
# of context.
#
# It might be slightly more efficient to have specific rules
# instead of one generic one, but only if we could
# turn off rule chaining. We don't want to move more
# than necessary.
#
[$CM $OP $QU $CL $B2 $PR $HY $SP]+ [^$CM $OP $QU $CL $B2 $PR $HY];
# LB 9
$OP $CM* $SP+;
# LB 10
$QU $CM* $SP+;
# LB 11
$CL $CM* $SP+;
# LB 18
$CM* $PRcm? ($OPcm | $HYcm)? $NU;

View file

@ -1,5 +1,5 @@
#
# Copyright (C) 2002-2004, International Business Machines Corporation and others.
# Copyright (C) 2002-2005, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: sent.txt
@ -111,10 +111,12 @@ $End? $Join [$RULE12 - $Sp - $Close];
# forces a break at the beginning of text "$Sp blah blah blah"
# remember the break iterators takes the longest match
$End? $Join $Sp / [^$Term $ATerm $Sp $Close];
$NOT_T_A_S_C = [^$Term $ATerm $Sp $Close];
$End? $Join $Sp / [$NOT_T_A_S_C {eof}];
# forces a break at the beginning of text "$Close blah blah blah"
$End? $Join $Close / [^$Term $ATerm $Close];
$NOT_T_A_C = [^$Term $ATerm $Close];
$End? $Join $Close / [$NOT_T_A_C {eof}];
## -------------------------------------------------

View file

@ -17,62 +17,30 @@
!!chain;
$Katakana = [[:Script = KATAKANA:]
[:name = VERTICAL KANA REPEAT MARK:]
[:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:]
[:name = VERTICAL KANA REPEAT MARK UPPER HALF:]
[:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:]
[:name = VERTICAL KANA REPEAT MARK LOWER HALF:]
[:name = KATAKANA-HIRAGANA VOICED SOUND MARK:]
[:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:]
[:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:]
[:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
- [:Ideographic:]
- $Katakana
- [:Script = Hiragana:]
- [:Script = Thai:]
- [:Script = Lao:]
- [:Grapheme_Extend = TRUE:]];
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
[:name = HEBREW PUNCTUATION GERSHAYIM:]
[:name = RIGHT SINGLE QUOTATION MARK:]
[:name = HYPHENATION POINT:]
[:name = COLON:]];
$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = COLON:]];
$Numeric = [:LineBreak = Numeric:];
$ExtendNumLet = [[:Connector_Punctuation:]
- [:name = KATAKANA MIDDLE DOT:]
- [:name = HALFWIDTH KATAKANA MIDDLE DOT:]];
#
# Character Class Definitions.
# The names are those from TR29.
#
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
$ALetter = [\p{Word_Break = ALetter}];
$MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{General_Category=Connector_Punctuation}];
$CR = \u000d;
$LF = \u000a;
$Extend = [[:Grapheme_Extend = TRUE:]];
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
$Format = [[:Cf:] - $Extend];
$Hiragana = [:Hiragana:];
$Ideographic = [:IDEOGRAPHIC:];
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterEx = $ALetter $Extend*;
$NumericEx = $Numeric $Extend*;
$MidNumEx = $MidNum $Extend*;
$MidLetterEx = $MidLetter $Extend*;
$KatakanaEx = $Katakana $Extend*;
$KatakanaEx = $Katakana $Extend*;
$ALetterEx = $ALetter $Extend*;
$MidLetterEx = $MidLetter $Extend*;
$MidNumEx = $MidNum $Extend*;
$NumericEx = $Numeric $Extend*;
$ExtendNumLetEx = $ExtendNumLet $Extend*;
## -------------------------------------------------
@ -81,7 +49,7 @@ $ExtendNumLetEx = $ExtendNumLet $Extend*;
# Rule 3 - don't break grapheme clusters.
# see character breaks
# see character breaks.
$CR $LF;
[^$Control] $Extend+;
@ -114,8 +82,9 @@ $NumericEx $Format* $MidNumEx $Format* $NumericEx {100};
# rule 13
$KatakanaEx $Format* $KatakanaEx {300};
$Hiragana $Extend* {300};
$Ideographic $Extend* {400};
[\p{Hiragana}] $Extend* {300}; # To get tag values.
[\p{Ideographic}] $Extend* {400};
# rule 13a/b

View file

@ -320,16 +320,16 @@ static void printStringBreaks(UnicodeString ustr, int expected[],
j ++;
}
u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c,
u_isUAlphabetic(c),
printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c,
u_isUAlphabetic(c),
u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
u_isalnum(c),
u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
u_charType(c),
U_SHORT_PROPERTY_NAME),
u_getPropertyValueName(UCHAR_LINE_BREAK,
u_getIntPropertyValue(c,
UCHAR_LINE_BREAK),
u_isalnum(c),
u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
u_charType(c),
U_SHORT_PROPERTY_NAME),
u_getPropertyValueName(UCHAR_LINE_BREAK,
u_getIntPropertyValue(c,
UCHAR_LINE_BREAK),
U_SHORT_PROPERTY_NAME),
name);
}
@ -390,9 +390,9 @@ void RBBITest::TestMixedThaiLineBreak()
// @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
// start
// start
ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
@ -406,9 +406,9 @@ void RBBITest::TestMixedThaiLineBreak()
ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
// @suwit - end of changes
// Arabic numerals should always be separated from surrounding Thai text
@ -449,7 +449,7 @@ ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u
ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);
*/
/* remove the old data sample.
/* remove the old data sample.
// The Unicode Linebreak TR says do not break before or after quotes.
// So this test is changed to not break around the quote.
// TODO: should Thai break around the quotes, like the original behavior here?
@ -517,21 +517,21 @@ void RBBITest::TestThaiWordBreak() {
ADD_DATACHUNK(thaiWordSelection, NULL, 0, status); // Break at start of data
// @suwit -- Thai sample data from GVT Guideline
// start
ADD_DATACHUNK(thaiWordSelection, "\\u0E2B\\u0E19\\u0E36\\u0E48\\u0E07", 0, status); //5
ADD_DATACHUNK(thaiWordSelection, "\\u0E04\\u0E33", 0, status); //7
ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E17\\u0E22", 0, status); //10
ADD_DATACHUNK(thaiWordSelection, "\\u0E2A\\u0E32\\u0E21\\u0E32\\u0E23\\u0E16", 0, status); //16
ADD_DATACHUNK(thaiWordSelection, "\\u0E1B\\u0E23\\u0E30\\u0E01\\u0E2D\\u0E1A", 0, status); //22
ADD_DATACHUNK(thaiWordSelection, "\\u0E14\\u0E49\\u0E27\\u0E22", 0, status); //26
ADD_DATACHUNK(thaiWordSelection, "\\u0e2b\\u0e25\\u0e32\\u0e22", 0, status); //30
ADD_DATACHUNK(thaiWordSelection, "\\u0e1e\\u0e22\\u0e32\\u0e07\\u0e04\\u0e4c", 0, status); //36
// @suwit -- Thai sample data from GVT Guideline
// start
ADD_DATACHUNK(thaiWordSelection, "\\u0E2B\\u0E19\\u0E36\\u0E48\\u0E07", 0, status); //5
ADD_DATACHUNK(thaiWordSelection, "\\u0E04\\u0E33", 0, status); //7
ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E17\\u0E22", 0, status); //10
ADD_DATACHUNK(thaiWordSelection, "\\u0E2A\\u0E32\\u0E21\\u0E32\\u0E23\\u0E16", 0, status); //16
ADD_DATACHUNK(thaiWordSelection, "\\u0E1B\\u0E23\\u0E30\\u0E01\\u0E2D\\u0E1A", 0, status); //22
ADD_DATACHUNK(thaiWordSelection, "\\u0E14\\u0E49\\u0E27\\u0E22", 0, status); //26
ADD_DATACHUNK(thaiWordSelection, "\\u0e2b\\u0e25\\u0e32\\u0e22", 0, status); //30
ADD_DATACHUNK(thaiWordSelection, "\\u0e1e\\u0e22\\u0e32\\u0e07\\u0e04\\u0e4c", 0, status); //36
// @suwit - end of changes
/* remove the old data sample because Thai translation of the Wizard of Oz is not good testcase for wordbreak API.
ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); //2
ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); //5
ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); //6
@ -598,11 +598,11 @@ void RBBITest::TestBug3818() {
UErrorCode status = U_ZERO_ERROR;
// Four Thai words...
static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
UnicodeString thaiStr(thaiWordData);
RuleBasedBreakIterator* bi =
RuleBasedBreakIterator* bi =
(RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
if (U_FAILURE(status) || bi == NULL) {
errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
@ -655,7 +655,7 @@ void RBBITest::TestJapaneseWordBreak() {
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
{
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
switch (index) {
case 0: name = "TestBug4153072";
if(exec) TestBug4153072(); break;
@ -708,6 +708,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
if(exec) TestBug3818(); break;
case 19: name = "TestJapaneseWordBreak";
if(exec) TestJapaneseWordBreak(); break;
case 20: name = "TestDebug";
if(exec) TestDebug(); break;
default: name = ""; break; //needed to end loop
}
@ -1890,9 +1892,8 @@ void RBBITest::TestLineBreakData() {
int32_t expectedPos; // Expected break position (index into test string)
bi->setText(testString);
pos = bi->first(); // TODO: break iterators always return a match at pos 0.
pos = bi->next(); // Line Break TR says no match at position 0.
// Resolve.
pos = bi->first();
pos = bi->next();
for (; pos != BreakIterator::DONE; ) {
expectedPos = expectedBreaks.elementAti(expectedI);
@ -2117,27 +2118,14 @@ RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0),
fSets = new UVector(status);
fKatakanaSet = new UnicodeSet("[\\p{script=KATAKANA}"
"\\u3031-\\u3035\\u309b\\u309c\\u30a0"
"\\u30fc\\uff70\\uff9e\\uff9f]", status);
const UnicodeString ALetterStr( "[[\\p{Alphabetic}"
"\\u00a0" // NBSP
"\\u05f3]" // Hebrew punct Geresh
"-[\\p{Ideographic}]"
"-[\\p{Script=Thai}]"
"-[\\p{Script=Lao}]"
"-[\\p{Script=Hiragana}]"
"-[\\p{Grapheme_Extend}]]");
fALetterSet = new UnicodeSet(ALetterStr, status);
fALetterSet->removeAll(*fKatakanaSet);
fMidLetterSet = new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027\\u003a]", status);
fMidNumSet = new UnicodeSet("[[\\p{Line_Break=Infix_Numeric}]-[\\u003a]]", status);
fNumericSet = new UnicodeSet("[\\p{Line_Break=Numeric}]", status);
fFormatSet = new UnicodeSet("[\\p{Format}-[\\u200c\\u200d]]", status);
fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
fExtendNumLetSet = new UnicodeSet("[\\p{Pc}-[\\u30fb\\uff65]]", status);
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]", status);
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]", status);
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]", status);
fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]", status);
fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]", status);
fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]", status);
fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]", status);
fExtendNumLetSet = new UnicodeSet("[\\p{General_Category = Connector_Punctuation}]", status);
fOtherSet = new UnicodeSet();
if(U_FAILURE(status)) {
deferredStatus = status;
@ -2180,7 +2168,7 @@ void RBBIWordMonkey::setText(const UnicodeString &s) {
int32_t RBBIWordMonkey::next(int32_t prevPos) {
UErrorCode status = U_ZERO_ERROR;
int p0, p1, p2, p3; // Indices of the significant code points around the
int p0, p1, p2, p3; // Indices of the significant code points around the
// break position being tested. The candidate break
// location is before p2.
@ -2221,7 +2209,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
U_ASSERT(U_SUCCESS(status));
c3 = fText->char32At(p3);
}
if (p1 == p2) {
// Still warming up the loop. (won't work with zero length strings, but we don't care)
continue;
@ -2275,7 +2263,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
// Rule (11) Numeric (MidNum | MidNumLet) x Numeric
if ( fNumericSet->contains(c0) &&
fMidNumSet->contains(c1) &&
fMidNumSet->contains(c1) &&
fNumericSet->contains(c2)) {
continue;
}
@ -2286,7 +2274,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
fNumericSet->contains(c3)) {
continue;
}
// Rule (13) Katakana x Katakana
if (fKatakanaSet->contains(c1) &&
fKatakanaSet->contains(c2)) {
@ -2301,7 +2289,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
}
// Rule 13b
if (fExtendNumLetSet->contains(c1) &&
if (fExtendNumLetSet->contains(c1) &&
(fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
fKatakanaSet->contains(c2))) {
continue;
@ -2362,7 +2350,7 @@ public:
virtual UVector *charClasses();
virtual void setText(const UnicodeString &s);
virtual int32_t next(int32_t i);
virtual void rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
virtual void rule7Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
UVector *fSets;
@ -2381,14 +2369,14 @@ private:
UnicodeSet *fBA;
UnicodeSet *fBB;
UnicodeSet *fHY;
UnicodeSet *fH2;
UnicodeSet *fH3;
UnicodeSet *fH2;
UnicodeSet *fH3;
UnicodeSet *fCL;
UnicodeSet *fEX;
UnicodeSet *fIN;
UnicodeSet *fJL;
UnicodeSet *fJV;
UnicodeSet *fJT;
UnicodeSet *fJL;
UnicodeSet *fJV;
UnicodeSet *fJT;
UnicodeSet *fNS;
UnicodeSet *fOP;
UnicodeSet *fQU;
@ -2409,12 +2397,11 @@ private:
int32_t *fOrigPositions;
RegexMatcher *fNumberMatcher;
RegexMatcher *fLB10Matcher;
RegexMatcher *fLB11Matcher;
};
RBBILineMonkey::RBBILineMonkey()
RBBILineMonkey::RBBILineMonkey()
{
UErrorCode status = U_ZERO_ERROR;
@ -2439,7 +2426,7 @@ RBBILineMonkey::RBBILineMonkey()
fCL = new UnicodeSet("[\\p{Line_break=CL}]", status);
fEX = new UnicodeSet("[\\p{Line_break=EX}]", status);
fIN = new UnicodeSet("[\\p{Line_break=IN}]", status);
fJL = new UnicodeSet("[\\p{Line_break=JL}]", status);
fJL = new UnicodeSet("[\\p{Line_break=JL}]", status);
fJV = new UnicodeSet("[\\p{Line_break=JV}]", status);
fJT = new UnicodeSet("[\\p{Line_break=JT}]", status);
fNS = new UnicodeSet("[\\p{Line_break=NS}]", status);
@ -2460,8 +2447,6 @@ RBBILineMonkey::RBBILineMonkey()
fAL->addAll(*fAI); // Default behavior for AI is identical to AL
fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
fSets->addElement(fBK, status);
fSets->addElement(fCR, status);
fSets->addElement(fLF, status);
@ -2497,9 +2482,6 @@ RBBILineMonkey::RBBILineMonkey()
fSets->addElement(fID, status);
fSets->addElement(fWJ, status);
fSets->addElement(fSA, status);
// fSets->addElement(fXX, status);
fNumberMatcher = new RegexMatcher(
"(\\p{Line_Break=PR}\\p{Line_Break=CM}*)?"
@ -2507,19 +2489,7 @@ RBBILineMonkey::RBBILineMonkey()
"\\p{Line_Break=NU}\\p{Line_Break=CM}*"
"((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
"(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
"(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
0, status);
fLB10Matcher = new RegexMatcher(
"\\p{Line_Break=QU}\\p{Line_Break=CM}*"
"\\p{Line_Break=SP}*"
"(\\p{Line_Break=OP})\\p{Line_Break=CM}*",
0, status);
fLB11Matcher = new RegexMatcher(
"\\p{Line_Break=CL}\\p{Line_Break=CM}*"
"\\p{Line_Break=SP}*"
"(\\p{Line_Break=NS})\\p{Line_Break=CM}*",
"(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
0, status);
fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
@ -2537,7 +2507,7 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
}
//
// rule67Adjust
// rule7Adjust
// Line Break TR rules 6 and 7 implementation.
// This deals with combining marks and other sequences that
// must be treated as if they were something other than what they actually are.
@ -2546,21 +2516,20 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
// each potential break, once to the chars before the position being checked, then
// again to the text following the possible break.
//
void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
void RBBILineMonkey::rule7Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
if (pos == -1) {
// Invalid initial position. Happens during the warmup iteration of the
// Invalid initial position. Happens during the warmup iteration of the
// main loop in next().
return;
}
int32_t nPos = *nextPos;
// LB 7b Keep combining sequences together.
// advance over any CM class chars. Note that Line Break CM is different
// from normal Mc general category.
if (!(fBK->contains(*posChar) || fZW->contains(*posChar) || *posChar==0x0a
|| *posChar==0x0d || *posChar==0x85)) {
// from normal Mc general category.
if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
*posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
for (;;) {
*nextChar = fText->char32At(nPos);
if (!fCM->contains(*nextChar)) {
@ -2569,16 +2538,11 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
nPos = fText->moveIndex32(nPos, 1);
}
}
// LB 7a In a SP CM* sequence, treat the SP as an ID
if (nPos != *nextPos && fSP->contains(*posChar)) {
*posChar = 0x4e00; // 0x4e00 is a CJK Ideograph, linebreak type is ID.
}
// LB 7b Treat X CM* as if it were x.
// No explicit action required.
// No explicit action required.
// LB 7c Treat any remaining combining mark as AL
if (fCM->contains(*posChar)) {
*posChar = 0x41; // thisChar = 'A';
@ -2635,11 +2599,27 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
nextCPPos = fText->moveIndex32(pos, 1);
nextPos = nextCPPos;
// Break at end of text.
// Rule LB2 - Break at end of text.
if (pos >= fText->length()) {
break;
}
// Rule LB 7 - adjust for combining sequences.
// We do this one out-of-order because the adjustment does not change anything
// that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
// be applied.
rule7Adjust(prevPos, &prevChar, &pos, &thisChar);
nextCPPos = nextPos = fText->moveIndex32(pos, 1);
c = fText->char32At(nextPos);
rule7Adjust(pos, &thisChar, &nextPos, &c);
// If the loop is still warming up - if we haven't shifted the initial
// -1 positions out of prevPos yet - loop back to advance the
// position in the input without any further looking for breaks.
if (prevPos == -1) {
continue;
}
// LB 3a Always break after hard line breaks,
if (fBK->contains(prevChar)) {
break;
@ -2661,33 +2641,6 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
continue;
}
// LB 10 QU SP* x OP
if (prevPos >= 0) {
UnicodeString subStr10(*fText, prevPos);
fLB10Matcher->reset(subStr10);
status = U_ZERO_ERROR;
if (fLB10Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
// TODO: Check status codes
pos = prevPos + fLB10Matcher->start(1, status);
nextPos = prevPos + fLB10Matcher->end(0, status);
thisChar = fText->char32At(pos);
continue;
}
}
// LB 11 CL SP* x NS
if (prevPos >= 0) {
UnicodeString subStr11(*fText, prevPos);
fLB11Matcher->reset(subStr11);
status = U_ZERO_ERROR;
if (fLB11Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
// TODO: Check status codes
pos = prevPos + fLB11Matcher->start(1, status);
nextPos = prevPos + fLB11Matcher->end(0, status);
thisChar = fText->char32At(pos);
continue;
}
}
// LB 4 Don't break before spaces or zero-width space.
if (fSP->contains(thisChar)) {
@ -2703,37 +2656,8 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
break;
}
// LB LB 7
rule67Adjust(prevPos, &prevChar, &pos, &thisChar);
nextCPPos = fText->moveIndex32(pos, 1);
nextPos = nextCPPos;
c = fText->char32At(nextPos);
// another percularity of LB 4 - Dont break before space
if (fSP->contains(thisChar)) {
continue;
}
rule67Adjust(pos, &thisChar, &nextPos, &c);
// If the loop is still warming up - if we haven't shifted the initial
// -1 positions out of prevPos yet - loop back to advance the
// position in the input without any further looking for breaks.
if (prevPos == -1) {
continue;
}
// Re-apply rules 3c, 4 because these could be affected by having
// a new thisChar from doing rule 6 or 7.
if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || // 3c
fBK->contains(thisChar)) {
continue;
}
if (fSP->contains(thisChar)) { // LB 4
continue;
}
if (fZW->contains(thisChar)) { // LB 4
continue;
}
// LB 7 Already done, at top of loop.
//
// LB 8 Don't break before closings.
@ -2751,7 +2675,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
// Scan backwards, checking for this sequence.
// The OP char could include combining marks, so we actually check for
// OP CM* SP*
// Another Twist: The Rule 67 fixes may have changed a CP CM
// Another Twist: The Rule 67 fixes may have changed a SP CM
// sequence into a ID char, so before scanning back through spaces,
// verify that prevChar is indeed a space. The prevChar variable
// may differ from fText[prevPos]
@ -2769,12 +2693,58 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
}
// LB 11a B2 x B2
if (fB2->contains(thisChar) && fB2->contains(prevChar)) {
continue;
// LB 10 QU SP* x OP
if (fSP->contains(prevChar) && fOP->contains(thisChar)) {
// Scan backwards from prevChar to see if it is preceded by QU CM* SP*
int tPos = prevPos;
while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
tPos = fText->moveIndex32(tPos, -1);
}
while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
tPos = fText->moveIndex32(tPos, -1);
}
if (fQU->contains(fText->char32At(tPos))) {
continue;
}
}
// LB 11b
// LB 11 CL SP* x NS
// Scan backwards for SP* CM* CL
if (fNS->contains(thisChar)) {
int tPos = prevPos;
while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
tPos = fText->moveIndex32(tPos, -1);
}
while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
tPos = fText->moveIndex32(tPos, -1);
}
if (fCL->contains(fText->char32At(tPos))) {
continue;
}
}
// LB 11a B2 SP* x B2
if (fB2->contains(thisChar)) {
// Scan backwards, checking for the B2 CM* SP* sequence.
tPos = prevPos;
if (fSP->contains(prevChar)) {
while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
tPos=fText->moveIndex32(tPos, -1);
}
}
while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
tPos=fText->moveIndex32(tPos, -1);
}
if (fB2->contains(fText->char32At(tPos))) {
continue;
}
}
// LB 11b
// x WJ
// WJ x
if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
@ -2786,7 +2756,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
break;
}
// LB 13
// LB 13
// x GL
// GL x
if (fGL->contains(thisChar) || fGL->contains(prevChar)) {
@ -2805,7 +2775,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
break;
}
// LB 15
// LB 15
if (fBA->contains(thisChar) ||
fHY->contains(thisChar) ||
fNS->contains(thisChar) ||
@ -2818,28 +2788,27 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
fID->contains(prevChar) && fIN->contains(thisChar) ||
fIN->contains(prevChar) && fIN->contains(thisChar) ||
fNU->contains(prevChar) && fIN->contains(thisChar) ) {
continue;
continue;
}
// LB 17 ID x PO (Note: Leading CM behaves like ID)
// LB 17 ID x PO
// AL x NU
// NU x AL
if (fID->contains(prevChar) && fPO->contains(thisChar) ||
fCM->contains(prevChar) && fPO->contains(thisChar) ||
fAL->contains(prevChar) && fNU->contains(thisChar) ||
fNU->contains(prevChar) && fAL->contains(thisChar) ) {
continue;
continue;
}
// LB 18 Numbers
UnicodeString subStr18(*fText, prevPos);
fNumberMatcher->reset(subStr18);
if (fNumberMatcher->lookingAt(status)) {
// TODO: Check status codes
if (fNumberMatcher->lookingAt(prevPos, status)) {
if (U_FAILURE(status)) {
break;
}
// Matched a number. But could have been just a single digit, which would
// not represent a "no break here" between prevChar and thisChar
int32_t numEndIdx = prevPos + fNumberMatcher->end(status); // idx of first char following num
int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
if (numEndIdx > pos) {
// Number match includes at least our two chars being checked
if (numEndIdx > nextPos) {
@ -2847,7 +2816,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
// so that next loop iteration will continue at the end of the number,
// checking for breaks between last char in number & whatever follows.
nextPos = numEndIdx;
pos = fCharBI->preceding(numEndIdx);
pos = fCharBI->preceding(numEndIdx);
thisChar = fText->char32At(pos);
while (fCM->contains(thisChar)) {
pos = fCharBI->preceding(pos);
@ -2861,29 +2830,28 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
if (fPR->contains(prevChar) && fAL->contains(thisChar)) {
continue;
}
if (fPR->contains(prevChar) && fID->contains(thisChar)) {
continue;
}
// LB 18b
if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
fJV->contains(thisChar) ||
fH2->contains(thisChar) ||
fH3->contains(thisChar))) {
continue;
}
if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
fJV->contains(thisChar) ||
fH2->contains(thisChar) ||
fH3->contains(thisChar))) {
continue;
}
if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
(fJV->contains(thisChar) || fJT->contains(thisChar))) {
continue;
(fJV->contains(thisChar) || fJT->contains(thisChar))) {
continue;
}
if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
fJT->contains(thisChar)) {
continue;
}
// LB 18c more Korean
if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
@ -2902,9 +2870,6 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
// LB 19
if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
continue;
@ -2917,9 +2882,9 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
// LB 20 Break everywhere else
break;
}
return pos;
}
@ -2970,8 +2935,6 @@ RBBILineMonkey::~RBBILineMonkey() {
delete fCharBI;
delete fNumberMatcher;
delete fLB10Matcher;
delete fLB11Matcher;
}
@ -3014,9 +2977,9 @@ static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t d
}
#endif
static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
BreakIterator *bi,
int expected[],
int expected[],
int expectedcount)
{
int count = 0;
@ -3026,7 +2989,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count] = i;
if (count < expectedcount && expected[count] != i) {
test->errln("break forward test failed: expected %d but got %d",
test->errln("break forward test failed: expected %d but got %d",
expected[count], i);
break;
}
@ -3034,7 +2997,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
}
if (count != expectedcount) {
printStringBreaks(ustr, expected, expectedcount);
test->errln("break test failed: missed %d match",
test->errln("break test failed: missed %d match",
expectedcount - count);
return;
}
@ -3058,7 +3021,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
count --;
if (forward[count] != i) {
test->errln("happy break test reverse failed: expected %d but got %d",
test->errln("happy break test reverse failed: expected %d but got %d",
forward[count], i);
break;
}
@ -3079,7 +3042,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
return;
}
}
}
}
}
void RBBITest::TestWordBreaks(void)
@ -3091,8 +3054,8 @@ void RBBITest::TestWordBreaks(void)
UErrorCode status = U_ZERO_ERROR;
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
UChar str[300];
static const char *strlist[] =
UChar str[300];
static const char *strlist[] =
{
"\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
"\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
@ -3168,8 +3131,8 @@ void RBBITest::TestWordBoundary(void)
UErrorCode status = U_ZERO_ERROR;
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
UChar str[50];
static const char *strlist[] =
UChar str[50];
static const char *strlist[] =
{
"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
@ -3212,7 +3175,7 @@ void RBBITest::TestWordBoundary(void)
UnicodeString ustr(str);
int forward[50];
int count = 0;
bi->setText(ustr);
int prev = 0;
int i;
@ -3223,7 +3186,7 @@ void RBBITest::TestWordBoundary(void)
for (j = prev + 1; j < i; j ++) {
if (bi->isBoundary(j)) {
printStringBreaks(ustr, forward, count);
errln("happy boundary test failed: expected %d not a boundary",
errln("happy boundary test failed: expected %d not a boundary",
j);
return;
}
@ -3231,7 +3194,7 @@ void RBBITest::TestWordBoundary(void)
}
if (!bi->isBoundary(i)) {
printStringBreaks(ustr, forward, count);
errln("happy boundary test failed: expected %d a boundary",
errln("happy boundary test failed: expected %d a boundary",
i);
return;
}
@ -3247,8 +3210,8 @@ void RBBITest::TestLineBreaks(void)
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
UChar str[50];
static const char *strlist[] =
UChar str[50];
static const char *strlist[] =
{
"\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
"\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
@ -3324,14 +3287,14 @@ void RBBITest::TestSentBreaks(void)
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
UChar str[100];
static const char *strlist[] =
UChar str[100];
static const char *strlist[] =
{
"Now\ris\nthe\r\ntime\n\rfor\r\r",
"This\n",
"Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
"\"Sentence ending with a quote.\" Bye.",
" (This is it). Testing the sentence iterator. \"This isn't it.\"",
" (This is it). Testing the sentence iterator. \"This isn't it.\"",
"Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
"Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
"Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
@ -3427,7 +3390,7 @@ void RBBITest::TestMonkey(char *params) {
RBBILineMonkey m;
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
if (params == NULL) {
loopCount = 50;
loopCount = loopCount / 5; // Line break runs slower than the others.
}
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "line", seed, loopCount);
@ -3642,7 +3605,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
for (ci=0; ci<2; ci++) { // Number of items to include in error text.
for (;;) {
if (endContext >= testText.length()) {break;}
if (expectedBreaks[endContext-1] != 0) {
if (expectedBreaks[endContext-1] != 0) {
if (count == 0) break;
count --;
}
@ -3655,7 +3618,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
/***if (strcmp(errorType, "next()") == 0) {
startContext = 0;
endContext = testText.length();
printStringBreaks(testText, expected, expectedCount);
}***/
@ -3704,5 +3667,28 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
#endif
}
//
// TestDebug - A place-holder test for debugging purposes.
//             For putting in fragments of other tests that can be invoked
//             for tracing without a lot of unwanted extra stuff happening.
//
//             The body is compiled out (#if 0); enable it locally when a
//             scratch test is needed. As written, the disabled fragment walks
//             a short Thai string with a word break iterator and prints each
//             boundary position returned by next().
//
void RBBITest::TestDebug(void) {
#if 0
    UErrorCode  status = U_ZERO_ERROR;
    int         pos;

    RuleBasedBreakIterator* bi =
    //  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
        // Construct the Locale with ordinary temporary syntax; naming the
        // constructor explicitly (Locale::Locale) is rejected by conforming compilers.
        (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);

    // Do not touch the iterator if it could not be created.
    if (U_FAILURE(status) || bi == NULL) {
        errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
        delete bi;
        return;
    }

    UnicodeString s("\\u0E2B\\u0E19\\u0E36\\u0E48\\u0E07\\u0E04\\u0E33");
    s = s.unescape();
    bi->setText(s);

    // bi->last();
    // Print every boundary, including the terminating BreakIterator::DONE.
    do {
        pos = bi->next();
        printf("%d\n", pos);
    } while (pos != BreakIterator::DONE);

    // The factory returns an owned object; release it.
    delete bi;
#endif
}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1999-2004, International Business Machines Corporation and
* COPYRIGHT:
* Copyright (c) 1999-2005, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/************************************************************************
@ -31,20 +31,20 @@ class RBBIMonkeyKind;
*/
class RBBITest: public IntlTest {
public:
RBBITest();
virtual ~RBBITest();
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
/**
* Tests rule status return values
**/
**/
void TestStatusReturn();
/**
* Run the Unicode Line Break test data.
**/
**/
void TestLineBreakData();
/**
@ -58,8 +58,8 @@ public:
void TestBug4153072();
void TestJapaneseLineBreak();
void TestThaiLineBreak();
void TestMixedThaiLineBreak();
void TestMaiyamok();
void TestMixedThaiLineBreak();
void TestMaiyamok();
void TestThaiWordBreak();
void TestMonkey(char *params);
@ -73,16 +73,17 @@ public:
void TestSentBreaks();
void TestBug3818();
void TestJapaneseWordBreak();
void TestDebug();
/***********************/
private:
/**
* internal methods to prepare test data
**/
/**
* Perform tests of BreakIterator forward and backward functionality
* Perform tests of BreakIterator forward and backward functionality
* on different kinds of iterators (word, sentence, line and character).
* It tests the methods first(), next(), current(), preceding(), following()
* previous() and isBoundary().
@ -110,7 +111,7 @@ private:
**/
void testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td);
/**
* Internal method to perform tests of BreakIterator multiple selection functionality
* Internal method to perform tests of BreakIterator multiple selection functionality
* on different kinds of iterators (word, sentence, line and character)
**/
void doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td);