mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-4157 Unicode 4.1 RBBI rule updates + required implementation fixes
X-SVN-Rev: 17376
This commit is contained in:
parent
475c03442f
commit
cd85b65d35
9 changed files with 608 additions and 463 deletions
|
@ -982,12 +982,8 @@ continueOn:
|
|||
//
|
||||
// handlePrevious()
|
||||
//
|
||||
// This method backs the iterator back up to a "safe position" in the text.
|
||||
// This is a position that we know, without any context, may be any position
|
||||
// not more than 2 breaks away. Occasionally, the position may be less than
|
||||
// one break away.
|
||||
// The various calling methods then iterate forward from this safe position to
|
||||
// the appropriate position to return.
|
||||
// Iterate backwards, according to the logic of the reverse rules.
|
||||
// This version handles the exact style backwards rules.
|
||||
//
|
||||
// The logic of this function is very similar to handleNext(), above.
|
||||
//
|
||||
|
@ -1005,14 +1001,12 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
|
||||
int32_t state = START_STATE;
|
||||
int32_t category;
|
||||
int32_t lastCategory = 0;
|
||||
UBool hasPassedStartText = !fText->hasPrevious();
|
||||
UChar32 c = fText->previous32();
|
||||
// previous character
|
||||
int32_t result = fText->getIndex();
|
||||
int32_t lookaheadStatus = 0;
|
||||
int32_t lookaheadResult = 0;
|
||||
int32_t lookaheadTagIdx = 0;
|
||||
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
|
||||
|
||||
RBBIStateTableRow *row;
|
||||
|
@ -1031,20 +1025,14 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
|
||||
// loop until we reach the beginning of the text or transition to state 0
|
||||
for (;;) {
|
||||
// if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
|
||||
if (hasPassedStartText) {
|
||||
// if we have already considered the start of the text
|
||||
if (row->fLookAhead != 0 && lookaheadResult == 0) {
|
||||
result = 0;
|
||||
}
|
||||
break;
|
||||
// end of input is hardwired by rule builder as category #1.
|
||||
category = 1;
|
||||
} else {
|
||||
// look up the current character's category
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
}
|
||||
|
||||
// save the last character's category and look up the current
|
||||
// character's category
|
||||
lastCategory = category;
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators.
|
||||
//
|
||||
|
@ -1073,8 +1061,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
if (row->fAccepting == -1) {
|
||||
// Match found, common case, could have lookahead so we move on to check it
|
||||
result = fText->getIndex();
|
||||
/// added
|
||||
fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) value.
|
||||
}
|
||||
|
||||
if (row->fLookAhead != 0) {
|
||||
|
@ -1083,7 +1069,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
// Lookahead match is completed. Set the result accordingly, but only
|
||||
// if no other rule has matched further in the mean time.
|
||||
result = lookaheadResult;
|
||||
fLastRuleStatusIndex = lookaheadTagIdx;
|
||||
lookaheadStatus = 0;
|
||||
/// i think we have to back up to read the lookahead character again
|
||||
/// fText->setIndex(lookaheadResult);
|
||||
|
@ -1100,7 +1085,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
fText->setIndex(result);
|
||||
return result;
|
||||
}
|
||||
category = lastCategory;
|
||||
fText->setIndex(result);
|
||||
|
||||
goto continueOn;
|
||||
|
@ -1109,7 +1093,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
int32_t r = fText->getIndex();
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
fLastRuleStatusIndex = row->fTagIdx;
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
|
@ -1119,21 +1102,33 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
goto continueOn;
|
||||
}
|
||||
|
||||
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
||||
|
||||
// This is a plain (non-look-ahead) accepting state
|
||||
if (!lookAheadHardBreak) {
|
||||
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
||||
// But only if not doing the lookAheadHardBreak option,
|
||||
// which needs to force a break no matter what is going
|
||||
// on with the rest of the match, i.e. we can't abandon
|
||||
// a partially completed look-ahead match because some
|
||||
// other rule matched further than the '/' position
|
||||
// in the look-ahead match.
|
||||
}
|
||||
|
||||
continueOn:
|
||||
if (state == STOP_STATE) {
|
||||
break;
|
||||
}
|
||||
|
||||
// then advance one character backwards
|
||||
if (hasPassedStartText) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Advance one character backwards
|
||||
hasPassedStartText = !fText->hasPrevious();
|
||||
c = fText->previous32();
|
||||
}
|
||||
|
||||
// Note: the result position isn't what is returned to the user by previous(),
|
||||
// but where the implementation of previous() turns around and
|
||||
// starts iterating forward again.
|
||||
|
||||
fText->setIndex(result);
|
||||
|
||||
return result;
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
//
|
||||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2002-2004 International Business Machines Corporation *
|
||||
* Copyright (C) 2002-2005 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
***************************************************************************
|
||||
*/
|
||||
|
@ -147,7 +147,7 @@ void RBBISetBuilder::build() {
|
|||
// Find the set of non-overlapping ranges of characters
|
||||
//
|
||||
int ni;
|
||||
for (ni=0; ; ni++) {
|
||||
for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
|
||||
usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
|
||||
if (usetNode==NULL) {
|
||||
break;
|
||||
|
@ -222,6 +222,10 @@ void RBBISetBuilder::build() {
|
|||
// The groups are numbered, and these group numbers are the set of
|
||||
// input symbols recognized by the run-time state machine.
|
||||
//
|
||||
// Numbering: # 0 (state table column 0) is unused.
|
||||
// # 1 is reserved - table column 1 is for end-of-input
|
||||
// # 2 is the first range list.
|
||||
//
|
||||
RangeDescriptor *rlSearchRange;
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
|
||||
|
@ -232,12 +236,32 @@ void RBBISetBuilder::build() {
|
|||
}
|
||||
if (rlRange->fNum == 0) {
|
||||
fGroupCount ++;
|
||||
rlRange->fNum = fGroupCount;
|
||||
rlRange->fNum = fGroupCount+1;
|
||||
rlRange->setDictionaryFlag();
|
||||
addValToSets(rlRange->fIncludesSets, fGroupCount);
|
||||
addValToSets(rlRange->fIncludesSets, fGroupCount+1);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle input sets that contain the special string {eof}.
|
||||
// Column 1 of the state table is reserved for EOF on input.
|
||||
// Add this column value (1) to the equivalent expression
|
||||
// subtree for each UnicodeSet that contains the string {eof}
|
||||
// Because EOF is not a character in the normal sense, it doesn't
|
||||
// affect the computation of ranges or TRIE.
|
||||
static UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
|
||||
UnicodeString eofString(eofUString);
|
||||
for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
|
||||
usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
|
||||
if (usetNode==NULL) {
|
||||
break;
|
||||
}
|
||||
UnicodeSet *inputSet = usetNode->fInputSet;
|
||||
if (inputSet->contains(eofString)) {
|
||||
addValToSet(usetNode, 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
|
||||
|
||||
|
@ -296,7 +320,7 @@ void RBBISetBuilder::serializeTrie(uint8_t *where) {
|
|||
//------------------------------------------------------------------------
|
||||
//
|
||||
// addValToSets Add a runtime-mapped input value to each uset from a
|
||||
// list of uset nodes.
|
||||
// list of uset nodes. (val corresponds to a state table column.)
|
||||
// For each of the original Unicode sets - which correspond
|
||||
// directly to uset nodes - a logically equivalent expression
|
||||
// is constructed in terms of the remapped runtime input
|
||||
|
@ -312,35 +336,38 @@ void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
|
|||
|
||||
for (ix=0; ix<sets->size(); ix++) {
|
||||
RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
|
||||
RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
|
||||
leafNode->fVal = (unsigned short)val;
|
||||
if (usetNode->fLeftChild == NULL) {
|
||||
usetNode->fLeftChild = leafNode;
|
||||
leafNode->fParent = usetNode;
|
||||
} else {
|
||||
// There are already input symbols present for this set.
|
||||
// Set up an OR node, with the previous stuff as the left child
|
||||
// and the new value as the right child.
|
||||
RBBINode *orNode = new RBBINode(RBBINode::opOr);
|
||||
orNode->fLeftChild = usetNode->fLeftChild;
|
||||
orNode->fRightChild = leafNode;
|
||||
orNode->fLeftChild->fParent = orNode;
|
||||
orNode->fRightChild->fParent = orNode;
|
||||
usetNode->fLeftChild = orNode;
|
||||
orNode->fParent = usetNode;
|
||||
}
|
||||
addValToSet(usetNode, val);
|
||||
}
|
||||
}
|
||||
|
||||
// addValToSet   Add a single runtime-mapped input value to one uset node.
//               A new leafChar node carrying val is hung below usetNode:
//               directly as its left child when the node has none yet,
//               otherwise as the right child of a new opOr node whose left
//               child is the previously existing subtree.
//               Note: the results of the two 'new' calls are not checked here.
void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
|
||||
RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
|
||||
// val is narrowed to unsigned short when stored in the leaf node.
leafNode->fVal = (unsigned short)val;
|
||||
if (usetNode->fLeftChild == NULL) {
|
||||
// First value for this set: the leaf becomes the set node's only child.
usetNode->fLeftChild = leafNode;
|
||||
leafNode->fParent = usetNode;
|
||||
} else {
|
||||
// There are already input symbols present for this set.
|
||||
// Set up an OR node, with the previous stuff as the left child
|
||||
// and the new value as the right child.
|
||||
RBBINode *orNode = new RBBINode(RBBINode::opOr);
|
||||
orNode->fLeftChild = usetNode->fLeftChild;
|
||||
orNode->fRightChild = leafNode;
|
||||
// Re-parent both children to the new OR node before linking it in.
orNode->fLeftChild->fParent = orNode;
|
||||
orNode->fRightChild->fParent = orNode;
|
||||
usetNode->fLeftChild = orNode;
|
||||
orNode->fParent = usetNode;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// getNumOutputSets
|
||||
// getNumCharCategories
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
int32_t RBBISetBuilder::getNumCharCategories() const {
|
||||
return fGroupCount + 1;
|
||||
return fGroupCount + 2;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
// rbbisetb.h
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001-2004, International Business Machines
|
||||
* Copyright (c) 2001-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -79,7 +79,8 @@ public:
|
|||
~RBBISetBuilder();
|
||||
|
||||
void build();
|
||||
void addValToSets(UVector *sets, uint32_t val);
|
||||
void addValToSets(UVector *sets, uint32_t val);
|
||||
void addValToSet (RBBINode *usetNode, uint32_t val);
|
||||
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
|
||||
// runtime state machine, which are the same as
|
||||
// columns in the DFA state table
|
||||
|
@ -110,8 +111,9 @@ private:
|
|||
// Groups correspond to character categories -
|
||||
// groups of ranges that are in the same original UnicodeSets.
|
||||
// fGroupCount is the index of the last used group.
|
||||
// The value is also the number of columns in the RBBI state table being compiled.
|
||||
// Index 0 is not used. Funny counting.
|
||||
// fGroupCount+2 is also the number of columns in the RBBI state table being compiled.
|
||||
// State table column 0 is not used. Column 1 is for end-of-input.
|
||||
// column 2 is for group 0. Funny counting.
|
||||
int32_t fGroupCount;
|
||||
|
||||
RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2004, International Business Machines
|
||||
* Copyright (c) 2002-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -343,7 +343,7 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
|
|||
// get a list of all endmarker nodes.
|
||||
tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
|
||||
|
||||
// get a list of all leaf nodes
|
||||
// get a list of all leaf nodes
|
||||
tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
|
@ -383,10 +383,12 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
|
|||
// into the rule file.
|
||||
if (fRB->fLBCMNoChain) {
|
||||
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
|
||||
U_ASSERT(c != -1);
|
||||
ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
|
||||
if (cLBProp == U_LB_COMBINING_MARK) {
|
||||
continue;
|
||||
if (c != -1) {
|
||||
// c == -1 occurs with sets containing only the {eof} marker string.
|
||||
ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
|
||||
if (cLBProp == U_LB_COMBINING_MARK) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -572,14 +574,29 @@ void RBBITableBuilder::flagAcceptingStates() {
|
|||
// Any non-zero value for fAccepting means this is an accepting node.
|
||||
// The value is what will be returned to the user as the break status.
|
||||
// If no other value was specified, force it to -1.
|
||||
sd->fAccepting = endMarker->fVal;
|
||||
if (sd->fAccepting == 0) {
|
||||
sd->fAccepting = -1;
|
||||
|
||||
if (sd->fAccepting==0) {
|
||||
// State hasn't been marked as accepting yet. Do it now.
|
||||
sd->fAccepting = endMarker->fVal;
|
||||
if (sd->fAccepting == 0) {
|
||||
sd->fAccepting = -1;
|
||||
}
|
||||
}
|
||||
if (sd->fAccepting==-1 && endMarker->fVal != 0) {
|
||||
// Both lookahead and non-lookahead accepting for this state.
|
||||
// Favor the look-ahead. Expedient for line break.
|
||||
// TODO: need a more elegant resolution for conflicting rules.
|
||||
sd->fAccepting = endMarker->fVal;
|
||||
}
|
||||
// implicit else:
|
||||
// if sd->fAccepting already had a value other than 0 or -1, leave it be.
|
||||
|
||||
// If the end marker node is from a look-ahead rule, set
|
||||
// the fLookAhead field for this state also.
|
||||
if (endMarker->fLookAheadEnd) {
|
||||
// TODO: don't change value if already set?
|
||||
// TODO: allow for more than one active look-ahead rule in engine.
|
||||
// Make value here an index to a side array in engine?
|
||||
sd->fLookAhead = sd->fAccepting;
|
||||
}
|
||||
}
|
||||
|
@ -644,7 +661,7 @@ void RBBITableBuilder::flagTaggedStates() {
|
|||
}
|
||||
for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
|
||||
tagNode = (RBBINode *)tagNodes.elementAt(i);
|
||||
|
||||
|
||||
for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
|
||||
if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
|
||||
|
@ -686,9 +703,9 @@ void RBBITableBuilder::mergeRuleStatusVals() {
|
|||
fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
|
||||
fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
|
||||
}
|
||||
|
||||
// For each state
|
||||
for (n=0; n<fDStates->size(); n++) {
|
||||
|
||||
// For each state
|
||||
for (n=0; n<fDStates->size(); n++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
|
||||
UVector *thisStatesTagValues = sd->fTagVals;
|
||||
if (thisStatesTagValues == NULL) {
|
||||
|
@ -704,7 +721,7 @@ void RBBITableBuilder::mergeRuleStatusVals() {
|
|||
sd->fTagsIdx = -1;
|
||||
int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
|
||||
int32_t nextTagGroupStart = 0;
|
||||
|
||||
|
||||
// Loop runs once per group of tags in the global list
|
||||
while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
|
||||
thisTagGroupStart = nextTagGroupStart;
|
||||
|
@ -718,21 +735,21 @@ void RBBITableBuilder::mergeRuleStatusVals() {
|
|||
// The lengths match, go ahead and compare the actual tag values
|
||||
// between this state and the group from the global list.
|
||||
for (i=0; i<thisStatesTagValues->size(); i++) {
|
||||
if (thisStatesTagValues->elementAti(i) !=
|
||||
if (thisStatesTagValues->elementAti(i) !=
|
||||
fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
|
||||
// Mismatch.
|
||||
// Mismatch.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (i == thisStatesTagValues->size()) {
|
||||
// We found a set of tag values in the global list that match
|
||||
// those for this state. Use them.
|
||||
sd->fTagsIdx = thisTagGroupStart;
|
||||
break;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (sd->fTagsIdx == -1) {
|
||||
// No suitable entry in the global tag list already. Add one
|
||||
sd->fTagsIdx = fRB->fRuleStatusVals->size();
|
||||
|
@ -1027,7 +1044,7 @@ void RBBITableBuilder::printRuleStatusTable() {
|
|||
|
||||
RBBIDebugPrintf("index | tags \n");
|
||||
RBBIDebugPrintf("-------------------\n");
|
||||
|
||||
|
||||
while (nextRecord < tbl->size()) {
|
||||
thisRecord = nextRecord;
|
||||
nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
|
||||
|
@ -1057,7 +1074,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
|
|||
fTagVals = NULL;
|
||||
fPositions = NULL;
|
||||
fDtran = NULL;
|
||||
|
||||
|
||||
fDtran = new UVector(lastInputSymbol+1, *fStatus);
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
|
|
|
@ -14,7 +14,43 @@
|
|||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
|
||||
# and only used for the line break rules.
|
||||
#
|
||||
# It is used in the implementation of the incredibly annoying rule LB 7c
|
||||
# which says to treat any combining mark that is not attached to a base
|
||||
# character as if it were of class AL (alphabetic).
|
||||
#
|
||||
# The problem occurs in the reverse rules.
|
||||
#
|
||||
# Consider a sequence like the following, with the correct breaks as shown:
|
||||
# LF ID CM AL AL
|
||||
# ^ ^ ^
|
||||
# Then consider the sequence without the initial ID (ideographic)
|
||||
# LF CM AL AL
|
||||
# ^ ^
|
||||
# Our CM, which in the first example was attached to the ideograph,
|
||||
# is now unattached, becomes an alpha, and joins in with the other
|
||||
# alphas.
|
||||
#
|
||||
# When iterating forwards, these sequences do not present any problems.
|
||||
# When iterating backwards, we need to look ahead when encountering
|
||||
# a CM to see whether it attaches to something further on or not.
|
||||
# (Look-ahead in a reverse rule is looking towards the start)
|
||||
#
|
||||
# If the CM is unattached, we need to force a break.
|
||||
#
|
||||
# !!lookAheadHardBreak forces the run time state machine to
|
||||
# stop immediately when a look ahead rule ( '/' operator) matches,
|
||||
# and set the match position to that of the look-ahead operator,
|
||||
# no matter what other rules may be in play at the time.
|
||||
#
|
||||
# See rule LB 19 for an example.
|
||||
#
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
|
@ -60,7 +96,7 @@ $ZW = [:LineBreak = ZWSpace:];
|
|||
# XX (Unknown, unassigned)
|
||||
# as $AL (Alphabetic)
|
||||
#
|
||||
$ALPlus = $AL | $AI | $SA | $XX;
|
||||
$ALPlus = [$AL $AI $SA $XX];
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
|
@ -87,7 +123,6 @@ $OPcm = $OP $CM*;
|
|||
$POcm = $PO $CM*;
|
||||
$PRcm = $PR $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$SPcm = $SP $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
$WJcm = $WJ $CM*;
|
||||
|
||||
|
@ -120,45 +155,87 @@ $OP $CM+;
|
|||
$PO $CM+;
|
||||
$PR $CM+;
|
||||
$QU $CM+;
|
||||
$SP $CM+;
|
||||
$SY $CM+;
|
||||
$WJ $CM+;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
# for what they can combine with are _very_ different from the rest of Unicode.
|
||||
#
|
||||
# Note that $CM itself is left out of this set. If CM is needed as a base
|
||||
# it must be listed separately in the rule.
|
||||
#
|
||||
$CAN_CM = [^$BK $CR $LF $NL $ZW $SP $CM]; # Bases that can take CMs
|
||||
$CANT_CM = [ $BK $CR $LF $NL $ZW $SP $CM]; # Bases that can't take CMs
|
||||
|
||||
#
|
||||
# Rule LB 3
|
||||
$LB3Breaks = [$BK $CR $LF $NL];
|
||||
$LB3NonBreaks = [^$BK $CR $LF $NL];
|
||||
$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
# Chaining is disabled with CM because it causes other failures,
|
||||
# so for this one case we need to manually list out longer sequences.
|
||||
#
|
||||
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
|
||||
$AL_FOLLOW_CM = [$CL $EX $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus];
|
||||
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
||||
|
||||
$LB3NonBreaks? $LB3Breaks {100};
|
||||
$LB5NonBreaks $CM* $LB3Breaks {100};
|
||||
|
||||
#
|
||||
# Rule LB 3 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB3Breaks = [$BK $CR $LF $NL];
|
||||
$LB3NonBreaks = [^$BK $CR $LF $NL];
|
||||
|
||||
$LB3NonBreaks? $LB3Breaks {100}; # LB 3c do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB3Breaks {100};
|
||||
$CM+ $LB3Breaks {100};
|
||||
$CR $LF {100};
|
||||
|
||||
# LB 4 x SP
|
||||
# x ZW
|
||||
$ZW [$SP $ZW];
|
||||
$LB5NonBreaks $CM* [$SP $ZW];
|
||||
$LB3NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
$CM+ [$SP $ZW];
|
||||
|
||||
# LB 5 Break after zero width space
|
||||
$LB5Breaks = [$LB3Breaks $ZW];
|
||||
$LB5Breaks = [$LB3Breaks $ZW];
|
||||
$LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
|
||||
|
||||
|
||||
# LB 7 Combining marks. $SP $CM needs to behave like $ID.
|
||||
# X $CM needs to behave like X, where X is not $SP.
|
||||
# LB 7 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
#
|
||||
$LB5NonBreaks $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
$CM+;
|
||||
|
||||
#
|
||||
# LB 8
|
||||
$LB5NonBreaks $CM* $CL;
|
||||
$LB5NonBreaks $CM* $EX;
|
||||
$LB5NonBreaks $CM* $IS;
|
||||
$LB5NonBreaks $CM* $SY;
|
||||
#
|
||||
$LB5NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
$CM+ $CL; # by rule 7c, stand-alone CM behaves as AL
|
||||
|
||||
$LB5NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
$CM+ $EX; # by rule 7c, stand-alone CM behaves as AL
|
||||
|
||||
$LB5NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
$CM+ $IS; # by rule 7c, stand-alone CM behaves as AL
|
||||
|
||||
$LB5NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
$CM+ $SY; # by rule 7c, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
# LB 9
|
||||
$OPcm $SP* .?;
|
||||
$OPcm $SP* $LB5NonBreaks $CM*;
|
||||
#
|
||||
$OPcm $SP* $CAN_CM $CM*;
|
||||
$OPcm $SP* $CANT_CM;
|
||||
|
||||
$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 7c, stand-alone CM behaves as AL
|
||||
|
||||
# LB 10
|
||||
$QUcm $SP* $OPcm;
|
||||
|
@ -167,58 +244,71 @@ $QUcm $SP* $OPcm;
|
|||
$CLcm $SP* $NScm;
|
||||
|
||||
# LB 11a
|
||||
($B2cm)+;
|
||||
$B2cm $SP* $B2cm;
|
||||
|
||||
# LB 11b Word Joiner
|
||||
$LB5NonBreaks $CM* $WJcm;
|
||||
$WJcm .?;
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB5NonBreaks $WJcm;
|
||||
$CM+ $WJcm;
|
||||
|
||||
$WJcm [^$CAN_CM];
|
||||
$WJcm $CAN_CM $CM*;
|
||||
|
||||
# LB 12
|
||||
$LB12NonBreaks = [$LB5NonBreaks - $SP];
|
||||
$LB12NonBreaks = [$LB5NonBreaks - [$SP]];
|
||||
$LB12Breaks = [$LB5Breaks $SP];
|
||||
|
||||
# LB 13
|
||||
# x GL
|
||||
$LB12NonBreaks $CM* $GLcm;
|
||||
$SP $CM+ $GLcm; # LB7a SP CM+ behaves as ID
|
||||
$CM+ $GLcm;
|
||||
|
||||
|
||||
# GL x
|
||||
$GLcm .?;
|
||||
#
|
||||
$GLcm $LB12Breaks;
|
||||
$GLcm $LB12NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
$GLcm $SP $CM+; # SP CM+ behaves as ID
|
||||
# TODO: I don't think we need this rule.
|
||||
# All but $CM will chain off of preceding rule.
|
||||
# $GLcm will pick up the CM case by itself.
|
||||
|
||||
# LB 14
|
||||
# x QU
|
||||
$LB12NonBreaks $CM* $QUcm;
|
||||
$SP $CM+ $QUcm; # LB7a SP CM+ behaves as ID
|
||||
$CM+ $QUcm;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QUcm $LB12NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
$QUcm $SP $CM+; # SP CM+ behaves as ID
|
||||
# TODO: I don't think this rule is needed.
|
||||
|
||||
|
||||
# LB 14a
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
|
||||
$LB14NonBreaks = [$LB12NonBreaks - $CB];
|
||||
$LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;
|
||||
|
||||
# LB 15
|
||||
$LB14CanBreakAfter ($BAcm | $HYcm | $NScm);
|
||||
$BBcm [^$CB];
|
||||
$BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*;
|
||||
$LB14NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB14NonBreaks $CM*;
|
||||
|
||||
# LB 16
|
||||
$ALcm $INcm;
|
||||
$CM+ $INcm; # by rule 7c, any otherwise unattached CM behaves as AL
|
||||
$IDcm $INcm;
|
||||
$SP $CM+ $INcm; # by rule 7a, $SP $CM behaves like ID
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
|
||||
|
||||
# $LB 17
|
||||
($IDcm | $SP $CM+) $POcm;
|
||||
$ALcm+ $NUcm; # includes $LB19
|
||||
$CM+ $NUcm; # Rule 7c
|
||||
$NUcm $ALcm+;
|
||||
$IDcm $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$CM+ $NUcm; # Rule 7c, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
|
||||
# LB 18
|
||||
$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* $CLcm? $POcm?;
|
||||
|
@ -237,7 +327,10 @@ $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
|||
|
||||
|
||||
# LB 19
|
||||
$CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
|
||||
$ALcm $ALcm;
|
||||
$CM+ $ALcm; # The $CM+ is from rule 7C, and unattached CM is treated as AL
|
||||
|
||||
# LB 19b
|
||||
$IScm $ALcm;
|
||||
|
||||
#
|
||||
|
@ -269,39 +362,86 @@ $CM+ $OP;
|
|||
$CM+ $PO;
|
||||
$CM+ $PR;
|
||||
$CM+ $QU;
|
||||
$CM+ $SP;
|
||||
$CM+ $SY;
|
||||
$CM+ $WJ;
|
||||
$CM+;
|
||||
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] [whatever]
|
||||
# The CM needs to behave as an AL
|
||||
#
|
||||
$AL_FOLLOW $CM+ / (
|
||||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB9 will match, need to surpress this break.
|
||||
# LB9 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
# The CM needs to behave as an AL
|
||||
# This rule is concerned about getting the second of the two <breaks> in place.
|
||||
#
|
||||
[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
|
||||
|
||||
|
||||
|
||||
# LB 3
|
||||
|
||||
$LB3Breaks $LB3NonBreaks;
|
||||
$LB3Breaks $CM* $LB5NonBreaks;
|
||||
$LB3Breaks [$LB3NonBreaks-$CM];
|
||||
$LB3Breaks $CM+ $CAN_CM;
|
||||
$LF $CR;
|
||||
|
||||
|
||||
# LB 4 x SP
|
||||
# x ZW
|
||||
[$SP $ZW] $LB3NonBreaks;
|
||||
[$SP $ZW] $CM* $LB5NonBreaks;
|
||||
[$SP $ZW] [$LB3NonBreaks-$CM];
|
||||
[$SP $ZW] $CM+ $CAN_CM;
|
||||
|
||||
# LB 5 Break after zero width space
|
||||
|
||||
|
||||
# LB 7 Combining marks.
|
||||
# $SP $CM needs to behave like $ID.
|
||||
# X $CM needs to behave like X, where X is not $SP.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
$CM+ $LB5NonBreaks;
|
||||
$CM+ $CAN_CM;
|
||||
|
||||
# LB 8
|
||||
$CL $CM* $LB5NonBreaks;
|
||||
$EX $CM* $LB5NonBreaks;
|
||||
$IS $CM* $LB5NonBreaks;
|
||||
$SY $CM* $LB5NonBreaks;
|
||||
$CL $CM+ $CAN_CM;
|
||||
$EX $CM+ $CAN_CM;
|
||||
$IS $CM+ $CAN_CM;
|
||||
$SY $CM+ $CAN_CM;
|
||||
|
||||
$CL [$LB5NonBreaks-$CM];
|
||||
$EX [$LB5NonBreaks-$CM];
|
||||
$IS [$LB5NonBreaks-$CM];
|
||||
$SY [$LB5NonBreaks-$CM];
|
||||
|
||||
# Rule 9 & 8 together.
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 9 OP SP* x
|
||||
#
|
||||
$CM* $CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 7, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
$SY $CM $SP+ $OP; # Experiment. Remove.
|
||||
|
||||
|
||||
# LB 9
|
||||
$LB5NonBreaks $SP* $CM* $OP;
|
||||
|
||||
# LB 10
|
||||
$CM* $OP $SP* $CM* $QU;
|
||||
|
@ -310,48 +450,57 @@ $CM* $OP $SP* $CM* $QU;
|
|||
$CM* $NS $SP* $CM* $CL;
|
||||
|
||||
# LB 11a
|
||||
($CM* $B2)+;
|
||||
$CM* $B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 11b
|
||||
$CM* $WJ $CM* $LB5NonBreaks;
|
||||
$CM* $LB5NonBreaks $CM* $WJ;
|
||||
. $CM* $WJ;
|
||||
$CM* $WJ $CM* $CAN_CM;
|
||||
$CM* $WJ [$LB5NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CM* $CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12
|
||||
|
||||
# LB 14
|
||||
$CM* $GL $CM* $LB12NonBreaks;
|
||||
$CM* $GL $CM+ $SP;
|
||||
$CM* $LB5NonBreaks $CM* $GL;
|
||||
# LB 13
|
||||
# x GL
|
||||
#
|
||||
$CM* $GL $CM* [$LB12NonBreaks-$CM];
|
||||
|
||||
# LB 14
|
||||
$CM* $QU $CM* $LB12NonBreaks;
|
||||
$CM* $QU $CM+ $SP; # CM+ SP behaves as ID
|
||||
$CM* $LB5NonBreaks $CM* $QU;
|
||||
#
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CM* $CAN_CM $CM* $GL;
|
||||
|
||||
# LB 14a
|
||||
$BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP);
|
||||
|
||||
|
||||
#
|
||||
# LB 14
|
||||
#
|
||||
$CM* $QU $CM* $CAN_CM; # . x QU
|
||||
$CM* $QU $LB12NonBreaks;
|
||||
|
||||
|
||||
$CM* $CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
# LB 15
|
||||
$CM* ($BA | $HY | $NS) $BackLB14CanBreakAfter;
|
||||
($CM* ($BA | $HY | $NS))+ $CM+ / $LB5Breaks;
|
||||
[$CR $LF $BK $NL $ZW] $CM* $BB;
|
||||
$CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;
|
||||
$CM* ($BA | $HY | $NS) $CM* [$LB14NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
$CM* [$LB14NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
|
||||
|
||||
# LB 16
|
||||
$CM* $IN $CM* $ALPlus;
|
||||
# by rule 7c, any otherwise unattached CM behaves as AL
|
||||
$CM* $IN $CM+ / $LB5Breaks;
|
||||
|
||||
$CM* $IN $CM* ($ID | $CM $SP);
|
||||
$CM* $IN $CM* $ID;
|
||||
$CM* $IN $CM* $IN;
|
||||
$CM* $IN $CM* $NU;
|
||||
|
||||
# $LB 17
|
||||
$CM* $PO $CM* ($ID | $CM $SP);
|
||||
$CM* $NU ($CM* $ALPlus)+; # includes $LB19
|
||||
$CM* $NU $CM+ / $LB5Breaks; # Rule 7c
|
||||
|
||||
$CM* $PO $CM* $ID;
|
||||
$CM* $NU $CM* $ALPlus;
|
||||
$CM* $ALPlus $CM* $NU;
|
||||
|
||||
# LB 18
|
||||
|
@ -371,13 +520,13 @@ $CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
|||
|
||||
# LB 19
|
||||
$CM* $ALPlus $CM* $ALPlus;
|
||||
# The $CM* is from rule 7C, and unattached CM is treated as AL
|
||||
$CM* $ALPlus $CM* $IS;
|
||||
$CM* $ALPlus $CM+ / $LB5Breaks;
|
||||
|
||||
## problem state table can't handle lookahead when it is at the
|
||||
## start of the string, currently handled in the rbbi code
|
||||
## todo fix this
|
||||
|
||||
# LB 19b
|
||||
$CM* $ALPlus $CM* $IS;
|
||||
|
||||
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
|
@ -395,6 +544,7 @@ $SP+ $CM* $QU;
|
|||
|
||||
# LB 11
|
||||
$SP+ $CM* $CL;
|
||||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 18
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
|
@ -404,18 +554,14 @@ $CL $CM* ($NU | $IS | $SY);
|
|||
|
||||
!!safe_forward;
|
||||
|
||||
# LB 7
|
||||
[^$BK $CR $LF $NL $ZW $SP] $CM+;
|
||||
$SP $CM+ / [^$CM];
|
||||
# Skip forward over all character classes that are involved in
|
||||
# rules containing patterns with possibly more than one char
|
||||
# of context.
|
||||
#
|
||||
# It might be slightly more efficient to have specific rules
|
||||
# instead of one generic one, but only if we could
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
[$CM $OP $QU $CL $B2 $PR $HY $SP]+ [^$CM $OP $QU $CL $B2 $PR $HY];
|
||||
|
||||
# LB 9
|
||||
$OP $CM* $SP+;
|
||||
|
||||
# LB 10
|
||||
$QU $CM* $SP+;
|
||||
|
||||
# LB 11
|
||||
$CL $CM* $SP+;
|
||||
|
||||
# LB 18
|
||||
$CM* $PRcm? ($OPcm | $HYcm)? $NU;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# Copyright (C) 2002-2004, International Business Machines Corporation and others.
|
||||
# Copyright (C) 2002-2005, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# file: sent.txt
|
||||
|
@ -111,10 +111,12 @@ $End? $Join [$RULE12 - $Sp - $Close];
|
|||
|
||||
# forces a break at the beginning of text "$Sp blah blah blah"
|
||||
# remember that the break iterator takes the longest match
|
||||
$End? $Join $Sp / [^$Term $ATerm $Sp $Close];
|
||||
$NOT_T_A_S_C = [^$Term $ATerm $Sp $Close];
|
||||
$End? $Join $Sp / [$NOT_T_A_S_C {eof}];
|
||||
|
||||
# forces a break at the beginning of text "$Close blah blah blah"
|
||||
$End? $Join $Close / [^$Term $ATerm $Close];
|
||||
$NOT_T_A_C = [^$Term $ATerm $Close];
|
||||
$End? $Join $Close / [$NOT_T_A_C {eof}];
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
|
|
|
@ -17,62 +17,30 @@
|
|||
|
||||
!!chain;
|
||||
|
||||
$Katakana = [[:Script = KATAKANA:]
|
||||
[:name = VERTICAL KANA REPEAT MARK:]
|
||||
[:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:]
|
||||
[:name = VERTICAL KANA REPEAT MARK UPPER HALF:]
|
||||
[:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:]
|
||||
[:name = VERTICAL KANA REPEAT MARK LOWER HALF:]
|
||||
[:name = KATAKANA-HIRAGANA VOICED SOUND MARK:]
|
||||
[:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:]
|
||||
[:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:]
|
||||
[:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
- [:Ideographic:]
|
||||
- $Katakana
|
||||
- [:Script = Hiragana:]
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Grapheme_Extend = TRUE:]];
|
||||
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
|
||||
[:name = HEBREW PUNCTUATION GERSHAYIM:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:]
|
||||
[:name = HYPHENATION POINT:]
|
||||
[:name = COLON:]];
|
||||
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = COLON:]];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
$ExtendNumLet = [[:Connector_Punctuation:]
|
||||
- [:name = KATAKANA MIDDLE DOT:]
|
||||
- [:name = HALFWIDTH KATAKANA MIDDLE DOT:]];
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$MidLetter = [\p{Word_Break = MidLetter}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{General_Category=Connector_Punctuation}];
|
||||
|
||||
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
|
||||
$Format = [[:Cf:] - $Extend];
|
||||
$Hiragana = [:Hiragana:];
|
||||
$Ideographic = [:IDEOGRAPHIC:];
|
||||
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$ExtendNumLetEx = $ExtendNumLet $Extend*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
@ -81,7 +49,7 @@ $ExtendNumLetEx = $ExtendNumLet $Extend*;
|
|||
|
||||
|
||||
# Rule 3 - don't break grapheme clusters.
|
||||
# see character breaks
|
||||
# see character breaks.
|
||||
|
||||
$CR $LF;
|
||||
[^$Control] $Extend+;
|
||||
|
@ -114,8 +82,9 @@ $NumericEx $Format* $MidNumEx $Format* $NumericEx {100};
|
|||
# rule 13
|
||||
|
||||
$KatakanaEx $Format* $KatakanaEx {300};
|
||||
$Hiragana $Extend* {300};
|
||||
$Ideographic $Extend* {400};
|
||||
|
||||
[\p{Hiragana}] $Extend* {300}; # To get tag values.
|
||||
[\p{Ideographic}] $Extend* {400};
|
||||
|
||||
# rule 13a/b
|
||||
|
||||
|
|
|
@ -320,16 +320,16 @@ static void printStringBreaks(UnicodeString ustr, int expected[],
|
|||
j ++;
|
||||
}
|
||||
u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
|
||||
printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c,
|
||||
u_isUAlphabetic(c),
|
||||
printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c,
|
||||
u_isUAlphabetic(c),
|
||||
u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
|
||||
u_isalnum(c),
|
||||
u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
|
||||
u_charType(c),
|
||||
U_SHORT_PROPERTY_NAME),
|
||||
u_getPropertyValueName(UCHAR_LINE_BREAK,
|
||||
u_getIntPropertyValue(c,
|
||||
UCHAR_LINE_BREAK),
|
||||
u_isalnum(c),
|
||||
u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
|
||||
u_charType(c),
|
||||
U_SHORT_PROPERTY_NAME),
|
||||
u_getPropertyValueName(UCHAR_LINE_BREAK,
|
||||
u_getIntPropertyValue(c,
|
||||
UCHAR_LINE_BREAK),
|
||||
U_SHORT_PROPERTY_NAME),
|
||||
name);
|
||||
}
|
||||
|
@ -390,9 +390,9 @@ void RBBITest::TestMixedThaiLineBreak()
|
|||
|
||||
|
||||
// @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
|
||||
// start
|
||||
// start
|
||||
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
|
||||
|
@ -406,9 +406,9 @@ void RBBITest::TestMixedThaiLineBreak()
|
|||
ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
|
||||
|
||||
|
||||
// @suwit - end of changes
|
||||
|
||||
// Arabic numerals should always be separated from surrounding Thai text
|
||||
|
@ -449,7 +449,7 @@ ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u
|
|||
ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status);
|
||||
*/
|
||||
|
||||
/* remove the old data sample.
|
||||
/* remove the old data sample.
|
||||
// The Unicode Linebreak TR says do not break before or after quotes.
|
||||
// So this test is changed to not break around the quote.
|
||||
// TODO: should Thai break around the quotes, like the original behavior here?
|
||||
|
@ -517,21 +517,21 @@ void RBBITest::TestThaiWordBreak() {
|
|||
ADD_DATACHUNK(thaiWordSelection, NULL, 0, status); // Break at start of data
|
||||
|
||||
|
||||
// @suwit -- Thai sample data from GVT Guideline
|
||||
// start
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E2B\\u0E19\\u0E36\\u0E48\\u0E07", 0, status); //5
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E04\\u0E33", 0, status); //7
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E17\\u0E22", 0, status); //10
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E2A\\u0E32\\u0E21\\u0E32\\u0E23\\u0E16", 0, status); //16
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E1B\\u0E23\\u0E30\\u0E01\\u0E2D\\u0E1A", 0, status); //22
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E14\\u0E49\\u0E27\\u0E22", 0, status); //26
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0e2b\\u0e25\\u0e32\\u0e22", 0, status); //30
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0e1e\\u0e22\\u0e32\\u0e07\\u0e04\\u0e4c", 0, status); //36
|
||||
// @suwit -- Thai sample data from GVT Guideline
|
||||
// start
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E2B\\u0E19\\u0E36\\u0E48\\u0E07", 0, status); //5
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E04\\u0E33", 0, status); //7
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E17\\u0E22", 0, status); //10
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E2A\\u0E32\\u0E21\\u0E32\\u0E23\\u0E16", 0, status); //16
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E1B\\u0E23\\u0E30\\u0E01\\u0E2D\\u0E1A", 0, status); //22
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E14\\u0E49\\u0E27\\u0E22", 0, status); //26
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0e2b\\u0e25\\u0e32\\u0e22", 0, status); //30
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0e1e\\u0e22\\u0e32\\u0e07\\u0e04\\u0e4c", 0, status); //36
|
||||
|
||||
// @suwit - end of changes
|
||||
|
||||
/* remove the old data sample because the Thai translation of the Wizard of Oz is not a good test case for the wordbreak API.
|
||||
|
||||
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); //2
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); //5
|
||||
ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); //6
|
||||
|
@ -598,11 +598,11 @@ void RBBITest::TestBug3818() {
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
// Four Thai words...
|
||||
static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
|
||||
0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
|
||||
static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
|
||||
0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
|
||||
UnicodeString thaiStr(thaiWordData);
|
||||
|
||||
RuleBasedBreakIterator* bi =
|
||||
RuleBasedBreakIterator* bi =
|
||||
(RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
|
||||
if (U_FAILURE(status) || bi == NULL) {
|
||||
errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
|
||||
|
@ -655,7 +655,7 @@ void RBBITest::TestJapaneseWordBreak() {
|
|||
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
|
||||
{
|
||||
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
|
||||
|
||||
|
||||
switch (index) {
|
||||
case 0: name = "TestBug4153072";
|
||||
if(exec) TestBug4153072(); break;
|
||||
|
@ -708,6 +708,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
if(exec) TestBug3818(); break;
|
||||
case 19: name = "TestJapaneseWordBreak";
|
||||
if(exec) TestJapaneseWordBreak(); break;
|
||||
case 20: name = "TestDebug";
|
||||
if(exec) TestDebug(); break;
|
||||
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
|
@ -1890,9 +1892,8 @@ void RBBITest::TestLineBreakData() {
|
|||
int32_t expectedPos; // Expected break position (index into test string)
|
||||
|
||||
bi->setText(testString);
|
||||
pos = bi->first(); // TODO: break iterators always return a match at pos 0.
|
||||
pos = bi->next(); // Line Break TR says no match at position 0.
|
||||
// Resolve.
|
||||
pos = bi->first();
|
||||
pos = bi->next();
|
||||
|
||||
for (; pos != BreakIterator::DONE; ) {
|
||||
expectedPos = expectedBreaks.elementAti(expectedI);
|
||||
|
@ -2117,27 +2118,14 @@ RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0),
|
|||
|
||||
fSets = new UVector(status);
|
||||
|
||||
fKatakanaSet = new UnicodeSet("[\\p{script=KATAKANA}"
|
||||
"\\u3031-\\u3035\\u309b\\u309c\\u30a0"
|
||||
"\\u30fc\\uff70\\uff9e\\uff9f]", status);
|
||||
|
||||
const UnicodeString ALetterStr( "[[\\p{Alphabetic}"
|
||||
"\\u00a0" // NBSP
|
||||
"\\u05f3]" // Hebrew punct Geresh
|
||||
"-[\\p{Ideographic}]"
|
||||
"-[\\p{Script=Thai}]"
|
||||
"-[\\p{Script=Lao}]"
|
||||
"-[\\p{Script=Hiragana}]"
|
||||
"-[\\p{Grapheme_Extend}]]");
|
||||
fALetterSet = new UnicodeSet(ALetterStr, status);
|
||||
fALetterSet->removeAll(*fKatakanaSet);
|
||||
|
||||
fMidLetterSet = new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027\\u003a]", status);
|
||||
fMidNumSet = new UnicodeSet("[[\\p{Line_Break=Infix_Numeric}]-[\\u003a]]", status);
|
||||
fNumericSet = new UnicodeSet("[\\p{Line_Break=Numeric}]", status);
|
||||
fFormatSet = new UnicodeSet("[\\p{Format}-[\\u200c\\u200d]]", status);
|
||||
fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
|
||||
fExtendNumLetSet = new UnicodeSet("[\\p{Pc}-[\\u30fb\\uff65]]", status);
|
||||
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]", status);
|
||||
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]", status);
|
||||
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]", status);
|
||||
fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]", status);
|
||||
fNumericSet = new UnicodeSet("[\\p{Word_Break = Numeric}]", status);
|
||||
fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]", status);
|
||||
fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]", status);
|
||||
fExtendNumLetSet = new UnicodeSet("[\\p{General_Category = Connector_Punctuation}]", status);
|
||||
fOtherSet = new UnicodeSet();
|
||||
if(U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
|
@ -2180,7 +2168,7 @@ void RBBIWordMonkey::setText(const UnicodeString &s) {
|
|||
int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
int p0, p1, p2, p3; // Indices of the significant code points around the
|
||||
int p0, p1, p2, p3; // Indices of the significant code points around the
|
||||
// break position being tested. The candidate break
|
||||
// location is before p2.
|
||||
|
||||
|
@ -2221,7 +2209,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
|||
U_ASSERT(U_SUCCESS(status));
|
||||
c3 = fText->char32At(p3);
|
||||
}
|
||||
|
||||
|
||||
if (p1 == p2) {
|
||||
// Still warming up the loop. (won't work with zero length strings, but we don't care)
|
||||
continue;
|
||||
|
@ -2275,7 +2263,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
|||
|
||||
// Rule (11) Numeric (MidNum | MidNumLet) x Numeric
|
||||
if ( fNumericSet->contains(c0) &&
|
||||
fMidNumSet->contains(c1) &&
|
||||
fMidNumSet->contains(c1) &&
|
||||
fNumericSet->contains(c2)) {
|
||||
continue;
|
||||
}
|
||||
|
@ -2286,7 +2274,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
|||
fNumericSet->contains(c3)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// Rule (13) Katakana x Katakana
|
||||
if (fKatakanaSet->contains(c1) &&
|
||||
fKatakanaSet->contains(c2)) {
|
||||
|
@ -2301,7 +2289,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
|||
}
|
||||
|
||||
// Rule 13b
|
||||
if (fExtendNumLetSet->contains(c1) &&
|
||||
if (fExtendNumLetSet->contains(c1) &&
|
||||
(fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
|
||||
fKatakanaSet->contains(c2))) {
|
||||
continue;
|
||||
|
@ -2362,7 +2350,7 @@ public:
|
|||
virtual UVector *charClasses();
|
||||
virtual void setText(const UnicodeString &s);
|
||||
virtual int32_t next(int32_t i);
|
||||
virtual void rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
|
||||
virtual void rule7Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
|
||||
private:
|
||||
UVector *fSets;
|
||||
|
||||
|
@ -2381,14 +2369,14 @@ private:
|
|||
UnicodeSet *fBA;
|
||||
UnicodeSet *fBB;
|
||||
UnicodeSet *fHY;
|
||||
UnicodeSet *fH2;
|
||||
UnicodeSet *fH3;
|
||||
UnicodeSet *fH2;
|
||||
UnicodeSet *fH3;
|
||||
UnicodeSet *fCL;
|
||||
UnicodeSet *fEX;
|
||||
UnicodeSet *fIN;
|
||||
UnicodeSet *fJL;
|
||||
UnicodeSet *fJV;
|
||||
UnicodeSet *fJT;
|
||||
UnicodeSet *fJL;
|
||||
UnicodeSet *fJV;
|
||||
UnicodeSet *fJT;
|
||||
UnicodeSet *fNS;
|
||||
UnicodeSet *fOP;
|
||||
UnicodeSet *fQU;
|
||||
|
@ -2409,12 +2397,11 @@ private:
|
|||
int32_t *fOrigPositions;
|
||||
|
||||
RegexMatcher *fNumberMatcher;
|
||||
RegexMatcher *fLB10Matcher;
|
||||
RegexMatcher *fLB11Matcher;
|
||||
};
|
||||
|
||||
|
||||
RBBILineMonkey::RBBILineMonkey()
|
||||
RBBILineMonkey::RBBILineMonkey()
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
|
@ -2439,7 +2426,7 @@ RBBILineMonkey::RBBILineMonkey()
|
|||
fCL = new UnicodeSet("[\\p{Line_break=CL}]", status);
|
||||
fEX = new UnicodeSet("[\\p{Line_break=EX}]", status);
|
||||
fIN = new UnicodeSet("[\\p{Line_break=IN}]", status);
|
||||
fJL = new UnicodeSet("[\\p{Line_break=JL}]", status);
|
||||
fJL = new UnicodeSet("[\\p{Line_break=JL}]", status);
|
||||
fJV = new UnicodeSet("[\\p{Line_break=JV}]", status);
|
||||
fJT = new UnicodeSet("[\\p{Line_break=JT}]", status);
|
||||
fNS = new UnicodeSet("[\\p{Line_break=NS}]", status);
|
||||
|
@ -2460,8 +2447,6 @@ RBBILineMonkey::RBBILineMonkey()
|
|||
fAL->addAll(*fAI); // Default behavior for AI is identical to AL
|
||||
fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
|
||||
|
||||
|
||||
|
||||
fSets->addElement(fBK, status);
|
||||
fSets->addElement(fCR, status);
|
||||
fSets->addElement(fLF, status);
|
||||
|
@ -2497,9 +2482,6 @@ RBBILineMonkey::RBBILineMonkey()
|
|||
fSets->addElement(fID, status);
|
||||
fSets->addElement(fWJ, status);
|
||||
fSets->addElement(fSA, status);
|
||||
// fSets->addElement(fXX, status);
|
||||
|
||||
|
||||
|
||||
fNumberMatcher = new RegexMatcher(
|
||||
"(\\p{Line_Break=PR}\\p{Line_Break=CM}*)?"
|
||||
|
@ -2507,19 +2489,7 @@ RBBILineMonkey::RBBILineMonkey()
|
|||
"\\p{Line_Break=NU}\\p{Line_Break=CM}*"
|
||||
"((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
|
||||
"(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
|
||||
"(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
|
||||
0, status);
|
||||
|
||||
fLB10Matcher = new RegexMatcher(
|
||||
"\\p{Line_Break=QU}\\p{Line_Break=CM}*"
|
||||
"\\p{Line_Break=SP}*"
|
||||
"(\\p{Line_Break=OP})\\p{Line_Break=CM}*",
|
||||
0, status);
|
||||
|
||||
fLB11Matcher = new RegexMatcher(
|
||||
"\\p{Line_Break=CL}\\p{Line_Break=CM}*"
|
||||
"\\p{Line_Break=SP}*"
|
||||
"(\\p{Line_Break=NS})\\p{Line_Break=CM}*",
|
||||
"(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
|
||||
0, status);
|
||||
|
||||
fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
|
||||
|
@ -2537,7 +2507,7 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
|
|||
}
|
||||
|
||||
//
|
||||
// rule67Adjust
|
||||
// rule7Adjust
|
||||
// Line Break TR rules 6 and 7 implementation.
|
||||
// This deals with combining marks and other sequences that
|
||||
// must be treated as if they were something other than what they actually are.
|
||||
|
@ -2546,21 +2516,20 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
|
|||
// each potential break, once to the chars before the position being checked, then
|
||||
// again to the text following the possible break.
|
||||
//
|
||||
void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
|
||||
void RBBILineMonkey::rule7Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
|
||||
if (pos == -1) {
|
||||
// Invalid initial position. Happens during the warmup iteration of the
|
||||
// Invalid initial position. Happens during the warmup iteration of the
|
||||
// main loop in next().
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t nPos = *nextPos;
|
||||
|
||||
|
||||
|
||||
// LB 7b Keep combining sequences together.
|
||||
// advance over any CM class chars. Note that Line Break CM is different
|
||||
// from normal Mc general category.
|
||||
if (!(fBK->contains(*posChar) || fZW->contains(*posChar) || *posChar==0x0a
|
||||
|| *posChar==0x0d || *posChar==0x85)) {
|
||||
// from normal Mc general category.
|
||||
if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
|
||||
*posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
|
||||
for (;;) {
|
||||
*nextChar = fText->char32At(nPos);
|
||||
if (!fCM->contains(*nextChar)) {
|
||||
|
@ -2569,16 +2538,11 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
|
|||
nPos = fText->moveIndex32(nPos, 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// LB 7a In a SP CM* sequence, treat the SP as an ID
|
||||
if (nPos != *nextPos && fSP->contains(*posChar)) {
|
||||
*posChar = 0x4e00; // 0x4e00 is a CJK Ideograph, linebreak type is ID.
|
||||
}
|
||||
|
||||
|
||||
|
||||
// LB 7b Treat X CM* as if it were x.
|
||||
// No explicit action required.
|
||||
|
||||
// No explicit action required.
|
||||
|
||||
// LB 7c Treat any remaining combining mark as AL
|
||||
if (fCM->contains(*posChar)) {
|
||||
*posChar = 0x41; // thisChar = 'A';
|
||||
|
@ -2635,11 +2599,27 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
nextCPPos = fText->moveIndex32(pos, 1);
|
||||
nextPos = nextCPPos;
|
||||
|
||||
// Break at end of text.
|
||||
// Rule LB2 - Break at end of text.
|
||||
if (pos >= fText->length()) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Rule LB 7 - adjust for combining sequences.
|
||||
// We do this one out-of-order because the adjustment does not change anything
|
||||
// that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
|
||||
// be applied.
|
||||
rule7Adjust(prevPos, &prevChar, &pos, &thisChar);
|
||||
nextCPPos = nextPos = fText->moveIndex32(pos, 1);
|
||||
c = fText->char32At(nextPos);
|
||||
rule7Adjust(pos, &thisChar, &nextPos, &c);
|
||||
|
||||
// If the loop is still warming up - if we haven't shifted the initial
|
||||
// -1 positions out of prevPos yet - loop back to advance the
|
||||
// position in the input without any further looking for breaks.
|
||||
if (prevPos == -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 3a Always break after hard line breaks,
|
||||
if (fBK->contains(prevChar)) {
|
||||
break;
|
||||
|
@ -2661,33 +2641,6 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
continue;
|
||||
}
|
||||
|
||||
// LB 10 QU SP* x OP
|
||||
if (prevPos >= 0) {
|
||||
UnicodeString subStr10(*fText, prevPos);
|
||||
fLB10Matcher->reset(subStr10);
|
||||
status = U_ZERO_ERROR;
|
||||
if (fLB10Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
|
||||
// TODO: Check status codes
|
||||
pos = prevPos + fLB10Matcher->start(1, status);
|
||||
nextPos = prevPos + fLB10Matcher->end(0, status);
|
||||
thisChar = fText->char32At(pos);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// LB 11 CL SP* x NS
|
||||
if (prevPos >= 0) {
|
||||
UnicodeString subStr11(*fText, prevPos);
|
||||
fLB11Matcher->reset(subStr11);
|
||||
status = U_ZERO_ERROR;
|
||||
if (fLB11Matcher->lookingAt(status)) { // /QU CM* SP* (OP) CM*/;
|
||||
// TODO: Check status codes
|
||||
pos = prevPos + fLB11Matcher->start(1, status);
|
||||
nextPos = prevPos + fLB11Matcher->end(0, status);
|
||||
thisChar = fText->char32At(pos);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// LB 4 Don't break before spaces or zero-width space.
|
||||
if (fSP->contains(thisChar)) {
|
||||
|
@ -2703,37 +2656,8 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
break;
|
||||
}
|
||||
|
||||
// LB LB 7
|
||||
rule67Adjust(prevPos, &prevChar, &pos, &thisChar);
|
||||
|
||||
nextCPPos = fText->moveIndex32(pos, 1);
|
||||
nextPos = nextCPPos;
|
||||
c = fText->char32At(nextPos);
|
||||
// another percularity of LB 4 - Dont break before space
|
||||
if (fSP->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
rule67Adjust(pos, &thisChar, &nextPos, &c);
|
||||
|
||||
// If the loop is still warming up - if we haven't shifted the initial
|
||||
// -1 positions out of prevPos yet - loop back to advance the
|
||||
// position in the input without any further looking for breaks.
|
||||
if (prevPos == -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Re-apply rules 3c, 4 because these could be affected by having
|
||||
// a new thisChar from doing rule 6 or 7.
|
||||
if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || // 3c
|
||||
fBK->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
if (fSP->contains(thisChar)) { // LB 4
|
||||
continue;
|
||||
}
|
||||
if (fZW->contains(thisChar)) { // LB 4
|
||||
continue;
|
||||
}
|
||||
// LB 7 Already done, at top of loop.
|
||||
//
|
||||
|
||||
|
||||
// LB 8 Don't break before closings.
|
||||
|
@ -2751,7 +2675,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
// Scan backwards, checking for this sequence.
|
||||
// The OP char could include combining marks, so we actually check for
|
||||
// OP CM* SP*
|
||||
// Another Twist: The Rule 67 fixes may have changed a CP CM
|
||||
// Another Twist: The Rule 67 fixes may have changed a SP CM
|
||||
// sequence into a ID char, so before scanning back through spaces,
|
||||
// verify that prevChar is indeed a space. The prevChar variable
|
||||
// may differ from fText[prevPos]
|
||||
|
@ -2769,12 +2693,58 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
}
|
||||
|
||||
|
||||
// LB 11a B2 x B2
|
||||
if (fB2->contains(thisChar) && fB2->contains(prevChar)) {
|
||||
continue;
|
||||
// LB 10 QU SP* x OP
|
||||
if (fSP->contains(prevChar) && fOP->contains(thisChar)) {
|
||||
// Scan backwards from prevChar to see if it is preceded by QU CM* SP*
|
||||
int tPos = prevPos;
|
||||
while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
|
||||
tPos = fText->moveIndex32(tPos, -1);
|
||||
}
|
||||
while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
|
||||
tPos = fText->moveIndex32(tPos, -1);
|
||||
}
|
||||
if (fQU->contains(fText->char32At(tPos))) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// LB 11b
|
||||
|
||||
|
||||
// LB 11 CL SP* x NS
|
||||
// Scan backwards for SP* CM* CL
|
||||
if (fNS->contains(thisChar)) {
|
||||
int tPos = prevPos;
|
||||
while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
|
||||
tPos = fText->moveIndex32(tPos, -1);
|
||||
}
|
||||
while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
|
||||
tPos = fText->moveIndex32(tPos, -1);
|
||||
}
|
||||
if (fCL->contains(fText->char32At(tPos))) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// LB 11a B2 SP* x B2
|
||||
if (fB2->contains(thisChar)) {
|
||||
// Scan backwards, checking for the B2 CM* SP* sequence.
|
||||
tPos = prevPos;
|
||||
if (fSP->contains(prevChar)) {
|
||||
while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
|
||||
tPos=fText->moveIndex32(tPos, -1);
|
||||
}
|
||||
}
|
||||
while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
|
||||
tPos=fText->moveIndex32(tPos, -1);
|
||||
}
|
||||
if (fB2->contains(fText->char32At(tPos))) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// LB 11b
|
||||
// x WJ
|
||||
// WJ x
|
||||
if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
|
||||
|
@ -2786,7 +2756,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
break;
|
||||
}
|
||||
|
||||
// LB 13
|
||||
// LB 13
|
||||
// x GL
|
||||
// GL x
|
||||
if (fGL->contains(thisChar) || fGL->contains(prevChar)) {
|
||||
|
@ -2805,7 +2775,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
break;
|
||||
}
|
||||
|
||||
// LB 15
|
||||
// LB 15
|
||||
if (fBA->contains(thisChar) ||
|
||||
fHY->contains(thisChar) ||
|
||||
fNS->contains(thisChar) ||
|
||||
|
@ -2818,28 +2788,27 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
fID->contains(prevChar) && fIN->contains(thisChar) ||
|
||||
fIN->contains(prevChar) && fIN->contains(thisChar) ||
|
||||
fNU->contains(prevChar) && fIN->contains(thisChar) ) {
|
||||
continue;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// LB 17 ID x PO (Note: Leading CM behaves like ID)
|
||||
// LB 17 ID x PO
|
||||
// AL x NU
|
||||
// NU x AL
|
||||
if (fID->contains(prevChar) && fPO->contains(thisChar) ||
|
||||
fCM->contains(prevChar) && fPO->contains(thisChar) ||
|
||||
fAL->contains(prevChar) && fNU->contains(thisChar) ||
|
||||
fNU->contains(prevChar) && fAL->contains(thisChar) ) {
|
||||
continue;
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 18 Numbers
|
||||
UnicodeString subStr18(*fText, prevPos);
|
||||
fNumberMatcher->reset(subStr18);
|
||||
if (fNumberMatcher->lookingAt(status)) {
|
||||
// TODO: Check status codes
|
||||
if (fNumberMatcher->lookingAt(prevPos, status)) {
|
||||
if (U_FAILURE(status)) {
|
||||
break;
|
||||
}
|
||||
// Matched a number. But could have been just a single digit, which would
|
||||
// not represent a "no break here" between prevChar and thisChar
|
||||
int32_t numEndIdx = prevPos + fNumberMatcher->end(status); // idx of first char following num
|
||||
int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
|
||||
if (numEndIdx > pos) {
|
||||
// Number match includes at least our two chars being checked
|
||||
if (numEndIdx > nextPos) {
|
||||
|
@ -2847,7 +2816,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
// so that next loop iteration will continue at the end of the number,
|
||||
// checking for breaks between last char in number & whatever follows.
|
||||
nextPos = numEndIdx;
|
||||
pos = fCharBI->preceding(numEndIdx);
|
||||
pos = fCharBI->preceding(numEndIdx);
|
||||
thisChar = fText->char32At(pos);
|
||||
while (fCM->contains(thisChar)) {
|
||||
pos = fCharBI->preceding(pos);
|
||||
|
@ -2861,29 +2830,28 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
if (fPR->contains(prevChar) && fAL->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (fPR->contains(prevChar) && fID->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 18b
|
||||
if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
|
||||
fJV->contains(thisChar) ||
|
||||
fH2->contains(thisChar) ||
|
||||
fH3->contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
|
||||
fJV->contains(thisChar) ||
|
||||
fH2->contains(thisChar) ||
|
||||
fH3->contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
|
||||
(fJV->contains(thisChar) || fJT->contains(thisChar))) {
|
||||
continue;
|
||||
(fJV->contains(thisChar) || fJT->contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
|
||||
fJT->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// LB 18c more Korean
|
||||
if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
|
||||
fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
|
||||
|
@ -2902,9 +2870,6 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// LB 19
|
||||
if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
|
||||
continue;
|
||||
|
@ -2917,9 +2882,9 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
|
||||
// LB 20 Break everywhere else
|
||||
break;
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
@ -2970,8 +2935,6 @@ RBBILineMonkey::~RBBILineMonkey() {
|
|||
|
||||
delete fCharBI;
|
||||
delete fNumberMatcher;
|
||||
delete fLB10Matcher;
|
||||
delete fLB11Matcher;
|
||||
}
|
||||
|
||||
|
||||
|
@ -3014,9 +2977,9 @@ static int32_t getIntParam(UnicodeString name, UnicodeString &params, int32_t d
|
|||
}
|
||||
#endif
|
||||
|
||||
static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
||||
static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
||||
BreakIterator *bi,
|
||||
int expected[],
|
||||
int expected[],
|
||||
int expectedcount)
|
||||
{
|
||||
int count = 0;
|
||||
|
@ -3026,7 +2989,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
|||
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
|
||||
forward[count] = i;
|
||||
if (count < expectedcount && expected[count] != i) {
|
||||
test->errln("break forward test failed: expected %d but got %d",
|
||||
test->errln("break forward test failed: expected %d but got %d",
|
||||
expected[count], i);
|
||||
break;
|
||||
}
|
||||
|
@ -3034,7 +2997,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
|||
}
|
||||
if (count != expectedcount) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
test->errln("break test failed: missed %d match",
|
||||
test->errln("break test failed: missed %d match",
|
||||
expectedcount - count);
|
||||
return;
|
||||
}
|
||||
|
@ -3058,7 +3021,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
|||
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
|
||||
count --;
|
||||
if (forward[count] != i) {
|
||||
test->errln("happy break test reverse failed: expected %d but got %d",
|
||||
test->errln("happy break test reverse failed: expected %d but got %d",
|
||||
forward[count], i);
|
||||
break;
|
||||
}
|
||||
|
@ -3079,7 +3042,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
|||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RBBITest::TestWordBreaks(void)
|
||||
|
@ -3091,8 +3054,8 @@ void RBBITest::TestWordBreaks(void)
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
|
||||
UChar str[300];
|
||||
static const char *strlist[] =
|
||||
UChar str[300];
|
||||
static const char *strlist[] =
|
||||
{
|
||||
"\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
|
||||
"\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
|
||||
|
@ -3168,8 +3131,8 @@ void RBBITest::TestWordBoundary(void)
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
|
||||
UChar str[50];
|
||||
static const char *strlist[] =
|
||||
UChar str[50];
|
||||
static const char *strlist[] =
|
||||
{
|
||||
"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
|
||||
"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
|
||||
|
@ -3212,7 +3175,7 @@ void RBBITest::TestWordBoundary(void)
|
|||
UnicodeString ustr(str);
|
||||
int forward[50];
|
||||
int count = 0;
|
||||
|
||||
|
||||
bi->setText(ustr);
|
||||
int prev = 0;
|
||||
int i;
|
||||
|
@ -3223,7 +3186,7 @@ void RBBITest::TestWordBoundary(void)
|
|||
for (j = prev + 1; j < i; j ++) {
|
||||
if (bi->isBoundary(j)) {
|
||||
printStringBreaks(ustr, forward, count);
|
||||
errln("happy boundary test failed: expected %d not a boundary",
|
||||
errln("happy boundary test failed: expected %d not a boundary",
|
||||
j);
|
||||
return;
|
||||
}
|
||||
|
@ -3231,7 +3194,7 @@ void RBBITest::TestWordBoundary(void)
|
|||
}
|
||||
if (!bi->isBoundary(i)) {
|
||||
printStringBreaks(ustr, forward, count);
|
||||
errln("happy boundary test failed: expected %d a boundary",
|
||||
errln("happy boundary test failed: expected %d a boundary",
|
||||
i);
|
||||
return;
|
||||
}
|
||||
|
@ -3247,8 +3210,8 @@ void RBBITest::TestLineBreaks(void)
|
|||
Locale locale("en");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
|
||||
UChar str[50];
|
||||
static const char *strlist[] =
|
||||
UChar str[50];
|
||||
static const char *strlist[] =
|
||||
{
|
||||
"\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
|
||||
"\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
|
||||
|
@ -3324,14 +3287,14 @@ void RBBITest::TestSentBreaks(void)
|
|||
Locale locale("en");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
|
||||
UChar str[100];
|
||||
static const char *strlist[] =
|
||||
UChar str[100];
|
||||
static const char *strlist[] =
|
||||
{
|
||||
"Now\ris\nthe\r\ntime\n\rfor\r\r",
|
||||
"This\n",
|
||||
"Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
|
||||
"\"Sentence ending with a quote.\" Bye.",
|
||||
" (This is it). Testing the sentence iterator. \"This isn't it.\"",
|
||||
" (This is it). Testing the sentence iterator. \"This isn't it.\"",
|
||||
"Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
|
||||
"Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
|
||||
"Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
|
||||
|
@ -3427,7 +3390,7 @@ void RBBITest::TestMonkey(char *params) {
|
|||
RBBILineMonkey m;
|
||||
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
|
||||
if (params == NULL) {
|
||||
loopCount = 50;
|
||||
loopCount = loopCount / 5; // Line break runs slower than the others.
|
||||
}
|
||||
if (U_SUCCESS(status)) {
|
||||
RunMonkey(bi, m, "line", seed, loopCount);
|
||||
|
@ -3642,7 +3605,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
|
|||
for (ci=0; ci<2; ci++) { // Number of items to include in error text.
|
||||
for (;;) {
|
||||
if (endContext >= testText.length()) {break;}
|
||||
if (expectedBreaks[endContext-1] != 0) {
|
||||
if (expectedBreaks[endContext-1] != 0) {
|
||||
if (count == 0) break;
|
||||
count --;
|
||||
}
|
||||
|
@ -3655,7 +3618,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
|
|||
/***if (strcmp(errorType, "next()") == 0) {
|
||||
startContext = 0;
|
||||
endContext = testText.length();
|
||||
|
||||
|
||||
printStringBreaks(testText, expected, expectedCount);
|
||||
}***/
|
||||
|
||||
|
@ -3704,5 +3667,28 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name
|
|||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// TestDebug - A place-holder test for debugging purposes.
|
||||
// For putting in fragments of other tests that can be invoked
|
||||
// for tracing without a lot of unwanted extra stuff happening.
|
||||
//
|
||||
void RBBITest::TestDebug(void) {
|
||||
#if 0
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int pos;
|
||||
|
||||
RuleBasedBreakIterator* bi =
|
||||
// (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
|
||||
(RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
|
||||
UnicodeString s("\\u0E2B\\u0E19\\u0E36\\u0E48\\u0E07\\u0E04\\u0E33");
|
||||
s = s.unescape();
|
||||
bi->setText(s);
|
||||
// bi->last();
|
||||
do {
|
||||
pos = bi->next();
|
||||
printf("%d\n", pos);
|
||||
} while (pos != BreakIterator::DONE);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1999-2004, International Business Machines Corporation and
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1999-2005, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/************************************************************************
|
||||
|
@ -31,20 +31,20 @@ class RBBIMonkeyKind;
|
|||
*/
|
||||
class RBBITest: public IntlTest {
|
||||
public:
|
||||
|
||||
|
||||
RBBITest();
|
||||
virtual ~RBBITest();
|
||||
|
||||
void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
|
||||
|
||||
|
||||
/**
|
||||
* Tests rule status return values
|
||||
**/
|
||||
**/
|
||||
void TestStatusReturn();
|
||||
|
||||
/**
|
||||
* Run the Unicode Line Break test data.
|
||||
**/
|
||||
**/
|
||||
void TestLineBreakData();
|
||||
|
||||
/**
|
||||
|
@ -58,8 +58,8 @@ public:
|
|||
void TestBug4153072();
|
||||
void TestJapaneseLineBreak();
|
||||
void TestThaiLineBreak();
|
||||
void TestMixedThaiLineBreak();
|
||||
void TestMaiyamok();
|
||||
void TestMixedThaiLineBreak();
|
||||
void TestMaiyamok();
|
||||
void TestThaiWordBreak();
|
||||
void TestMonkey(char *params);
|
||||
|
||||
|
@ -73,16 +73,17 @@ public:
|
|||
void TestSentBreaks();
|
||||
void TestBug3818();
|
||||
void TestJapaneseWordBreak();
|
||||
|
||||
|
||||
void TestDebug();
|
||||
|
||||
|
||||
/***********************/
|
||||
private:
|
||||
/**
|
||||
* internal methods to prepare test data
|
||||
**/
|
||||
|
||||
|
||||
/**
|
||||
* Perform tests of BreakIterator forward and backward functionality
|
||||
* Perform tests of BreakIterator forward and backward functionality
|
||||
* on different kinds of iterators (word, sentence, line and character).
|
||||
* It tests the methods first(), next(), current(), preceding(), following()
|
||||
* previous() and isBoundary().
|
||||
|
@ -110,7 +111,7 @@ private:
|
|||
**/
|
||||
void testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td);
|
||||
/**
|
||||
* Internal method to perform tests of BreakIterator multiple selection functionality
|
||||
* Internal method to perform tests of BreakIterator multiple selection functionality
|
||||
* on different kinds of iterators (word, sentence, line and character)
|
||||
**/
|
||||
void doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td);
|
||||
|
|
Loading…
Add table
Reference in a new issue