ICU-45 RBBI, getRuleStatus() works after previous().

More Tests.
Private includes removed from public header
Break rule tag status added to word break rules.

X-SVN-Rev: 9284
This commit is contained in:
Andy Heninger 2002-07-22 22:02:08 +00:00
parent c17a59a0c2
commit e32993b2d8
12 changed files with 1041 additions and 906 deletions

View file

@ -160,7 +160,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
//-----------------------------------------------------------------------------
//
//
// init() Shared initialization routine. Used by all the constructors.
//
//-----------------------------------------------------------------------------
@ -172,6 +172,7 @@ void RuleBasedBreakIterator::init() {
fData = NULL;
fCharMappings = NULL;
fLastBreakTag = 0;
fLastBreakTagValid = TRUE;
fDictionaryCharCount = 0;
if (debugInitDone == FALSE) {
@ -309,14 +310,23 @@ int32_t RuleBasedBreakIterator::first(void) {
*/
int32_t RuleBasedBreakIterator::last(void) {
reset();
if (fText == NULL)
if (fText == NULL) {
fLastBreakTag = 0;
fLastBreakTagValid = TRUE;
return BreakIterator::DONE;
}
// I'm not sure why, but t.last() returns the offset of the last character,
// rather than the past-the-end offset
//
// (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
// will work correctly.)
fLastBreakTagValid = FALSE;
int32_t pos = fText->endIndex();
fText->setIndex(pos);
return pos;
}
@ -356,8 +366,11 @@ int32_t RuleBasedBreakIterator::next(void) {
*/
int32_t RuleBasedBreakIterator::previous(void) {
// if we're already sitting at the beginning of the text, return DONE
if (fText == NULL || current() == fText->startIndex())
if (fText == NULL || current() == fText->startIndex()) {
fLastBreakTag = 0;
fLastBreakTagValid = TRUE;
return BreakIterator::DONE;
}
// set things up. handlePrevious() will back us up to some valid
// break position before the current position (we back our internal
@ -366,23 +379,43 @@ int32_t RuleBasedBreakIterator::previous(void) {
// where we started
int32_t start = current();
fText->previous32();
int32_t lastResult = handlePrevious();
int32_t result = lastResult;
int32_t lastResult = handlePrevious();
int32_t result = lastResult;
int32_t lastTag = 0;
UBool breakTagValid = FALSE;
// iterate forward from the known break position until we pass our
// starting point. The last break position before the starting
// point is our return value
while (result != BreakIterator::DONE && result < start) {
lastResult = result;
result = handleNext();
for (;;) {
result = handleNext();
if (result == BreakIterator::DONE || result >= start) {
break;
}
lastResult = result;
lastTag = fLastBreakTag;
breakTagValid = TRUE;
}
// fLastBreakTag wants to have the value for the section of text preceding
// the result position that we are to return (in lastResult). If
// the backwards rules overshot and the above loop had to do two or more
// handleNext()s to move up to the desired return position, we will have a valid
// tag value. But if handlePrevious() took us to exactly the correct result position,
// we won't have a tag value for that position, which is only set by handleNext().
// set the current iteration position to be the last break position
// before where we started, and then return that value
fText->setIndex(lastResult);
fLastBreakTag = lastTag; // for use by getRuleStatus()
fLastBreakTagValid = breakTagValid;
return lastResult;
}
/**
* Sets the iterator to refer to the first boundary position following
* the specified position.
@ -393,10 +426,14 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
fLastBreakTag = 0;
fLastBreakTagValid = TRUE;
if (fText == NULL || offset >= fText->endIndex()) {
// fText->setToEnd();
return BreakIterator::DONE;
}
else if (offset < fText->startIndex()) {
// fText->setToStart();
return fText->startIndex();
}
@ -414,9 +451,12 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
// position at or before our starting position. Advance forward
// from here until we've passed the starting position. The position
// we stop on will be the first break position after the specified one.
int32_t result = handlePrevious();
while (result != BreakIterator::DONE && result <= offset)
result = handleNext();
int32_t result = previous();
while (result != BreakIterator::DONE && result <= offset) {
result = next();
}
return result;
}
@ -496,9 +536,15 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
if (fTrace) {
printf("Handle Next pos char state category \n");
}
// No matter what, handleNext always correctly sets the break tag value.
fLastBreakTagValid = TRUE;
// if we're already at the end of the text, return DONE.
if (fText == NULL || fData == NULL || fText->getIndex() == fText->endIndex())
if (fText == NULL || fData == NULL || fText->getIndex() == fText->endIndex()) {
fLastBreakTag = 0;
return BreakIterator::DONE;
}
// no matter what, we always advance at least one character forward
int32_t result = fText->getIndex() + 1;
@ -619,7 +665,8 @@ continueOn:
// (the theory here is that if there are no characters at all after the lookahead
// position, that always matches the lookahead criteria)
if (c == CharacterIterator::DONE && lookaheadResult == fText->endIndex()) {
result = lookaheadResult;
result = lookaheadResult;
fLastBreakTag = lookaheadTag;
}
@ -780,7 +827,21 @@ RuleBasedBreakIterator::reset()
//
//-------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::getRuleStatus() const {
return fLastBreakTag;
// If the break tag value is unknown, back the iterator up, then move
// forward again. Moving forward will set the fLastBreakTag value correctly.
RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
if (fLastBreakTagValid == FALSE) {
if (current() == fText->startIndex()) {
nonConstThis->fLastBreakTag = 0;
nonConstThis->fLastBreakTagValid = TRUE;
} else {
int32_t pa = current();
nonConstThis->previous();
int32_t pb = nonConstThis->next();
assert(pa == pb);
}
}
return nonConstThis->fLastBreakTag;
}

View file

@ -27,6 +27,7 @@
#include "rbbiscan.h"
#include "rbbisetb.h"
#include "rbbitblb.h"
#include "rbbidata.h"
#include <stdio.h> // TODO - getrid of this.
#include <stdlib.h>

View file

@ -16,6 +16,8 @@
#include "uvector.h"
#include "uhash.h"
struct UNewTrie;
U_NAMESPACE_BEGIN
//
@ -126,7 +128,7 @@ private:
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
UNewTrie *fTrie; // The mapping TRIE that is the end result of processin
UNewTrie *fTrie; // The mapping TRIE that is the end result of processing
uint32_t fTrieSize; // the Unicode Sets.
// Groups correspond to character categories -

View file

@ -14,6 +14,7 @@
#include "rbbitblb.h"
#include "rbbirb.h"
#include "rbbisetb.h"
#include "rbbidata.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

View file

@ -18,14 +18,15 @@
#include "unicode/brkiter.h"
#include "unicode/udata.h"
#include "unicode/parseerr.h"
#include "utrie.h"
#include "rbbidata.h"
struct UTrie;
U_NAMESPACE_BEGIN
struct RBBIDataHeader;
class RuleBasedBreakIteratorTables;
class BreakIterator;
class RBBIDataWrapper;
@ -131,6 +132,11 @@ protected:
// Rule {tag} value for the most recent match.
int32_t fLastBreakTag;
// Rule tag value valid flag.
// Some iterator operations don't intrinsically set the correct tag value.
// This flag lets us lazily compute it if we are ever asked for the value.
UBool fLastBreakTagValid;
//
// Counter for the number of characters encountered with the "dictionary"
// flag set. Normal RBBI iterators don't use it, although the code

View file

@ -163,6 +163,19 @@ void UVector::insertElementAt(void* obj, int32_t index, UErrorCode &status) {
/* else index out of range */
}
/**
 * Inserts the integer value elem at position index, moving the elements
 * currently at index and beyond up by one slot.
 *
 * Nothing is inserted when index lies outside [0, count] (index == count
 * appends at the end) or when ensureCapacity() returns false.
 *
 * @param elem   integer value to store in the new slot
 * @param index  insertion position; must satisfy 0 <= index <= count
 * @param status error code, passed through to ensureCapacity()
 */
void UVector::insertElementAt(int32_t elem, int32_t index, UErrorCode &status) {
    // Range check comes first so that ensureCapacity() is never invoked
    // for an invalid index.
    if (index < 0 || index > count) {
        return;
    }
    if (!ensureCapacity(count + 1, status)) {
        return;
    }
    // Open a gap at index by shifting the tail up, working from the end.
    for (int32_t slot = count; slot > index; --slot) {
        elements[slot] = elements[slot - 1];
    }
    // NOTE(review): the pointer member is cleared before the integer is
    // stored; presumably both members share storage and this zeroes any
    // bits the integer does not cover -- confirm against the element type.
    elements[index].pointer = NULL;
    elements[index].integer = elem;
    ++count;
}
/**
 * Returns the pointer stored at position index.
 *
 * @param index position to read; valid positions are 0 <= index < count
 * @return the stored pointer, or 0 when index is out of range
 */
void* UVector::elementAt(int32_t index) const {
    if (index < 0 || index >= count) {
        return 0;
    }
    return elements[index].pointer;
}

View file

@ -149,6 +149,8 @@ public:
void insertElementAt(void* obj, int32_t index, UErrorCode &status);
void insertElementAt(int32_t elem, int32_t index, UErrorCode &status);
void* elementAt(int32_t index) const;
int32_t elementAti(int32_t index) const;

View file

@ -360,4 +360,4 @@ $Openings $GluedWord $Closings $Endings;
# TODO: make smarter reverse rules for better efficiency
#
! . . [^$BK | $CR | $LF]* (. | $LF $CR);
! .*;
#! .*;

View file

@ -6,9 +6,12 @@
$Hiragana = [[:L:] & [:Hira:]];
$Katakana = [[:L:] & [:Kana:]];
####################################################################################
#
# Definition of $Ideographic is from TR14, Line Breaking.
# Definitions imported from LineBreak.
# LineBreak gets them from the Unicode Line Break Properties data file.
#
####################################################################################
$Ideographic =
[ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
\u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
@ -24,10 +27,23 @@ $Ideographic =
\uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
\uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
\U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];
$Hyphen = [ \u002D];
$Numeric = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF
\u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F
\u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29
\u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF];
####################################################################################
#
# These definitions are from the character break rules.
# Definitions imported from Character Break Rules.
#
####################################################################################
$CGJ = [\u034f]; #Combining Grapheme Joiner
$Link = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD \u0D4D \u0DCA \u0E3A \u1039 \u17D2];
$NotControl = [^[:Zl:] [:Zp:] [:Cc:]]; #Line Separator,
@ -61,9 +77,6 @@ $Extend = # From UNIDATA/DerivedCoreProperties.txt
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
#
# Korean, also taken from character break rules.
#
#
# Korean Syllable Sequences
#
@ -98,15 +111,24 @@ $LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \u
$LVT = [[\uac00-\ud7a3] - $LV];
$Hangul_Sequence = ((($L+ $LV?) | ($L* $LV)) $V* $T* ) | ($L* $LVT $T*);
#
# End of Imported Definitions
#
####################################################################################
#
# Word Break Rules. Definitions and Rules specific to word break begin
# Here. Preceding stuff is copied from line or char break rules.
#
####################################################################################
$LineBreak = [$Ideographic $Hiragana $Katakana];
$Letter = [[[:L:] [:Sk:]] & [^$LineBreak]];
#$MidLetter = [\u0027 \u2019 \u0029 \u00ad \u05f3 \u05f4];
$MidLetter = [\u0027 \u2019 \u003a \u0029 \u00ad \u05f3 \u05f4];
#
# LetterEx - extended letter, includes combining chars, CGJ sequences, Hangul sequences.
#
$Base = [^[:Cc:] [:Cf:] [:Cs:] [:Co:] [:Cn:] [:Zl:] [:Zp:] $Extend $Link $CGJ];
$LetterBase = [:L:];
$CGJSequence = $CGJ+ ($Base | $Hangul_Sequence);
@ -118,29 +140,37 @@ $LetterEx = ($Letter | $Hangul_Sequence) $Extend* ((($LinkSequence | $CG
#
# Numeric Definitions
# TODO: More complete handling of $Extend combining chars.
#
$Numeric = [:Nd:]; #TODO remove FULL WIDTH
$NumericEx = $Numeric $Extend*;
$InfixNumeric = [\u002c \u002e \u003a \u003b \u0589];
$PostfixNumeric = [\% \u00a2 \u00b0 \u2030 \u2031 \u2032-\u2037 \u20a7
\u2103 \u2109 \u2126 \ufe6a \uff05 \uffe0];
$PrefixNumeric = [[[:Sc:] \u002b \u005c \u00b1 \u2116 \u2212 \u2213 \-] - [$PostfixNumeric]];
$NumericPrefix = $PrefixNumeric $NumericEx ($InfixNumeric $NumericEx)?;
$NumericInterior = $NumericEx ($InfixNumeric? $NumericEx)*;
$PrefixNumeric? $Hyphen? $InfixNumeric? $NumericEx ($InfixNumeric? $NumericEx)* $InfixNumeric? $PostfixNumeric? {100};
#
# The Big Rule. Gloms everything together.
# Words. Alpha-numerics,
# - must include at least one letter.
# - may include both letters and numbers.
# - may include certain punctuation, but only between letters, not numbers.
#
$NumericPrefix? (($LetterEx ($MidLetter $LetterEx)*)? $NumericInterior?)* ($NumericInterior $PostfixNumeric)?;
$MidLetterSequence = ($LetterEx $MidLetter $LetterEx);
$MidLetNum = $MidLetterSequence | $LetterEx | $NumericEx;
$MidLetNum* ($LetterEx | $MidLetterSequence) $MidLetNum* {200};
#
# Lesser rules
# Hiragana and KataKana
#
($Hiragana $Extend*)+ {300};
($Katakana $Extend*)+ {300};
#
# Everything Else.
#
($Hiragana $Extend*)*;
($Katakana $Extend*)*;
$NotControl $Extend*;
\r\n;
.;

View file

@ -465,7 +465,6 @@ static UBreakIterator * testOpenRules(char *rules) {
}
/*
* TestBreakIteratorRules - Verify that a break iterator can be created from
* a set of source rules.
@ -515,11 +514,10 @@ static void TestBreakIteratorRules() {
pos = ubrk_next(bi);
}
}
freeToUCharStrings(&freeHook);
}
static void TestBreakIteratorRuleError() {
/*
* TestBreakIteratorRuleError - Try to create a BI from rules with syntax errors,

File diff suppressed because it is too large Load diff

View file

@ -19,6 +19,7 @@
class Vector;
class Enumeration;
class BITestData;
/**
* Test the RuleBasedBreakIterator class giving different rules
@ -103,40 +104,32 @@ private:
* previous() and isBoundary().
* It makes use of internal functions to achieve this.
**/
void generalIteratorTest(RuleBasedBreakIterator& bi, Vector* expectedResult);
void generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td);
/**
* Internal method to perform iteration and test the first() and next() functions
**/
Vector* testFirstAndNext(RuleBasedBreakIterator& bi, UnicodeString& text);
void testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td);
/**
* Internal method to perform iteration and test the last() and previous() functions
**/
Vector* testLastAndPrevious(RuleBasedBreakIterator& bi, UnicodeString& text);
void testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td);
/**
* Internal method to perform iteration and test the following() function
**/
void testFollowing(RuleBasedBreakIterator& bi, UnicodeString& text, int32_t *boundaries);
void testFollowing(RuleBasedBreakIterator& bi, BITestData &td);
/**
* Internal method to perform iteration and test the preceding() function
**/
void testPreceding(RuleBasedBreakIterator& bi, UnicodeString& text, int32_t *boundaries);
void testPreceding(RuleBasedBreakIterator& bi, BITestData &td);
/**
* Internal method to perform iteration and test the isBoundary() function
**/
void testIsBoundary(RuleBasedBreakIterator& bi, UnicodeString& text, int32_t *boundaries);
/**
* Internal method which does the comparision of expected and got results.
**/
void compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2);
void testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td);
/**
* Internal method to perform tests of BreakIterator multiple selection functionality
* on different kinds of iterators (word, sentence, line and character)
**/
void doMultipleSelectionTest(RuleBasedBreakIterator& iterator, UnicodeString& testText);
/**
* Internal method to create test data string from an enumerator
**/
UnicodeString createTestData(Enumeration* e);
void doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td);
};