mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-45 RBBI, getRuleStatus() works after previous().
More Tests. Private includes removed from public header. Break rule tag status added to word break rules. X-SVN-Rev: 9284
This commit is contained in:
parent
c17a59a0c2
commit
e32993b2d8
12 changed files with 1041 additions and 906 deletions
|
@ -160,7 +160,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
|
|||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// init() Shared initialization routine. Used by all the constructors.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
|
@ -172,6 +172,7 @@ void RuleBasedBreakIterator::init() {
|
|||
fData = NULL;
|
||||
fCharMappings = NULL;
|
||||
fLastBreakTag = 0;
|
||||
fLastBreakTagValid = TRUE;
|
||||
fDictionaryCharCount = 0;
|
||||
|
||||
if (debugInitDone == FALSE) {
|
||||
|
@ -309,14 +310,23 @@ int32_t RuleBasedBreakIterator::first(void) {
|
|||
*/
|
||||
int32_t RuleBasedBreakIterator::last(void) {
|
||||
reset();
|
||||
if (fText == NULL)
|
||||
if (fText == NULL) {
|
||||
fLastBreakTag = 0;
|
||||
fLastBreakTagValid = TRUE;
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
// I'm not sure why, but t.last() returns the offset of the last character,
|
||||
// rather than the past-the-end offset
|
||||
//
|
||||
// (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
|
||||
// will work correctly.)
|
||||
|
||||
|
||||
fLastBreakTagValid = FALSE;
|
||||
int32_t pos = fText->endIndex();
|
||||
fText->setIndex(pos);
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
@ -356,8 +366,11 @@ int32_t RuleBasedBreakIterator::next(void) {
|
|||
*/
|
||||
int32_t RuleBasedBreakIterator::previous(void) {
|
||||
// if we're already sitting at the beginning of the text, return DONE
|
||||
if (fText == NULL || current() == fText->startIndex())
|
||||
if (fText == NULL || current() == fText->startIndex()) {
|
||||
fLastBreakTag = 0;
|
||||
fLastBreakTagValid = TRUE;
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
// set things up. handlePrevious() will back us up to some valid
|
||||
// break position before the current position (we back our internal
|
||||
|
@ -366,23 +379,43 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
// where we started
|
||||
int32_t start = current();
|
||||
fText->previous32();
|
||||
int32_t lastResult = handlePrevious();
|
||||
int32_t result = lastResult;
|
||||
int32_t lastResult = handlePrevious();
|
||||
int32_t result = lastResult;
|
||||
int32_t lastTag = 0;
|
||||
UBool breakTagValid = FALSE;
|
||||
|
||||
// iterate forward from the known break position until we pass our
|
||||
// starting point. The last break position before the starting
|
||||
// point is our return value
|
||||
while (result != BreakIterator::DONE && result < start) {
|
||||
lastResult = result;
|
||||
result = handleNext();
|
||||
for (;;) {
|
||||
result = handleNext();
|
||||
if (result == BreakIterator::DONE || result >= start) {
|
||||
break;
|
||||
}
|
||||
lastResult = result;
|
||||
lastTag = fLastBreakTag;
|
||||
breakTagValid = TRUE;
|
||||
}
|
||||
|
||||
// fLastBreakTag wants to have the value for section of text preceding
|
||||
// the result position that we are to return (in lastResult.) If
|
||||
// the backwards rules overshot and the above loop had to do two or more
|
||||
// handleNext()s to move up to the desired return position, we will have a valid
|
||||
// tag value. But, if handlePrevious() took us to exactly the correct result position,
|
||||
// we won't have a tag value for that position, which is only set by handleNext().
|
||||
|
||||
|
||||
// set the current iteration position to be the last break position
|
||||
// before where we started, and then return that value
|
||||
fText->setIndex(lastResult);
|
||||
fLastBreakTag = lastTag; // for use by getRuleStatus()
|
||||
fLastBreakTagValid = breakTagValid;
|
||||
return lastResult;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Sets the iterator to refer to the first boundary position following
|
||||
* the specified position.
|
||||
|
@ -393,10 +426,14 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
|||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
fLastBreakTag = 0;
|
||||
fLastBreakTagValid = TRUE;
|
||||
if (fText == NULL || offset >= fText->endIndex()) {
|
||||
// fText->setToEnd();
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
else if (offset < fText->startIndex()) {
|
||||
// fText->setToStart();
|
||||
return fText->startIndex();
|
||||
}
|
||||
|
||||
|
@ -414,9 +451,12 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
|||
// position at or before our starting position. Advance forward
|
||||
// from here until we've passed the starting position. The position
|
||||
// we stop on will be the first break position after the specified one.
|
||||
int32_t result = handlePrevious();
|
||||
while (result != BreakIterator::DONE && result <= offset)
|
||||
result = handleNext();
|
||||
|
||||
int32_t result = previous();
|
||||
while (result != BreakIterator::DONE && result <= offset) {
|
||||
result = next();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -496,9 +536,15 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||
if (fTrace) {
|
||||
printf("Handle Next pos char state category \n");
|
||||
}
|
||||
|
||||
// No matter what, handleNext always correctly sets the break tag value.
|
||||
fLastBreakTagValid = TRUE;
|
||||
|
||||
// if we're already at the end of the text, return DONE.
|
||||
if (fText == NULL || fData == NULL || fText->getIndex() == fText->endIndex())
|
||||
if (fText == NULL || fData == NULL || fText->getIndex() == fText->endIndex()) {
|
||||
fLastBreakTag = 0;
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
// no matter what, we always advance at least one character forward
|
||||
int32_t result = fText->getIndex() + 1;
|
||||
|
@ -619,7 +665,8 @@ continueOn:
|
|||
// (the theory here is that if there are no characters at all after the lookahead
|
||||
// position, that always matches the lookahead criteria)
|
||||
if (c == CharacterIterator::DONE && lookaheadResult == fText->endIndex()) {
|
||||
result = lookaheadResult;
|
||||
result = lookaheadResult;
|
||||
fLastBreakTag = lookaheadTag;
|
||||
}
|
||||
|
||||
|
||||
|
@ -780,7 +827,21 @@ RuleBasedBreakIterator::reset()
|
|||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
int32_t RuleBasedBreakIterator::getRuleStatus() const {
|
||||
return fLastBreakTag;
|
||||
// If the break tag value is unknown, back the iterator up, then move
|
||||
// forward again. Moving forward will set the fLastBreakTag value correctly.
|
||||
RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
|
||||
if (fLastBreakTagValid == FALSE) {
|
||||
if (current() == fText->startIndex()) {
|
||||
nonConstThis->fLastBreakTag = 0;
|
||||
nonConstThis->fLastBreakTagValid = TRUE;
|
||||
} else {
|
||||
int32_t pa = current();
|
||||
nonConstThis->previous();
|
||||
int32_t pb = nonConstThis->next();
|
||||
assert(pa == pb);
|
||||
}
|
||||
}
|
||||
return nonConstThis->fLastBreakTag;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
#include "rbbiscan.h"
|
||||
#include "rbbisetb.h"
|
||||
#include "rbbitblb.h"
|
||||
#include "rbbidata.h"
|
||||
|
||||
#include <stdio.h> // TODO - get rid of this.
|
||||
#include <stdlib.h>
|
||||
|
|
|
@ -16,6 +16,8 @@
|
|||
#include "uvector.h"
|
||||
#include "uhash.h"
|
||||
|
||||
struct UNewTrie;
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//
|
||||
|
@ -126,7 +128,7 @@ private:
|
|||
|
||||
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
|
||||
|
||||
UNewTrie *fTrie; // The mapping TRIE that is the end result of processin
|
||||
UNewTrie *fTrie; // The mapping TRIE that is the end result of processing
|
||||
uint32_t fTrieSize; // the Unicode Sets.
|
||||
|
||||
// Groups correspond to character categories -
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#include "rbbitblb.h"
|
||||
#include "rbbirb.h"
|
||||
#include "rbbisetb.h"
|
||||
#include "rbbidata.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
|
|
@ -18,14 +18,15 @@
|
|||
#include "unicode/brkiter.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/parseerr.h"
|
||||
#include "utrie.h"
|
||||
|
||||
#include "rbbidata.h"
|
||||
struct UTrie;
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
struct RBBIDataHeader;
|
||||
class RuleBasedBreakIteratorTables;
|
||||
class BreakIterator;
|
||||
class RBBIDataWrapper;
|
||||
|
||||
|
||||
|
||||
|
@ -131,6 +132,11 @@ protected:
|
|||
// Rule {tag} value for the most recent match.
|
||||
int32_t fLastBreakTag;
|
||||
|
||||
// Rule tag value valid flag.
|
||||
// Some iterator operations don't intrinsically set the correct tag value.
|
||||
// This flag lets us lazily compute it if we are ever asked for the value.
|
||||
UBool fLastBreakTagValid;
|
||||
|
||||
//
|
||||
// Counter for the number of characters encountered with the "dictionary"
|
||||
// flag set. Normal RBBI iterators don't use it, although the code
|
||||
|
|
|
@ -163,6 +163,19 @@ void UVector::insertElementAt(void* obj, int32_t index, UErrorCode &status) {
|
|||
/* else index out of range */
|
||||
}
|
||||
|
||||
void UVector::insertElementAt(int32_t elem, int32_t index, UErrorCode &status) {
|
||||
// must have 0 <= index <= count
|
||||
if (0 <= index && index <= count && ensureCapacity(count + 1, status)) {
|
||||
for (int32_t i=count; i>index; --i) {
|
||||
elements[i] = elements[i-1];
|
||||
}
|
||||
elements[index].pointer = NULL;
|
||||
elements[index].integer = elem;
|
||||
++count;
|
||||
}
|
||||
/* else index out of range */
|
||||
}
|
||||
|
||||
void* UVector::elementAt(int32_t index) const {
|
||||
return (0 <= index && index < count) ? elements[index].pointer : 0;
|
||||
}
|
||||
|
|
|
@ -149,6 +149,8 @@ public:
|
|||
|
||||
void insertElementAt(void* obj, int32_t index, UErrorCode &status);
|
||||
|
||||
void insertElementAt(int32_t elem, int32_t index, UErrorCode &status);
|
||||
|
||||
void* elementAt(int32_t index) const;
|
||||
|
||||
int32_t elementAti(int32_t index) const;
|
||||
|
|
|
@ -360,4 +360,4 @@ $Openings $GluedWord $Closings $Endings;
|
|||
# TODO: make smarter reverse rules for better efficiency
|
||||
#
|
||||
! . . [^$BK | $CR | $LF]* (. | $LF $CR);
|
||||
! .*;
|
||||
#! .*;
|
||||
|
|
|
@ -6,9 +6,12 @@
|
|||
$Hiragana = [[:L:] & [:Hira:]];
|
||||
$Katakana = [[:L:] & [:Kana:]];
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Definition of $Ideographic is from TR14, Line Breaking.
|
||||
# Definitions imported from LineBreak.
|
||||
# LineBreak gets them from the Unicode Line Break Properties data file.
|
||||
#
|
||||
####################################################################################
|
||||
$Ideographic =
|
||||
[ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
|
||||
\u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
|
||||
|
@ -24,10 +27,23 @@ $Ideographic =
|
|||
\uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
|
||||
\uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
|
||||
\U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];
|
||||
|
||||
$Hyphen = [ \u002D];
|
||||
|
||||
$Numeric = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF
|
||||
\u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F
|
||||
\u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29
|
||||
\u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF];
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# These definitions are from the character break rules.
|
||||
# Definitions imported from Character Break Rules.
|
||||
#
|
||||
####################################################################################
|
||||
$CGJ = [\u034f]; #Combining Grapheme Joiner
|
||||
$Link = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD \u0D4D \u0DCA \u0E3A \u1039 \u17D2];
|
||||
$NotControl = [^[:Zl:] [:Zp:] [:Cc:]]; #Line Separator,
|
||||
|
@ -61,9 +77,6 @@ $Extend = # From UNIDATA/DerivedCoreProperties.txt
|
|||
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
|
||||
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
|
||||
|
||||
#
|
||||
# Korean, also taken from character break rules.
|
||||
#
|
||||
#
|
||||
# Korean Syllable Sequences
|
||||
#
|
||||
|
@ -98,15 +111,24 @@ $LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \u
|
|||
$LVT = [[\uac00-\ud7a3] - $LV];
|
||||
$Hangul_Sequence = ((($L+ $LV?) | ($L* $LV)) $V* $T* ) | ($L* $LVT $T*);
|
||||
|
||||
#
|
||||
# End of Imported Definitions
|
||||
#
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Word Break Rules. Definitions and Rules specific to word break begin
|
||||
# here. Preceding stuff is copied from line or char break rules.
|
||||
#
|
||||
####################################################################################
|
||||
$LineBreak = [$Ideographic $Hiragana $Katakana];
|
||||
$Letter = [[[:L:] [:Sk:]] & [^$LineBreak]];
|
||||
#$MidLetter = [\u0027 \u2019 \u0029 \u00ad \u05f3 \u05f4];
|
||||
$MidLetter = [\u0027 \u2019 \u003a \u0029 \u00ad \u05f3 \u05f4];
|
||||
|
||||
|
||||
|
||||
#
|
||||
# LetterEx - extended letter, includes combining chars, CGJ sequences, Hangul sequences.
|
||||
#
|
||||
$Base = [^[:Cc:] [:Cf:] [:Cs:] [:Co:] [:Cn:] [:Zl:] [:Zp:] $Extend $Link $CGJ];
|
||||
$LetterBase = [:L:];
|
||||
$CGJSequence = $CGJ+ ($Base | $Hangul_Sequence);
|
||||
|
@ -118,29 +140,37 @@ $LetterEx = ($Letter | $Hangul_Sequence) $Extend* ((($LinkSequence | $CG
|
|||
|
||||
#
|
||||
# Numeric Definitions
|
||||
# TODO: More complete handling of $Extend combining chars.
|
||||
#
|
||||
$Numeric = [:Nd:]; #TODO remove FULL WIDTH
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$InfixNumeric = [\u002c \u002e \u003a \u003b \u0589];
|
||||
$PostfixNumeric = [\% \u00a2 \u00b0 \u2030 \u2031 \u2032-\u2037 \u20a7
|
||||
\u2103 \u2109 \u2126 \ufe6a \uff05 \uffe0];
|
||||
$PrefixNumeric = [[[:Sc:] \u002b \u005c \u00b1 \u2116 \u2212 \u2213 \-] - [$PostfixNumeric]];
|
||||
|
||||
$NumericPrefix = $PrefixNumeric $NumericEx ($InfixNumeric $NumericEx)?;
|
||||
$NumericInterior = $NumericEx ($InfixNumeric? $NumericEx)*;
|
||||
$PrefixNumeric? $Hyphen? $InfixNumeric? $NumericEx ($InfixNumeric? $NumericEx)* $InfixNumeric? $PostfixNumeric? {100};
|
||||
|
||||
|
||||
#
|
||||
# The Big Rule. Gloms everything together.
|
||||
# Words. Alpha-numerics,
|
||||
# - must include at least one letter.
|
||||
# - may include both letters and numbers.
|
||||
# - may include certain punctuation, but only between letters, not numbers.
|
||||
#
|
||||
$NumericPrefix? (($LetterEx ($MidLetter $LetterEx)*)? $NumericInterior?)* ($NumericInterior $PostfixNumeric)?;
|
||||
$MidLetterSequence = ($LetterEx $MidLetter $LetterEx);
|
||||
$MidLetNum = $MidLetterSequence | $LetterEx | $NumericEx;
|
||||
|
||||
$MidLetNum* ($LetterEx | $MidLetterSequence) $MidLetNum* {200};
|
||||
|
||||
|
||||
#
|
||||
# Lesser rules
|
||||
# Hiragana and KataKana
|
||||
#
|
||||
($Hiragana $Extend*)+ {300};
|
||||
($Katakana $Extend*)+ {300};
|
||||
|
||||
#
|
||||
# Everything Else.
|
||||
#
|
||||
($Hiragana $Extend*)*;
|
||||
($Katakana $Extend*)*;
|
||||
$NotControl $Extend*;
|
||||
\r\n;
|
||||
.;
|
||||
|
|
|
@ -465,7 +465,6 @@ static UBreakIterator * testOpenRules(char *rules) {
|
|||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* TestBreakIteratorRules - Verify that a break iterator can be created from
|
||||
* a set of source rules.
|
||||
|
@ -515,11 +514,10 @@ static void TestBreakIteratorRules() {
|
|||
pos = ubrk_next(bi);
|
||||
}
|
||||
}
|
||||
|
||||
freeToUCharStrings(&freeHook);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void TestBreakIteratorRuleError() {
|
||||
/*
|
||||
* TestBreakIteratorRuleError - Try to create a BI from rules with syntax errors,
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -19,6 +19,7 @@
|
|||
|
||||
class Vector;
|
||||
class Enumeration;
|
||||
class BITestData;
|
||||
|
||||
/**
|
||||
* Test the RuleBasedBreakIterator class giving different rules
|
||||
|
@ -103,40 +104,32 @@ private:
|
|||
* previous() and isBoundary().
|
||||
* It makes use of internal functions to achieve this.
|
||||
**/
|
||||
void generalIteratorTest(RuleBasedBreakIterator& bi, Vector* expectedResult);
|
||||
void generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td);
|
||||
/**
|
||||
* Internal method to perform iteration and test the first() and next() functions
|
||||
**/
|
||||
Vector* testFirstAndNext(RuleBasedBreakIterator& bi, UnicodeString& text);
|
||||
void testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td);
|
||||
/**
|
||||
* Internal method to perform iteration and test the last() and previous() functions
|
||||
**/
|
||||
Vector* testLastAndPrevious(RuleBasedBreakIterator& bi, UnicodeString& text);
|
||||
void testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td);
|
||||
/**
|
||||
* Internal method to perform iteration and test the following() function
|
||||
**/
|
||||
void testFollowing(RuleBasedBreakIterator& bi, UnicodeString& text, int32_t *boundaries);
|
||||
void testFollowing(RuleBasedBreakIterator& bi, BITestData &td);
|
||||
/**
|
||||
* Internal method to perform iteration and test the preceding() function
|
||||
**/
|
||||
void testPreceding(RuleBasedBreakIterator& bi, UnicodeString& text, int32_t *boundaries);
|
||||
void testPreceding(RuleBasedBreakIterator& bi, BITestData &td);
|
||||
/**
|
||||
* Internal method to perform iteration and test the isBoundary() function
|
||||
**/
|
||||
void testIsBoundary(RuleBasedBreakIterator& bi, UnicodeString& text, int32_t *boundaries);
|
||||
/**
|
||||
* Internal method which does the comparision of expected and got results.
|
||||
**/
|
||||
void compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2);
|
||||
void testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td);
|
||||
/**
|
||||
* Internal method to perform tests of BreakIterator multiple selection functionality
|
||||
* on different kinds of iterators (word, sentence, line and character)
|
||||
**/
|
||||
void doMultipleSelectionTest(RuleBasedBreakIterator& iterator, UnicodeString& testText);
|
||||
/**
|
||||
* Internal method to create test data string from an enumerator
|
||||
**/
|
||||
UnicodeString createTestData(Enumeration* e);
|
||||
void doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td);
|
||||
|
||||
};
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue