ICU-45 Add word break enum values. Add test for same.

Bug fix:  bad tag value returned after BreakIterator::first()

X-SVN-Rev: 9438
This commit is contained in:
Andy Heninger 2002-07-30 19:09:14 +00:00
parent 976c946b19
commit 7de935a168
5 changed files with 88 additions and 7 deletions

View file

@ -297,6 +297,8 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
*/
int32_t RuleBasedBreakIterator::first(void) {
reset();
fLastBreakTag = 0;
fLastBreakTagValid = TRUE;
if (fText == NULL)
return BreakIterator::DONE;

View file

@ -25,6 +25,7 @@
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/locid.h"
#include "unicode/ubrk.h"
U_NAMESPACE_BEGIN

View file

@ -195,6 +195,29 @@ typedef enum UBreakIteratorType UBreakIteratorType;
*/
#define UBRK_DONE ((int32_t) -1)
/**
* Enum constants for the word break tags returned by
* getRuleStatus(). A range of values is defined for each category of
* word, to allow for further subdivisions of a category in future releases.
* Applications should check for tag values falling within the range, rather
* than for single individual values.
*/
enum UWordBreak {
UBRK_WORD_NONE = 0,
UBRK_WORD_NONE_LIMIT = 100,
UBRK_WORD_NUMBER = 100,
UBRK_WORD_NUMBER_LIMIT = 200,
UBRK_WORD_LETTER = 200,
UBRK_WORD_LETTER_LIMIT = 300,
UBRK_WORD_HIRAKATA = 300,
UBRK_WORD_HIRAKATA_LIMIT = 400,
UBRK_WORD_IDEO = 400,
UBRK_WORD_IDEO_LIMIT = 500
};
typedef enum UWordBreak UWordBreak;
/**
* Open a new UBreakIterator for locating text boundaries for a specified locale.
* A UBreakIterator may be used for detecting character, line, word,
@ -412,6 +435,8 @@ ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
* returned break position. The values appear in the rule source
* within brackets, {123}, for example. For rules that do not specify a
* status, a default value of 0 is returned.
* <p>
* For word break iterators, the possible values are defined in enum UWordBreak.
*/
U_CAPI int32_t U_EXPORT2
ubrk_getRuleStatus(UBreakIterator *bi);

View file

@ -139,8 +139,6 @@ void RBBIAPITest::TestgetRules()
bi1->setText((UnicodeString)"Hello there");
RuleBasedBreakIterator* bi3 =(RuleBasedBreakIterator*)bi1->clone();
UnicodeString temp=bi1->getRules();
@ -563,6 +561,8 @@ void RBBIAPITest::TestLastPreviousPreceding()
delete wordIter1;
delete lineIter1;
}
void RBBIAPITest::TestIsBoundary(){
UErrorCode status=U_ZERO_ERROR;
UnicodeString testString1=CharsToUnicodeString("Write here. \\u092d\\u093e\\u0930\\u0924 \\u0938\\u0941\\u0902\\u0926\\u0930 \\u0939\\u094c\\u0964");
@ -621,12 +621,12 @@ void RBBIAPITest::TestBuilder() {
//
void RBBIAPITest::TestQuoteGrouping() {
UnicodeString rulesString1 = "#Here comes the rule...\n"
"'$@!'*;\n"
"'$@!'*;\n" // (\$\@\!)*
".;\n";
UnicodeString testString1 = "$@!X$@!XX";
// 01234567890
int32_t bounds1[] = {0, 3, 4, 7, 8, 9};
UnicodeString testString1 = "$@!$@!X$@!!X";
// 0123456789012
int32_t bounds1[] = {0, 6, 7, 10, 11, 12};
UErrorCode status=U_ZERO_ERROR;
UParseError parseError;
@ -640,6 +640,55 @@ void RBBIAPITest::TestQuoteGrouping() {
delete bi;
}
//
// TestWordStatus
// Test word break rule status constants.
//
void RBBIAPITest::TestWordStatus() {
UnicodeString testString1 = // Ideographic Katakana Hiragana
CharsToUnicodeString("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094");
// 012345678901234567 8 9 0 1 2 3 4 5 6
int32_t bounds1[] = { 0, 5,6, 10,11, 17,18, 19, 20,21, 23,24, 26};
int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,
UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,
UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,
UBRK_WORD_HIRAKATA, UBRK_WORD_NONE, UBRK_WORD_HIRAKATA};
int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT,
UBRK_WORD_HIRAKATA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_HIRAKATA_LIMIT};
UErrorCode status=U_ZERO_ERROR;
RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
if(U_FAILURE(status)) {
errln("FAIL : in construction");
} else {
bi->setText(testString1);
// First test that the breaks are in the right spots.
doBoundaryTest(*bi, testString1, bounds1);
// Then go back and check tag values
int32_t i = 0;
int32_t pos, tag;
for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) {
if (pos != bounds1[i]) {
errln("FAIL: unexpected word break at postion %d", pos);
break;
}
tag = bi->getRuleStatus();
if (tag < tag_lo[i] || tag >= tag_hi[i]) {
errln("FAIL: incorrect tag value %d at position %d", tag, pos);
break;
}
}
}
delete bi;
}
//---------------------------------------------
// runIndexedTest
@ -659,6 +708,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
case 6: name = "TestIsBoundary"; if (exec) TestIsBoundary(); break;
case 7: name = "TestBuilder"; if (exec) TestBuilder(); break;
case 8: name = "TestQuoteGrouping"; if (exec) TestQuoteGrouping(); break;
case 9: name = "TestWordStatus"; if (exec) TestWordStatus(); break;
default: name = ""; break; /*needed to end loop*/
}

View file

@ -68,6 +68,10 @@ public:
**/
void TestQuoteGrouping();
/**
* Tests word break status returns.
*/
void TestWordStatus();
/**
*Internal subroutines
@ -79,7 +83,6 @@ public:
void doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expected);
};
#endif