mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 09:21:03 +00:00
ICU-45 Add word break enum values. Add test for same.
Bug fix: bad tag value returned after BreakIterator::first() X-SVN-Rev: 9438
This commit is contained in:
parent
976c946b19
commit
7de935a168
5 changed files with 88 additions and 7 deletions
|
@ -297,6 +297,8 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
|
|||
*/
|
||||
int32_t RuleBasedBreakIterator::first(void) {
|
||||
reset();
|
||||
fLastBreakTag = 0;
|
||||
fLastBreakTagValid = TRUE;
|
||||
if (fText == NULL)
|
||||
return BreakIterator::DONE;
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include "unicode/unistr.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/ubrk.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
|
|
@ -195,6 +195,29 @@ typedef enum UBreakIteratorType UBreakIteratorType;
|
|||
*/
|
||||
#define UBRK_DONE ((int32_t) -1)
|
||||
|
||||
|
||||
/**
 *  Enum constants for the word break tags returned by
 *  getRuleStatus().  A range of values is defined for each category of
 *  word, to allow for further subdivisions of a category in future releases.
 *  Applications should check for tag values falling within the range, rather
 *  than for single individual values.
 *  <p>
 *  Each range is half-open: a tag belongs to a category when
 *  <code>UBRK_WORD_xxx <= tag < UBRK_WORD_xxx_LIMIT</code>.  The limit of one
 *  category has the same numeric value as the start of the next one.
 */
enum UWordBreak {
    /** Start of the range for text that falls in no other category, such as
     *  the spaces between words.  0 is also the status value reported for
     *  rules that do not specify a status. */
    UBRK_WORD_NONE           = 0,
    /** Exclusive upper bound of the "none" range. */
    UBRK_WORD_NONE_LIMIT     = 100,
    /** Start of the range for words that appear to be numbers. */
    UBRK_WORD_NUMBER         = 100,
    /** Exclusive upper bound of the number range. */
    UBRK_WORD_NUMBER_LIMIT   = 200,
    /** Start of the range for words made of letters. */
    UBRK_WORD_LETTER         = 200,
    /** Exclusive upper bound of the letter range. */
    UBRK_WORD_LETTER_LIMIT   = 300,
    /** Start of the range for words made of Hiragana or Katakana characters. */
    UBRK_WORD_HIRAKATA       = 300,
    /** Exclusive upper bound of the Hiragana/Katakana range. */
    UBRK_WORD_HIRAKATA_LIMIT = 400,
    /** Start of the range for words made of ideographic characters. */
    UBRK_WORD_IDEO           = 400,
    /** Exclusive upper bound of the ideographic range. */
    UBRK_WORD_IDEO_LIMIT     = 500
};
typedef enum UWordBreak UWordBreak;
|
||||
|
||||
|
||||
/**
|
||||
* Open a new UBreakIterator for locating text boundaries for a specified locale.
|
||||
* A UBreakIterator may be used for detecting character, line, word,
|
||||
|
@ -412,6 +435,8 @@ ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
|
|||
* returned break position. The values appear in the rule source
|
||||
* within brackets, {123}, for example. For rules that do not specify a
|
||||
* status, a default value of 0 is returned.
|
||||
* <p>
|
||||
* For word break iterators, the possible values are defined in enum UWordBreak.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ubrk_getRuleStatus(UBreakIterator *bi);
|
||||
|
|
|
@ -139,8 +139,6 @@ void RBBIAPITest::TestgetRules()
|
|||
|
||||
bi1->setText((UnicodeString)"Hello there");
|
||||
|
||||
|
||||
|
||||
RuleBasedBreakIterator* bi3 =(RuleBasedBreakIterator*)bi1->clone();
|
||||
|
||||
UnicodeString temp=bi1->getRules();
|
||||
|
@ -563,6 +561,8 @@ void RBBIAPITest::TestLastPreviousPreceding()
|
|||
delete wordIter1;
|
||||
delete lineIter1;
|
||||
}
|
||||
|
||||
|
||||
void RBBIAPITest::TestIsBoundary(){
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
UnicodeString testString1=CharsToUnicodeString("Write here. \\u092d\\u093e\\u0930\\u0924 \\u0938\\u0941\\u0902\\u0926\\u0930 \\u0939\\u094c\\u0964");
|
||||
|
@ -621,12 +621,12 @@ void RBBIAPITest::TestBuilder() {
|
|||
//
|
||||
void RBBIAPITest::TestQuoteGrouping() {
|
||||
UnicodeString rulesString1 = "#Here comes the rule...\n"
|
||||
"'$@!'*;\n"
|
||||
"'$@!'*;\n" // (\$\@\!)*
|
||||
".;\n";
|
||||
|
||||
UnicodeString testString1 = "$@!X$@!XX";
|
||||
// 01234567890
|
||||
int32_t bounds1[] = {0, 3, 4, 7, 8, 9};
|
||||
UnicodeString testString1 = "$@!$@!X$@!!X";
|
||||
// 0123456789012
|
||||
int32_t bounds1[] = {0, 6, 7, 10, 11, 12};
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
UParseError parseError;
|
||||
|
||||
|
@ -640,6 +640,55 @@ void RBBIAPITest::TestQuoteGrouping() {
|
|||
delete bi;
|
||||
}
|
||||
|
||||
//
|
||||
// TestWordStatus
|
||||
// Test word break rule status constants.
|
||||
//
|
||||
void RBBIAPITest::TestWordStatus() {
|
||||
|
||||
|
||||
UnicodeString testString1 = // Ideographic Katakana Hiragana
|
||||
CharsToUnicodeString("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094");
|
||||
// 012345678901234567 8 9 0 1 2 3 4 5 6
|
||||
int32_t bounds1[] = { 0, 5,6, 10,11, 17,18, 19, 20,21, 23,24, 26};
|
||||
int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,
|
||||
UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,
|
||||
UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,
|
||||
UBRK_WORD_HIRAKATA, UBRK_WORD_NONE, UBRK_WORD_HIRAKATA};
|
||||
|
||||
int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
|
||||
UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
|
||||
UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT,
|
||||
UBRK_WORD_HIRAKATA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_HIRAKATA_LIMIT};
|
||||
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
|
||||
RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
|
||||
if(U_FAILURE(status)) {
|
||||
errln("FAIL : in construction");
|
||||
} else {
|
||||
bi->setText(testString1);
|
||||
// First test that the breaks are in the right spots.
|
||||
doBoundaryTest(*bi, testString1, bounds1);
|
||||
|
||||
// Then go back and check tag values
|
||||
int32_t i = 0;
|
||||
int32_t pos, tag;
|
||||
for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) {
|
||||
if (pos != bounds1[i]) {
|
||||
errln("FAIL: unexpected word break at postion %d", pos);
|
||||
break;
|
||||
}
|
||||
tag = bi->getRuleStatus();
|
||||
if (tag < tag_lo[i] || tag >= tag_hi[i]) {
|
||||
errln("FAIL: incorrect tag value %d at position %d", tag, pos);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
delete bi;
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------
|
||||
// runIndexedTest
|
||||
|
@ -659,6 +708,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
|
|||
case 6: name = "TestIsBoundary"; if (exec) TestIsBoundary(); break;
|
||||
case 7: name = "TestBuilder"; if (exec) TestBuilder(); break;
|
||||
case 8: name = "TestQuoteGrouping"; if (exec) TestQuoteGrouping(); break;
|
||||
case 9: name = "TestWordStatus"; if (exec) TestWordStatus(); break;
|
||||
|
||||
default: name = ""; break; /*needed to end loop*/
|
||||
}
|
||||
|
|
|
@ -68,6 +68,10 @@ public:
|
|||
**/
|
||||
void TestQuoteGrouping();
|
||||
|
||||
/**
|
||||
* Tests word break status returns.
|
||||
*/
|
||||
void TestWordStatus();
|
||||
|
||||
/**
|
||||
*Internal subroutines
|
||||
|
@ -79,7 +83,6 @@ public:
|
|||
void doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expected);
|
||||
|
||||
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Add table
Reference in a new issue