ICU-45 Add word break enum values. Add test for same.

Bug fix: bad tag value returned after BreakIterator::first().

X-SVN-Rev: 9438
2025-04-14 09:21:03 +00:00 · 2002-07-30 19:09:14 +00:00 · 2002-07-30 19:09:14 +00:00 · 7de935a168
commit 7de935a168
parent 976c946b19
5 changed files with 88 additions and 7 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -297,6 +297,8 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
 */
 int32_t RuleBasedBreakIterator::first(void) {
    reset();
+    fLastBreakTag      = 0;
+    fLastBreakTagValid = TRUE;
    if (fText == NULL)
        return BreakIterator::DONE;

--- a/icu4c/source/common/unicode/brkiter.h
+++ b/icu4c/source/common/unicode/brkiter.h
@ -25,6 +25,7 @@
 #include "unicode/unistr.h"
 #include "unicode/chariter.h"
 #include "unicode/locid.h"
+#include "unicode/ubrk.h"


 U_NAMESPACE_BEGIN
--- a/icu4c/source/common/unicode/ubrk.h
+++ b/icu4c/source/common/unicode/ubrk.h
@ -195,6 +195,29 @@ typedef enum UBreakIteratorType UBreakIteratorType;
 */
 #define UBRK_DONE ((int32_t) -1)

+
+/**
+ *  Enum constants for the word break tags returned by getRuleStatus().
+ *  A range of values is defined for each category of word, to allow for
+ *  further subdivisions of a category in future releases.  Each range is
+ *  half-open: UBRK_WORD_xxx <= tag < UBRK_WORD_xxx_LIMIT.  Applications should
+ *  check for tag values falling within a range, not for single individual values.
+ */
+enum UWordBreak {
+    UBRK_WORD_NONE           = 0,    /* first tag value for non-word text, e.g. spaces */
+    UBRK_WORD_NONE_LIMIT     = 100,  /* exclusive upper limit of the "none" range      */
+    UBRK_WORD_NUMBER         = 100,  /* first tag value for numbers                    */
+    UBRK_WORD_NUMBER_LIMIT   = 200,  /* exclusive upper limit of the number range      */
+    UBRK_WORD_LETTER         = 200,  /* first tag value for words made of letters      */
+    UBRK_WORD_LETTER_LIMIT   = 300,  /* exclusive upper limit of the letter range      */
+    UBRK_WORD_HIRAKATA       = 300,  /* first tag value for Hiragana or Katakana words */
+    UBRK_WORD_HIRAKATA_LIMIT = 400,  /* exclusive upper limit of the kana range        */
+    UBRK_WORD_IDEO           = 400,  /* first tag value for ideographic characters     */
+    UBRK_WORD_IDEO_LIMIT     = 500   /* exclusive upper limit of the ideographic range */
+};
+typedef enum UWordBreak UWordBreak;
+
+
 /**
 * Open a new UBreakIterator for locating text boundaries for a specified locale.
 * A UBreakIterator may be used for detecting character, line, word,
@ -412,6 +435,8 @@ ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
 * returned break position.  The values appear in the rule source
 * within brackets, {123}, for example.  For rules that do not specify a
 * status, a default value of 0 is returned.
+ * <p>
+ * For word break iterators, the possible values are defined in enum UWordBreak.
 */
 U_CAPI  int32_t U_EXPORT2
 ubrk_getRuleStatus(UBreakIterator *bi);
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@ -139,8 +139,6 @@ void RBBIAPITest::TestgetRules()

    bi1->setText((UnicodeString)"Hello there");

-
-
    RuleBasedBreakIterator* bi3 =(RuleBasedBreakIterator*)bi1->clone();

    UnicodeString temp=bi1->getRules();
@ -563,6 +561,8 @@ void RBBIAPITest::TestLastPreviousPreceding()
    delete wordIter1;
    delete lineIter1;
 }
+
+
 void RBBIAPITest::TestIsBoundary(){
    UErrorCode status=U_ZERO_ERROR;
    UnicodeString testString1=CharsToUnicodeString("Write here. \\u092d\\u093e\\u0930\\u0924 \\u0938\\u0941\\u0902\\u0926\\u0930 \\u0939\\u094c\\u0964");
@ -621,12 +621,12 @@ void RBBIAPITest::TestBuilder() {
 //
 void RBBIAPITest::TestQuoteGrouping() {
     UnicodeString rulesString1 = "#Here comes the rule...\n"
-                                  "'$@!'*;\n"
+                                  "'$@!'*;\n"   //  (\$\@\!)*
                                  ".;\n";

-     UnicodeString testString1  = "$@!X$@!XX";
-                                // 01234567890
-     int32_t bounds1[] = {0, 3, 4, 7, 8, 9};
+     UnicodeString testString1  = "$@!$@!X$@!!X";
+                                // 0123456789012
+     int32_t bounds1[] = {0, 6, 7, 10, 11, 12};
     UErrorCode status=U_ZERO_ERROR;
     UParseError    parseError;
     
@ -640,6 +640,55 @@ void RBBIAPITest::TestQuoteGrouping() {
     delete bi;
 }

+//
+//  TestWordStatus
+//      Test word break rule status constants: each boundary's tag must lie in [tag_lo, tag_hi).
+//
+void RBBIAPITest::TestWordStatus() {
+     UnicodeString testString1 =   //                  Ideographic    Katakana       Hiragana
+             CharsToUnicodeString("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094");
+                                // 012345678901234567  8      9    0  1      2    3  4      5    6
+     int32_t bounds1[] =     {     0,   5,6, 10,11, 17,18,  19,   20,21,       23,24,       26};
+     int32_t tag_lo[]  = {UBRK_WORD_NONE,     UBRK_WORD_LETTER, UBRK_WORD_NONE,    UBRK_WORD_LETTER,
+                          UBRK_WORD_NONE,     UBRK_WORD_NUMBER, UBRK_WORD_NONE,
+                          UBRK_WORD_IDEO,     UBRK_WORD_IDEO,   UBRK_WORD_NONE,
+                          UBRK_WORD_HIRAKATA, UBRK_WORD_NONE,   UBRK_WORD_HIRAKATA};
+
+     int32_t tag_hi[]  = {UBRK_WORD_NONE_LIMIT,     UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT,    UBRK_WORD_LETTER_LIMIT,
+                          UBRK_WORD_NONE_LIMIT,     UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
+                          UBRK_WORD_IDEO_LIMIT,     UBRK_WORD_IDEO_LIMIT,   UBRK_WORD_NONE_LIMIT,
+                          UBRK_WORD_HIRAKATA_LIMIT, UBRK_WORD_NONE_LIMIT,   UBRK_WORD_HIRAKATA_LIMIT};
+     // Number of expected boundaries; bounds1, tag_lo and tag_hi are indexed in parallel.
+     const int32_t numBounds = (int32_t)(sizeof(bounds1) / sizeof(bounds1[0]));
+
+     UErrorCode status=U_ZERO_ERROR;
+     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getDefault(), status);
+     if(U_FAILURE(status)) {
+         errln("FAIL : in construction");
+     } else {
+         bi->setText(testString1);
+         // First test that the breaks are in the right spots.
+         doBoundaryTest(*bi, testString1, bounds1);
+
+         // Then go back and check tag values
+         int32_t i = 0;
+         int32_t pos, tag;
+         for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) {
+             // Check i first so that extra, unexpected breaks never index past the arrays.
+             if (i >= numBounds || pos != bounds1[i]) {
+                 errln("FAIL: unexpected word break at position %d", pos);
+                 break;
+             }
+             tag = bi->getRuleStatus();
+             if (tag < tag_lo[i] || tag >= tag_hi[i]) {
+                 errln("FAIL: incorrect tag value %d at position %d", tag, pos);
+                 break;
+             }
+         }
+     }
+     delete bi;
+}
+

 //---------------------------------------------
 // runIndexedTest
@ -659,6 +708,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
        case 6: name = "TestIsBoundary"; if (exec) TestIsBoundary(); break;
        case 7: name = "TestBuilder"; if (exec) TestBuilder(); break;
        case 8: name = "TestQuoteGrouping"; if (exec) TestQuoteGrouping(); break;
+        case 9: name = "TestWordStatus"; if (exec) TestWordStatus(); break;
                   
        default: name = ""; break; /*needed to end loop*/
    }
--- a/icu4c/source/test/intltest/rbbiapts.h
+++ b/icu4c/source/test/intltest/rbbiapts.h
@ -68,6 +68,10 @@ public:
     **/
   void TestQuoteGrouping();

+    /**
+     *  Tests word break status returns.
+     */
+    void TestWordStatus();

    /**
     *Internal subroutines
@ -79,7 +83,6 @@ public:
    void doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expected);


-
 };

 #endif