ICU-2093 Adjust the RBBI rule make dependencies for UnicodeSet properties.

Add a check for empty UnicodeSets to the rule builder. X-SVN-Rev: 11476
2025-04-21 04:29:31 +00:00 · 2003-04-09 00:09:14 +00:00 · 2003-04-09 00:09:14 +00:00 · 71070da39f
commit 71070da39f
parent 48eda8bd06
6 changed files with 33 additions and 14 deletions
--- a/icu4c/source/common/putil.c
+++ b/icu4c/source/common/putil.c
@ -1745,7 +1745,8 @@ _uBrkErrorName[U_BRK_ERROR_LIMIT - U_BRK_ERROR_START] = {
    "U_BRK_MISMATCHED_PAREN",
    "U_BRK_NEW_LINE_IN_QUOTED_STRING",
    "U_BRK_UNDEFINED_VARIABLE",
-    "U_BRK_INIT_ERROR"
+    "U_BRK_INIT_ERROR",
+    "U_BRK_RULE_EMPTY_SET"
 };

 static const char * const
--- a/icu4c/source/common/rbbiscan.cpp
+++ b/icu4c/source/common/rbbiscan.cpp
@ -1085,10 +1085,24 @@ void RBBIRuleScanner::scanSet() {
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
        //         UnicodeSet appears to not be reporting correctly at this time.
        RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
-         error(localStatus);
-         return;
+        error(localStatus);
+        delete uset;
+        return;
    }

+    // Verify that the set contains at least one code point.
+    //
+    if (uset->charAt(0) == -1) {
+        // This set is empty.
+        //  Make it an error, because it almost certainly is not what the user wanted.
+        //  This also avoids having to think about corner cases in the tree
+        //   manipulation code that runs later on.
+        error(U_BRK_RULE_EMPTY_SET);
+        delete uset;
+        return;
+    }
+
+
    // Advance the RBBI parse postion over the UnicodeSet pattern.
    //   Don't just set fScanIndex because the line/char positions maintained
    //   for error reporting would be thrown off.
@ -1118,7 +1132,6 @@ void RBBIRuleScanner::scanSet() {

 }

-
 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -612,6 +612,7 @@ typedef enum UErrorCode {
    U_BRK_NEW_LINE_IN_QUOTED_STRING,       /**< Missing closing quote in an RBBI rule.            */
    U_BRK_UNDEFINED_VARIABLE,              /**< Use of an undefined $Variable in an RBBI rule.    */
    U_BRK_INIT_ERROR,                      /**< Initialization failure.  Probable missing ICU Data. */
+    U_BRK_RULE_EMPTY_SET,                  /**< Rule contains an empty Unicode Set.               */
    U_BRK_ERROR_LIMIT,                     /**< This must always be the last value to indicate the limit for Break Iterator failures */

    /*
--- a/icu4c/source/data/brkitr/char.txt
+++ b/icu4c/source/data/brkitr/char.txt
@ -29,10 +29,10 @@ $T   = [:Hangul_Syllable_Type = T:];
 $LV  = [:Hangul_Syllable_Type = LV:];
 $LVT = [:Hangul_Syllable_Type = LVT:];

-$HungulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
+$HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;

 $CR $LF;
-([^$Control] | $HungulSyllable) $Extend*;
+([^$Control] | $HangulSyllable) $Extend*;
 .;


--- a/icu4c/source/data/brkitr/word.txt
+++ b/icu4c/source/data/brkitr/word.txt
@ -42,6 +42,8 @@ $Numeric   = [:LineBreak = Numeric:];
 #  Character Class Definitions.
 #    The names are those from TR29.
 #
+$CR         = \u000d;
+$LF         = \u000a;
 $Control    = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
 $Extend     = [[:Grapheme_Extend = TRUE:]]; 

@ -110,7 +112,7 @@ $KatakanaEx+ {300};
 #                   Controls are do not.
 #
 [^$Control [:Ideographic:]] $Extend*;
-[\u000d][\u000a];
+$CR $LF;

 #
 #  Reverse Rules.   Back up over any of the chars that can group together.
--- a/icu4c/source/data/makedata.mak
+++ b/icu4c/source/data/makedata.mak
@ -274,25 +274,27 @@ $(BRK_FILES:.brk" =.brk"
 #      TODO:  set up an inference rule, so these don't need to be written out one by one...
 #

-"$(ICUBLD)\$(ICUDT)char.brk" : "$(ICUBRK)\char.txt" "$(ICUBLD)\$(ICUDT)uprops.icu"
+BRKDEPS = "$(ICUBLD)\$(ICUDT)uprops.icu" "$(ICUBLD)\$(ICUDT)unames.icu" "$(ICUBLD)\$(ICUDT)pnames.icu" "$(ICUBLD)\$(ICUDT)unorm.icu"
+
+"$(ICUBLD)\$(ICUDT)char.brk" : "$(ICUBRK)\char.txt" $(BRKDEPS)
 	genbrk -r "$(ICUBRK)\char.txt" -o "$(ICUBLD)\$(ICUDT)char.brk" -i "$(ICUBLD)\\"

-"$(ICUBLD)\$(ICUDT)word.brk" : "$(ICUBRK)\word.txt" "$(ICUBLD)\$(ICUDT)uprops.icu"
+"$(ICUBLD)\$(ICUDT)word.brk" : "$(ICUBRK)\word.txt" $(BRKDEPS)
 	genbrk -r "$(ICUBRK)\word.txt" -o "$(ICUBLD)\$(ICUDT)word.brk" -i "$(ICUBLD)\\"

-"$(ICUBLD)\$(ICUDT)line.brk" : "$(ICUBRK)\line.txt" "$(ICUBLD)\$(ICUDT)uprops.icu"
+"$(ICUBLD)\$(ICUDT)line.brk" : "$(ICUBRK)\line.txt" $(BRKDEPS)
 	genbrk -r "$(ICUBRK)\line.txt" -o "$(ICUBLD)\$(ICUDT)line.brk" -i "$(ICUBLD)\\"

-"$(ICUBLD)\$(ICUDT)sent.brk" : "$(ICUBRK)\sent.txt" "$(ICUBLD)\$(ICUDT)uprops.icu"
+"$(ICUBLD)\$(ICUDT)sent.brk" : "$(ICUBRK)\sent.txt" $(BRKDEPS)
 	genbrk -r "$(ICUBRK)\sent.txt" -o "$(ICUBLD)\$(ICUDT)sent.brk" -i "$(ICUBLD)\\"

-"$(ICUBLD)\$(ICUDT)title.brk" : "$(ICUBRK)\title.txt" "$(ICUBLD)\$(ICUDT)uprops.icu"
+"$(ICUBLD)\$(ICUDT)title.brk" : "$(ICUBRK)\title.txt" $(BRKDEPS)
 	genbrk -r "$(ICUBRK)\title.txt" -o "$(ICUBLD)\$(ICUDT)title.brk" -i "$(ICUBLD)\\"

-"$(ICUBLD)\$(ICUDT)word_th.brk" : "$(ICUBRK)\word_th.txt" "$(ICUBLD)\$(ICUDT)uprops.icu"
+"$(ICUBLD)\$(ICUDT)word_th.brk" : "$(ICUBRK)\word_th.txt" $(BRKDEPS)
 	genbrk -r "$(ICUBRK)\word_th.txt" -o "$(ICUBLD)\$(ICUDT)word_th.brk" -i "$(ICUBLD)\\"

-"$(ICUBLD)\$(ICUDT)line_th.brk" : "$(ICUBRK)\line_th.txt" "$(ICUBLD)\$(ICUDT)uprops.icu"
+"$(ICUBLD)\$(ICUDT)line_th.brk" : "$(ICUBRK)\line_th.txt" $(BRKDEPS)
 	genbrk -r "$(ICUBRK)\line_th.txt" -o "$(ICUBLD)\$(ICUDT)line_th.brk" -i "$(ICUBLD)\\"