ICU-1433 getTailoredSet (C/C++) now returns all the canonically equivalent sequences. Also added tests.

X-SVN-Rev: 9867
2025-04-07 06:25:30 +00:00 · 2002-09-17 06:27:51 +00:00 · 2002-09-17 06:27:51 +00:00 · 150ad44972
commit 150ad44972
parent 0fb205837f
8 changed files with 132 additions and 23 deletions
--- a/icu4c/source/i18n/tblcoll.cpp
+++ b/icu4c/source/i18n/tblcoll.cpp
@ -321,21 +321,13 @@ void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer)
    }
 }

-UnicodeSet 
+UnicodeSet *
 RuleBasedCollator::getTailoredSet(UErrorCode &status) const
 {
  if(U_FAILURE(status)) {
-    return UnicodeSet();
-  }
-  USet *set = ucol_getTailoredSet(this->ucollator, &status);
-  if(U_SUCCESS(status)) {
-    UnicodeSet result(*(const UnicodeSet *)set);
-    UnicodeString pattern;
-    uset_close(set);
-    return result;
-  } else {
-    return UnicodeSet();
+    return NULL;
  }
+  return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status);
 }


--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -23,6 +23,7 @@
 #include "unicode/unorm.h"
 #include "unicode/udata.h"
 #include "unicode/uchar.h"
+#include "unicode/caniter.h"

 #include "ucol_bld.h"
 #include "ucol_imp.h"
@ -6729,17 +6730,33 @@ ucol_getTailoredSet(const UCollator *coll, UErrorCode *status)
  const UChar *rules = ucol_getRules(coll, &rulesLen);
  const UChar *current = NULL;
  UBool startOfRules = TRUE;
-  USet *tailored = uset_open(1, 0);
+  // we internally use the C++ class, for the following reasons:
+  // 1. we need to utilize canonical iterator, which is a C++ only class
+  // 2. canonical iterator returns UnicodeStrings - USet cannot take them
+  // 3. USet is internally really UnicodeSet, C is just a wrapper
+  UnicodeSet *tailored = new UnicodeSet();
  UnicodeString pattern;
+  CanonicalIterator it("", *status);

+
+  // The idea is to tokenize the rule set. For each non-reset token,
+  // we add all the canonically equivalent FCD sequences
  ucol_tok_initTokenList(&src, rules, rulesLen, UCA, status);
  while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError, status)) != NULL) {
    startOfRules = FALSE;
    if(src.parsedToken.strength != UCOL_TOK_RESET) {
      const UChar *stuff = src.source+(src.parsedToken.charsOffset);
-      uset_addString(tailored, stuff, src.parsedToken.charsLen);
+      it.setSource(UnicodeString(stuff, src.parsedToken.charsLen), *status);
+      pattern = it.next();
+      while(!pattern.isBogus()) {
+        if(Normalizer::quickCheck(pattern, UNORM_FCD, *status) != UNORM_NO) {
+          tailored->add(pattern);
+        }
+        pattern = it.next();
+      }
    }
  }
  ucol_tok_closeTokenList(&src);
-  return tailored;
+  return (USet *)tailored;
 }
+
--- a/icu4c/source/i18n/unicode/coll.h
+++ b/icu4c/source/i18n/unicode/coll.h
@ -608,13 +608,15 @@ public:
  virtual uint32_t getVariableTop(UErrorCode &status) const = 0;

  /**
-   * Get an UnicodeSet that contains all the characters and sequences tailored in 
-   * this collator.
+   * Get a UnicodeSet that contains all the characters and sequences 
+   * tailored in this collator.
   * @param status      error code of the operation
-   * @return an UnicodeSet object containing all the tailored code points and sequences
+   * @return a pointer to a UnicodeSet object containing all the 
+   *         code points and sequences that may sort differently than
+   *         in the UCA. The object must be disposed of by using delete
   * @draft ICU 2.4
   */
-  virtual UnicodeSet getTailoredSet(UErrorCode &status) const;
+  virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;


  /**
@ -785,10 +787,13 @@ inline UBool Collator::operator!=(const Collator& other) const
  return (UBool)!(*this == other);
 }

-inline UnicodeSet Collator::getTailoredSet(UErrorCode &status) const
+inline UnicodeSet *Collator::getTailoredSet(UErrorCode &status) const
 {
-  status = U_UNSUPPORTED_ERROR;
-  return UnicodeSet();
+  if(U_FAILURE(status)) {
+    return NULL;
+  }
+  // a generic Collator may tailor anything, so report every code point
+  return new UnicodeSet(0, 0x10FFFF);
 }

 /*
--- a/icu4c/source/i18n/unicode/tblcoll.h
+++ b/icu4c/source/i18n/unicode/tblcoll.h
@ -495,10 +495,12 @@ public:
   * Get an UnicodeSet that contains all the characters and sequences tailored in 
   * this collator.
   * @param status      error code of the operation
-   * @return an UnicodeSet object containing all the tailored code points and sequences
+   * @return a pointer to a UnicodeSet object containing all the 
+   *         code points and sequences that may sort differently than
+   *         in the UCA. The object must be disposed of by using delete
   * @draft ICU 2.4
   */
-  virtual UnicodeSet getTailoredSet(UErrorCode &status) const;
+  virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;

  /**
   * Thread safe cloning operation.
--- a/icu4c/source/test/cintltst/capitst.c
+++ b/icu4c/source/test/cintltst/capitst.c
@ -45,6 +45,7 @@ void addCollAPITest(TestNode** root)
    addTest(root, &TestGetLocale, "tscoll/capitst/TestGetLocale");    
    addTest(root, &TestSortKeyBufferOverrun, "tscoll/capitst/TestSortKeyBufferOverrun");
    addTest(root, &TestAttribute, "tscoll/capitst/TestAttribute");
+    addTest(root, &TestGetTailoredSet, "tscoll/capitst/TestGetTailoredSet");

 }

@ -1497,3 +1498,44 @@ static void TestAttribute()

    ucol_close(coll);
 }
+
+/*
+ * Verifies ucol_getTailoredSet(): for each rule string in setTest, a collator
+ * is opened from the rules and its tailored set must have exactly testsize
+ * elements and contain every (unescaped) string listed in tests.
+ */
+void TestGetTailoredSet(void) {
+  struct {
+    const char *rules;      /* escaped rule string, passed through u_unescape */
+    const char *tests[20];  /* escaped strings expected to be in the set */
+    int32_t testsize;       /* number of used entries in tests */
+  } setTest[] = {
+    { "&a < \\u212b", { "\\u212b", "A\\u030a", "\\u00c5" }, 3},
+    { "& S < \\u0161 <<< \\u0160", { "\\u0161", "s\\u030C", "\\u0160", "S\\u030C" }, 4}
+  };
+  const int32_t setTestCount = (int32_t)(sizeof(setTest)/sizeof(setTest[0]));
+
+  int32_t i = 0, j = 0;
+  int32_t buffLen = 0;
+  UParseError pError;
+  UChar buff[1024];
+
+  for(i = 0; i < setTestCount; i++) {
+    /* fresh state for every case, so one failing rule set cannot make
+     * all the following cases fail as well */
+    UErrorCode status = U_ZERO_ERROR;
+    UCollator *coll = NULL;
+    USet *set = NULL;
+
+    buffLen = u_unescape(setTest[i].rules, buff, 1024);
+    coll = ucol_openRules(buff, buffLen, UCOL_DEFAULT, UCOL_DEFAULT, &pError, &status);
+    if(U_SUCCESS(status)) {
+      set = ucol_getTailoredSet(coll, &status);
+      if(U_FAILURE(status) || set == NULL) {
+        /* do not query a set that could not be built */
+        log_err("Couldn't get tailored set for rules %s: %s\n", setTest[i].rules, u_errorName(status));
+      } else {
+        if(uset_size(set) != setTest[i].testsize) {
+          log_err("Tailored set size different (%d) than expected (%d)\n", uset_size(set), setTest[i].testsize);
+        }
+        for(j = 0; j < setTest[i].testsize; j++) {
+          buffLen = u_unescape(setTest[i].tests[j], buff, 1024);
+          if(!uset_containsString(set, buff, buffLen)) {
+            log_err("Tailored set doesn't contain %s... It should\n", setTest[i].tests[j]);
+          }
+        }
+      }
+      /* a set may be handed back even when status reports a failure */
+      if(set != NULL) {
+        uset_close(set);
+      }
+    } else {
+      log_err("Couldn't open collator with rules %s\n", setTest[i].rules);
+    }
+    ucol_close(coll);
+  }
+}
--- a/icu4c/source/test/cintltst/capitst.h
+++ b/icu4c/source/test/cintltst/capitst.h
@ -86,5 +86,9 @@
     * Test getting and setting of attributes
     */
    void TestGetSetAttr(void);
+    /**
+     * Test getTailoredSet
+     */
+    void TestGetTailoredSet(void);

 #endif
--- a/icu4c/source/test/intltest/apicoll.cpp
+++ b/icu4c/source/test/intltest/apicoll.cpp
@ -1661,6 +1661,47 @@ void CollationAPITest::TestBounds(void) {
    delete coll;
 }

+
+/**
+ * Verifies RuleBasedCollator::getTailoredSet(): for each rule string in
+ * setTest, a collator is built from the rules and its tailored set must have
+ * exactly testsize elements and contain every (unescaped) string in tests.
+ */
+void CollationAPITest::TestGetTailoredSet()
+{
+  struct {
+    const char *rules;      // escaped rule string
+    const char *tests[20];  // escaped strings expected to be in the set
+    int32_t testsize;       // number of used entries in tests
+  } setTest[] = {
+    { "&a < \\u212b", { "\\u212b", "A\\u030a", "\\u00c5" }, 3},
+    { "& S < \\u0161 <<< \\u0160", { "\\u0161", "s\\u030C", "\\u0160", "S\\u030C" }, 4}
+  };
+  const int32_t setTestCount = (int32_t)(sizeof(setTest)/sizeof(setTest[0]));
+
+  int32_t i = 0, j = 0;
+  UnicodeString buff;
+
+  for(i = 0; i < setTestCount; i++) {
+    // fresh state for every case, so one failing rule set cannot make
+    // all the following cases fail as well
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeSet *set = NULL;
+
+    buff = UnicodeString(setTest[i].rules, "").unescape();
+    RuleBasedCollator *coll = new RuleBasedCollator(buff, status);
+    if(coll != NULL && U_SUCCESS(status)) {
+      set = coll->getTailoredSet(status);
+      if(U_FAILURE(status) || set == NULL) {
+        // getTailoredSet returns NULL on failure; do not dereference it
+        errln("Couldn't get tailored set for rules %s: %s", setTest[i].rules, u_errorName(status));
+      } else {
+        if(set->size() != setTest[i].testsize) {
+          errln("Tailored set size different (%d) than expected (%d)", set->size(), setTest[i].testsize);
+        }
+        for(j = 0; j < setTest[i].testsize; j++) {
+          buff = UnicodeString(setTest[i].tests[j], "").unescape();
+          if(!set->contains(buff)) {
+            errln("Tailored set doesn't contain %s... It should", setTest[i].tests[j]);
+          }
+        }
+      }
+      delete set;
+    } else {
+      errln("Couldn't open collator with rules %s\n", setTest[i].rules);
+    }
+    delete coll;
+  }
+}
+
 void CollationAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */)
 {
    if (exec) logln("TestSuite CollationAPITest: ");
@ -1684,6 +1725,7 @@ void CollationAPITest::runIndexedTest( int32_t index, UBool exec, const char* &n
        case 16: name = "TestRules"; if (exec) TestRules(); break;
        case 17: name = "TestGetLocale"; if (exec) TestGetLocale(); break;
        case 18: name = "TestBounds"; if (exec) TestBounds(); break;
+        case 19: name = "TestGetTailoredSet"; if (exec) TestGetTailoredSet(); break;
        default: name = ""; break;
    }
 }
--- a/icu4c/source/test/intltest/apicoll.h
+++ b/icu4c/source/test/intltest/apicoll.h
@ -140,6 +140,11 @@ public:
    */
    void TestBounds();

+    /**
+    * Tests getTailoredSet API
+    */
+    void TestGetTailoredSet();
+
 private:
    // If this is too small for the test data, just increase it.
    // Just don't make it too large, otherwise the executable will get too big