ICU-7273 compute FC_NFKC_Closure on the fly

X-SVN-Rev: 27534
2025-04-10 07:39:16 +00:00 · 2010-02-10 23:05:39 +00:00 · 2010-02-10 23:05:39 +00:00 · 1cb38e859b
commit 1cb38e859b
parent 537b4141fa
6 changed files with 60 additions and 41 deletions
--- a/icu4c/source/common/normalizer2.cpp
+++ b/icu4c/source/common/normalizer2.cpp
@ -510,6 +510,11 @@ Normalizer2Factory::getNFKC_CFImpl(UErrorCode &errorCode) {
    return allModes!=NULL ? &allModes->impl : NULL;
 }

+const Normalizer2Impl *
+Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
+    return &((Normalizer2WithImpl *)norm2)->impl;  // unchecked downcast: no type test is done on norm2 here
+}
+
 const UTrie2 *
 Normalizer2Factory::getFCDTrie(UErrorCode &errorCode) {
    Norm2AllModes *allModes=
--- a/icu4c/source/common/normalizer2impl.h
+++ b/icu4c/source/common/normalizer2impl.h
@ -487,6 +487,10 @@ public:
    static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
    static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);

+    // Get the Impl instance of the Normalizer2.
+    // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
+    static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
+
    static const UTrie2 *getFCDTrie(UErrorCode &errorCode);
 private:
    Normalizer2Factory();  // No instantiation.
--- a/icu4c/source/common/unorm.cpp
+++ b/icu4c/source/common/unorm.cpp
@ -393,44 +393,6 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
    return FALSE; /* not found */
 }

-U_CAPI int32_t U_EXPORT2
-u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
-    uint16_t aux;
-
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
-        return 0;
-    }
-    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
-        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
-        return 0;
-    }
-    if(_haveData(*pErrorCode) && auxTrie.index!=NULL) {
-        aux=UTRIE2_GET16(&auxTrie, c);
-        aux&=_NORM_AUX_FNC_MASK;
-    } else {
-        aux=0;
-    }
-    if(aux!=0) {
-        const UChar *s;
-        int32_t length;
-
-        s=(const UChar *)(extraData+aux);
-        if(*s<0xff00) {
-            /* s points to the single-unit string */
-            length=1;
-        } else {
-            length=*s&0xff;
-            ++s;
-        }
-        if(0<length && length<=destCapacity) {
-            uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
-        }
-        return u_terminateUChars(dest, destCapacity, length, pErrorCode);
-    } else {
-        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
-    }
-}
-
 U_CAPI void U_EXPORT2
 unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
    UChar c;
--- a/icu4c/source/common/unormcmp.cpp
+++ b/icu4c/source/common/unormcmp.cpp
@ -124,9 +124,6 @@ U_NAMESPACE_USE
 * Note that all of this is only a problem when case-folding _and_
 * canonical equivalence come together.
 * (Comments in unorm_compare() are more up to date than this TODO.)
- *
- * This function could be moved to a different source file, at increased cost
- * for calling the decomposition access function.
 */

 /* stack element for previous-level source/decomposition pointers */
--- a/icu4c/source/common/uprops.cpp
+++ b/icu4c/source/common/uprops.cpp
@ -586,6 +586,55 @@ uprops_getSource(UProperty which) {
    }
 }

+U_CAPI int32_t U_EXPORT2
+u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;  // no error code to report through, or an earlier failure: do nothing
+    }
+    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+    // Compute the FC_NFKC_Closure of c on the fly from case folding and NFKC.
+    // We have the API for complete coverage of Unicode properties, although
+    // this value by itself is not useful via API.
+    // (What could be useful is a custom normalization table that combines
+    // case folding and NFKC.)
+    // For the derivation, see Unicode's DerivedNormalizationProps.txt.
+    const Normalizer2 *nfkc=Normalizer2Factory::getNFKCInstance(*pErrorCode);
+    const UCaseProps *csp=ucase_getSingleton(pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    // first: kc1 = NFKC(Fold(c)), using default case folding
+    UnicodeString folded1String;
+    const UChar *folded1;
+    int32_t folded1Length=ucase_toFullFolding(csp, c, &folded1, U_FOLD_CASE_DEFAULT);
+    if(folded1Length<0) {  // negative result: c itself is used as the folded text below
+        const Normalizer2Impl *nfkcImpl=Normalizer2Factory::getImpl(nfkc);
+        if(nfkcImpl->getCompQuickCheck(nfkcImpl->getNorm16(c))!=UNORM_NO) {
+            return u_terminateUChars(dest, destCapacity, 0, pErrorCode);  // c does not change at all under CaseFolding+NFKC
+        }
+        folded1String.setTo(c);
+    } else {
+        if(folded1Length>UCASE_MAX_STRING_LENGTH) {  // NOTE(review): presumably the result is then the folded code point, not a length - confirm against ucase_toFullFolding()
+            folded1String.setTo(folded1Length);
+        } else {
+            folded1String.setTo(FALSE, folded1, folded1Length);
+        }
+    }
+    UnicodeString kc1=nfkc->normalize(folded1String, *pErrorCode);
+    // second: kc2 = NFKC(Fold(kc1)), folding a copy so that kc1 stays intact for the comparison
+    UnicodeString folded2String(kc1);
+    UnicodeString kc2=nfkc->normalize(folded2String.foldCase(), *pErrorCode);
+    // if (kc2 != kc1) the closure of c is kc2; on error or if they are equal, output the empty string
+    if(U_FAILURE(*pErrorCode) || kc1==kc2) {
+        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
+    } else {
+        return kc2.extract(dest, destCapacity, *pErrorCode);
+    }
+}
+
 /*----------------------------------------------------------------
 * Inclusions list
 *----------------------------------------------------------------*/
--- a/icu4c/source/test/cintltst/cnormtst.c
+++ b/icu4c/source/test/cintltst/cnormtst.c
@ -1342,6 +1342,8 @@ TestFCNFKCClosure(void) {
        UChar32 c;
        const UChar s[6];
    } tests[]={
+        { 0x00C4, { 0 } },
+        { 0x00E4, { 0 } },
        { 0x037A, { 0x0020, 0x03B9, 0 } },
        { 0x03D2, { 0x03C5, 0 } },
        { 0x20A8, { 0x0072, 0x0073, 0 } },