ICU-2021 another fix for canonical caseless match, still prototyping

X-SVN-Rev: 11209
2025-04-14 17:24:01 +00:00 · 2003-03-01 00:13:40 +00:00 · 2003-03-01 00:13:40 +00:00 · 353f8ee0eb
commit 353f8ee0eb
parent e9e4feea24
3 changed files with 226 additions and 121 deletions
--- a/icu4c/source/common/uchar.c
+++ b/icu4c/source/common/uchar.c
@ -32,6 +32,8 @@
 #include "ustr_imp.h"
 #include "uprops.h"

+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
 /* statically loaded Unicode character properties -------------------------- */

 /* MACHINE-GENERATED: Do not edit (see com.ibm.icu.dev.tools.translit.UnicodeSetCloseOver) */
@ -1912,6 +1914,54 @@ u_foldCase(UChar32 c, uint32_t options) {
    return c; /* no mapping - return c itself */
 }

+#if 0
+/* ### TODO Turkic-i case folding prototype, j2021 */
+enum {
+    FOLD_T_LENGTH=3
+};
+
+/*
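+ * Turkic full case foldings, sorted in ascending order of the source UChar.
+ * First UChar is the source, second the default mapping, then the Turkic
+ * mapping (up to FOLD_T_LENGTH UChars; shorter mappings are zero-padded).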
+ */
+static const UChar fold_T[][2+FOLD_T_LENGTH]={
+    { 0x0049, 0x0069, 0x131 },
+    { 0x0069, 0x0069, 0x131, 0x307 },
+    { 0x00cc, 0x00ec, 0x131, 0x300 },
+    { 0x00cd, 0x00ed, 0x131, 0x301 },
+    { 0x00ce, 0x00ee, 0x131, 0x302 },
+    { 0x00cf, 0x00ef, 0x131, 0x308 },
+    { 0x00ec, 0x00ec, 0x131, 0x300 },
+    { 0x00ed, 0x00ed, 0x131, 0x301 },
+    { 0x00ee, 0x00ee, 0x131, 0x302 },
+    { 0x00ef, 0x00ef, 0x131, 0x308 },
+    { 0x0128, 0x0129, 0x131, 0x303 },
+    { 0x0129, 0x0129, 0x131, 0x303 },
+    { 0x012a, 0x012b, 0x131, 0x304 },
+    { 0x012b, 0x012b, 0x131, 0x304 },
+    { 0x012c, 0x012d, 0x131, 0x306 },
+    { 0x012d, 0x012d, 0x131, 0x306 },
+    { 0x012e, 0x012f, 0x131, 0x328 },
+    { 0x012f, 0x012f, 0x131, 0x328 },
+    { 0x0130, 0,      0x131, 0x307 }, /* normal mapping is 0069 0307 */
+    { 0x01cf, 0x01d0, 0x131, 0x30c },
+    { 0x01d0, 0x01d0, 0x131, 0x30c },
+    { 0x0208, 0x0209, 0x131, 0x30f },
+    { 0x0209, 0x0209, 0x131, 0x30f },
+    { 0x020a, 0x020b, 0x131, 0x311 },
+    { 0x020b, 0x020b, 0x131, 0x311 },
+    { 0x1e2c, 0x1e2d, 0x131, 0x330 },
+    { 0x1e2d, 0x1e2d, 0x131, 0x330 },
+    { 0x1e2e, 0x1e2f, 0x131, 0x308, 0x301 },
+    { 0x1e2f, 0x1e2f, 0x131, 0x308, 0x301 },
+    { 0x1ec8, 0x1ec9, 0x131, 0x309 },
+    { 0x1ec9, 0x1ec9, 0x131, 0x309 },
+    { 0x1eca, 0x1ecb, 0x131, 0x323 },
+    { 0x1ecb, 0x1ecb, 0x131, 0x323 }
+};
+#endif
+
 /* internal, see ustr_imp.h */
 U_CAPI int32_t U_EXPORT2
 u_internalFoldCase(UChar32 c,
@ -1966,9 +2016,37 @@ u_internalFoldCase(UChar32 c,
                            dest[1]=0x307;
                        }
                        return 2;
+#if 0
+                        /* ### TODO Turkic-i case folding prototype, j2021 */
+                    } else if(c<=fold_T[LENGTHOF(fold_T)-1][0]) {
+                        for(i=0; i<LENGTHOF(fold_T) && c>=fold_T[i][0]; ++i) {
+                            if(c==fold_T[i][0]) {
+                                result=fold_T[i][1];
+                                break;
+                            }
+                        }
+#endif
                    }
                } else {
                    /* Turkic mappings */
+#if 0
+                    /* ### TODO Turkic-i case folding prototype, j2021 (FIXME: p must advance even when length>=destCapacity, else preflighting over-counts) */
+                    if(c<=fold_T[LENGTHOF(fold_T)-1][0]) {
+                        for(i=0; i<LENGTHOF(fold_T) && c>=fold_T[i][0]; ++i) {
+                            if(c==fold_T[i][0]) {
+                                const UChar *p=&(fold_T[i][2]);
+                                length=0;
+                                while(length<FOLD_T_LENGTH && *p!=0) {
+                                    if(length<destCapacity) {
+                                        dest[length]=*p++;
+                                    }
+                                    ++length;
+                                }
+                                return length;
+                            }
+                        }
+                    }
+#else
                    if(c==0x49) {
                        /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
                        result=0x131;
@ -1976,6 +2054,7 @@ u_internalFoldCase(UChar32 c,
                        /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                        result=0x69;
                    }
+#endif
                }
                /* return c itself because there is no special mapping for it */
                /* goto single; */
--- a/icu4c/source/common/unorm.cpp
+++ b/icu4c/source/common/unorm.cpp
@ -3100,41 +3100,39 @@ unorm_compose(UChar *dest, int32_t destCapacity,
 /**
 * Internal API for normalizing.
 * Does not check for bad input.
+ * Requires _haveData() to be true.
 * @internal
 */
-U_CAPI int32_t U_EXPORT2
+static int32_t
 unorm_internalNormalize(UChar *dest, int32_t destCapacity,
                        const UChar *src, int32_t srcLength,
-                        UNormalizationMode mode, int32_t options,
+                        UNormalizationMode mode, const UnicodeSet *nx,
                        UErrorCode *pErrorCode) {
-    const UnicodeSet *nx;
+    int32_t destLength;
+    uint8_t trailCC;

    switch(mode) {
    case UNORM_NFD:
-        return unorm_decompose(dest, destCapacity,
-                               src, srcLength,
-                               FALSE, options,
-                               pErrorCode);
+        destLength=_decompose(dest, destCapacity,
+                              src, srcLength,
+                              FALSE, nx, trailCC);
+        break;
    case UNORM_NFKD:
-        return unorm_decompose(dest, destCapacity,
-                               src, srcLength,
-                               TRUE, options,
-                               pErrorCode);
+        destLength=_decompose(dest, destCapacity,
+                              src, srcLength,
+                              TRUE, nx, trailCC);
+        break;
    case UNORM_NFC:
-        return unorm_compose(dest, destCapacity,
-                             src, srcLength,
-                             FALSE, options,
-                             pErrorCode);
+        destLength=_compose(dest, destCapacity,
+                            src, srcLength,
+                            FALSE, nx, pErrorCode);
+        break;
    case UNORM_NFKC:
-        return unorm_compose(dest, destCapacity,
-                             src, srcLength,
-                             TRUE, options,
-                             pErrorCode);
+        destLength=_compose(dest, destCapacity,
+                            src, srcLength,
+                            TRUE, nx, pErrorCode);
+        break;
    case UNORM_FCD:
-        nx=getNX(options, *pErrorCode);
-        if(U_FAILURE(*pErrorCode)) {
-            return 0;
-        }
        return unorm_makeFCD(dest, destCapacity,
                             src, srcLength,
                             nx,
@ -3147,11 +3145,41 @@ unorm_internalNormalize(UChar *dest, int32_t destCapacity,
        if(srcLength>0 && srcLength<=destCapacity) {
            uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
        }
-        return u_terminateUChars(dest, destCapacity, srcLength, pErrorCode);
+        destLength=srcLength;
+        break;
    default:
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
+
+    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
+}
+
+/**
+ * Internal API for normalizing.
+ * Does not check for bad input.
+ * @internal
+ */
+U_CAPI int32_t U_EXPORT2
+unorm_internalNormalize(UChar *dest, int32_t destCapacity,
+                        const UChar *src, int32_t srcLength,
+                        UNormalizationMode mode, int32_t options,
+                        UErrorCode *pErrorCode) {
+    const UnicodeSet *nx;
+
+    if(!_haveData(*pErrorCode)) {
+        return 0;
+    }
+
+    nx=getNX(options, *pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
+    return unorm_internalNormalize(dest, destCapacity,
+                                   src, srcLength,
+                                   mode, nx,
+                                   pErrorCode);
 }

 /** Public API for normalizing. */
@ -4300,6 +4328,7 @@ unorm_compare(const UChar *s1, int32_t length1,
    UChar fcd1[300], fcd2[300];
    UChar *d1, *d2;
    const UnicodeSet *nx;
+    UNormalizationMode mode;
    int32_t result;

    /* argument checking */
@ -4341,28 +4370,46 @@ unorm_compare(const UChar *s1, int32_t length1,
     * case-folding preserves the FCD-ness of a string.
     * The outer normalization is then only performed by unorm_cmpEquivFold()
     * when there is a difference.
+     *
+     * Exception: When using the Turkic case-folding option, we do perform
+     * full NFD first. This is because in the Turkic case precomposed characters
+     * with 0049 capital I or 0069 small i fold differently depending on whether
+     * they are first decomposed or not, so an FCD check - a check only for
+     * canonical order - is not sufficient.
     */
+    if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
+        mode=UNORM_NFD;
+        options&=~UNORM_INPUT_IS_FCD;
+    } else {
+        mode=UNORM_FCD;
+    }

    if(!(options&UNORM_INPUT_IS_FCD)) {
        int32_t _len1, _len2;
        UBool isFCD1, isFCD2;

        // check if s1 and/or s2 fulfill the FCD conditions
-        isFCD1=unorm_checkFCD(s1, length1, nx);
-        isFCD2=unorm_checkFCD(s2, length2, nx);
+        isFCD1= UNORM_YES==_quickCheck(s1, length1, mode, TRUE, nx, pErrorCode);
+        isFCD2= UNORM_YES==_quickCheck(s2, length2, mode, TRUE, nx, pErrorCode);
+        if(U_FAILURE(*pErrorCode)) {
+            return 0;
+        }

-        if(!isFCD1 && !isFCD2) {
-            // if both strings need normalization then make them NFD right away and
-            // turn off normalization in the comparison function
-            uint8_t trailCC;
+        /*
+         * ICU 2.4 had a further optimization:
+         * If neither string was in FCD, then they were both NFD'ed,
+         * and the _COMPARE_EQUIV option was turned off.
+         * It is not entirely clear that this is valid with the current
+         * definition of the canonical caseless match.
+         * Therefore, ICU 2.6 removes that optimization.
+         */

-            // fully decompose (NFD) s1 and s2
-
-            _len1=_decompose(fcd1, sizeof(fcd1)/U_SIZEOF_UCHAR,
-                             s1, length1,
-                             FALSE, nx,
-                             trailCC);
-            if(_len1<=(int32_t)(sizeof(fcd1)/U_SIZEOF_UCHAR)) {
+        if(!isFCD1) {
+            _len1=unorm_internalNormalize(fcd1, LENGTHOF(fcd1),
+                                          s1, length1,
+                                          mode, nx,
+                                          pErrorCode);
+            if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
                s1=fcd1;
            } else {
                d1=(UChar *)uprv_malloc(_len1*U_SIZEOF_UCHAR);
@ -4371,20 +4418,26 @@ unorm_compare(const UChar *s1, int32_t length1,
                    goto cleanup;
                }

-                _len1=_decompose(d1, _len1,
-                                 s1, length1,
-                                 FALSE, nx,
-                                 trailCC);
+                *pErrorCode=U_ZERO_ERROR;
+                _len1=unorm_internalNormalize(d1, _len1,
+                                              s1, length1,
+                                              mode, nx,
+                                              pErrorCode);
+                if(U_FAILURE(*pErrorCode)) {
+                    goto cleanup;
+                }

                s1=d1;
            }
            length1=_len1;
+        }

-            _len2=_decompose(fcd2, sizeof(fcd2)/U_SIZEOF_UCHAR,
-                             s2, length2,
-                             FALSE, nx,
-                             trailCC);
-            if(_len2<=(int32_t)(sizeof(fcd2)/U_SIZEOF_UCHAR)) {
+        if(!isFCD2) {
+            _len2=unorm_internalNormalize(fcd2, LENGTHOF(fcd2),
+                                          s2, length2,
+                                          mode, nx,
+                                          pErrorCode);
+            if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
                s2=fcd2;
            } else {
                d2=(UChar *)uprv_malloc(_len2*U_SIZEOF_UCHAR);
@ -4393,85 +4446,22 @@ unorm_compare(const UChar *s1, int32_t length1,
                    goto cleanup;
                }

-                _len2=_decompose(d2, _len2,
-                                 s2, length2,
-                                 FALSE, nx,
-                                 trailCC);
+                *pErrorCode=U_ZERO_ERROR;
+                _len2=unorm_internalNormalize(d2, _len2,
+                                              s2, length2,
+                                              mode, nx,
+                                              pErrorCode);
+                if(U_FAILURE(*pErrorCode)) {
+                    goto cleanup;
+                }

                s2=d2;
            }
            length2=_len2;
-
-            // compare NFD strings
-            options&=~_COMPARE_EQUIV;
-        } else {
-            // if at least one string is already in FCD then only makeFCD the other
-            // and compare for equivalence
-            if(!isFCD1) {
-                _len1=unorm_makeFCD(fcd1, sizeof(fcd1)/U_SIZEOF_UCHAR,
-                                    s1, length1,
-                                    nx,
-                                    pErrorCode);
-                if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
-                    s1=fcd1;
-                } else {
-                    d1=(UChar *)uprv_malloc(_len1*U_SIZEOF_UCHAR);
-                    if(d1==0) {
-                        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-                        goto cleanup;
-                    }
-
-                    *pErrorCode=U_ZERO_ERROR;
-                    _len1=unorm_makeFCD(d1, _len1,
-                                        s1, length1,
-                                        nx,
-                                        pErrorCode);
-                    if(U_FAILURE(*pErrorCode)) {
-                        goto cleanup;
-                    }
-
-                    s1=d1;
-                }
-                length1=_len1;
-            }
-
-            if(!isFCD2) {
-                _len2=unorm_makeFCD(fcd2, sizeof(fcd2)/U_SIZEOF_UCHAR,
-                                    s2, length2,
-                                    nx,
-                                    pErrorCode);
-                if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
-                    s2=fcd2;
-                } else {
-                    d2=(UChar *)uprv_malloc(_len2*U_SIZEOF_UCHAR);
-                    if(d2==0) {
-                        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-                        goto cleanup;
-                    }
-
-                    *pErrorCode=U_ZERO_ERROR;
-                    _len2=unorm_makeFCD(d2, _len2,
-                                        s2, length2,
-                                        nx,
-                                        pErrorCode);
-                    if(U_FAILURE(*pErrorCode)) {
-                        goto cleanup;
-                    }
-
-                    s2=d2;
-                }
-                length2=_len2;
-            }
        }
    }

-    if(U_FAILURE(*pErrorCode)) {
-        // do nothing
-    } else if(!(options&(_COMPARE_EQUIV|U_COMPARE_IGNORE_CASE))) {
-        // compare NFD strings case-sensitive: just use normal comparison
-        result=uprv_strCompare(s1, length1, s2, length2,
-                    FALSE, (UBool)(0!=(options&U_COMPARE_CODE_POINT_ORDER)));
-    } else {
+    if(U_SUCCESS(*pErrorCode)) {
        result=unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
    }

--- a/icu4c/source/data/unidata/CaseFolding.txt
+++ b/icu4c/source/data/unidata/CaseFolding.txt
@ -1,5 +1,5 @@
-# CaseFolding-3.2.0.txt
-# Date: 2002-03-22,20:54:33 GMT [MD]
+# CaseFolding-4.0.0.txt
+# Date: 2003-02-14,16:49:03 GMT [MD]
 #
 # Case Folding Properties
 #
@ -70,6 +70,7 @@
 0058; C; 0078; # LATIN CAPITAL LETTER X
 0059; C; 0079; # LATIN CAPITAL LETTER Y
 005A; C; 007A; # LATIN CAPITAL LETTER Z
+###0069; T; 0131 0307; # LATIN SMALL LETTER I
 00B5; C; 03BC; # MICRO SIGN
 00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE
 00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE
@ -84,9 +85,13 @@
 00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX
 00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS
 00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE
+###00CC; T; 0131 0300; # LATIN CAPITAL LETTER I WITH GRAVE
 00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE
+###00CD; T; 0131 0301; # LATIN CAPITAL LETTER I WITH ACUTE
 00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
+###00CE; T; 0131 0302; # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
 00CF; C; 00EF; # LATIN CAPITAL LETTER I WITH DIAERESIS
+###00CF; T; 0131 0308; # LATIN CAPITAL LETTER I WITH DIAERESIS
 00D0; C; 00F0; # LATIN CAPITAL LETTER ETH
 00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE
 00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE
@ -102,6 +107,10 @@
 00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE
 00DE; C; 00FE; # LATIN CAPITAL LETTER THORN
 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
+###00EC; T; 0131 0300; # LATIN SMALL LETTER I WITH GRAVE
+###00ED; T; 0131 0301; # LATIN SMALL LETTER I WITH ACUTE
+###00EE; T; 0131 0302; # LATIN SMALL LETTER I WITH CIRCUMFLEX
+###00EF; T; 0131 0308; # LATIN SMALL LETTER I WITH DIAERESIS
 0100; C; 0101; # LATIN CAPITAL LETTER A WITH MACRON
 0102; C; 0103; # LATIN CAPITAL LETTER A WITH BREVE
 0104; C; 0105; # LATIN CAPITAL LETTER A WITH OGONEK
@ -123,11 +132,20 @@
 0124; C; 0125; # LATIN CAPITAL LETTER H WITH CIRCUMFLEX
 0126; C; 0127; # LATIN CAPITAL LETTER H WITH STROKE
 0128; C; 0129; # LATIN CAPITAL LETTER I WITH TILDE
+###0128; T; 0131 0303; # LATIN CAPITAL LETTER I WITH TILDE
+###0129; T; 0131 0303; # LATIN SMALL LETTER I WITH TILDE
 012A; C; 012B; # LATIN CAPITAL LETTER I WITH MACRON
+###012A; T; 0131 0304; # LATIN CAPITAL LETTER I WITH MACRON
+###012B; T; 0131 0304; # LATIN SMALL LETTER I WITH MACRON
 012C; C; 012D; # LATIN CAPITAL LETTER I WITH BREVE
+###012C; T; 0131 0306; # LATIN CAPITAL LETTER I WITH BREVE
+###012D; T; 0131 0306; # LATIN SMALL LETTER I WITH BREVE
 012E; C; 012F; # LATIN CAPITAL LETTER I WITH OGONEK
+###012E; T; 0131 0328; # LATIN CAPITAL LETTER I WITH OGONEK
+###012F; T; 0131 0328; # LATIN SMALL LETTER I WITH OGONEK
 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+###0130; T; 0131 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 0132; C; 0133; # LATIN CAPITAL LIGATURE IJ
 0134; C; 0135; # LATIN CAPITAL LETTER J WITH CIRCUMFLEX
 0136; C; 0137; # LATIN CAPITAL LETTER K WITH CEDILLA
@ -212,6 +230,8 @@
 01CB; C; 01CC; # LATIN CAPITAL LETTER N WITH SMALL LETTER J
 01CD; C; 01CE; # LATIN CAPITAL LETTER A WITH CARON
 01CF; C; 01D0; # LATIN CAPITAL LETTER I WITH CARON
+###01CF; T; 0131 030C; # LATIN CAPITAL LETTER I WITH CARON
+###01D0; T; 0131 030C; # LATIN SMALL LETTER I WITH CARON
 01D1; C; 01D2; # LATIN CAPITAL LETTER O WITH CARON
 01D3; C; 01D4; # LATIN CAPITAL LETTER U WITH CARON
 01D5; C; 01D6; # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
@ -242,7 +262,11 @@
 0204; C; 0205; # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE
 0206; C; 0207; # LATIN CAPITAL LETTER E WITH INVERTED BREVE
 0208; C; 0209; # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE
+###0208; T; 0131 030F; # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE
+###0209; T; 0131 030F; # LATIN SMALL LETTER I WITH DOUBLE GRAVE
 020A; C; 020B; # LATIN CAPITAL LETTER I WITH INVERTED BREVE
+###020A; T; 0131 0311; # LATIN CAPITAL LETTER I WITH INVERTED BREVE
+###020B; T; 0131 0311; # LATIN SMALL LETTER I WITH INVERTED BREVE
 020C; C; 020D; # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE
 020E; C; 020F; # LATIN CAPITAL LETTER O WITH INVERTED BREVE
 0210; C; 0211; # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE
@ -318,9 +342,11 @@
 03EE; C; 03EF; # COPTIC CAPITAL LETTER DEI
 03F0; C; 03BA; # GREEK KAPPA SYMBOL
 03F1; C; 03C1; # GREEK RHO SYMBOL
-03F2; C; 03C3; # GREEK LUNATE SIGMA SYMBOL
 03F4; C; 03B8; # GREEK CAPITAL THETA SYMBOL
 03F5; C; 03B5; # GREEK LUNATE EPSILON SYMBOL
+#03F7; C; 03F8; # GREEK CAPITAL LETTER SHO
+#03F9; C; 03F2; # GREEK CAPITAL LUNATE SIGMA SYMBOL
+#03FA; C; 03FB; # GREEK CAPITAL LETTER SAN
 0400; C; 0450; # CYRILLIC CAPITAL LETTER IE WITH GRAVE
 0401; C; 0451; # CYRILLIC CAPITAL LETTER IO
 0402; C; 0452; # CYRILLIC CAPITAL LETTER DJE
@ -510,7 +536,11 @@
 1E28; C; 1E29; # LATIN CAPITAL LETTER H WITH CEDILLA
 1E2A; C; 1E2B; # LATIN CAPITAL LETTER H WITH BREVE BELOW
 1E2C; C; 1E2D; # LATIN CAPITAL LETTER I WITH TILDE BELOW
+###1E2C; T; 0131 0330; # LATIN CAPITAL LETTER I WITH TILDE BELOW
+###1E2D; T; 0131 0330; # LATIN SMALL LETTER I WITH TILDE BELOW
 1E2E; C; 1E2F; # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE
+###1E2E; T; 0131 0308 0301; # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE
+###1E2F; T; 0131 0308 0301; # LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE
 1E30; C; 1E31; # LATIN CAPITAL LETTER K WITH ACUTE
 1E32; C; 1E33; # LATIN CAPITAL LETTER K WITH DOT BELOW
 1E34; C; 1E35; # LATIN CAPITAL LETTER K WITH LINE BELOW
@ -589,7 +619,11 @@
 1EC4; C; 1EC5; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE
 1EC6; C; 1EC7; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW
 1EC8; C; 1EC9; # LATIN CAPITAL LETTER I WITH HOOK ABOVE
+###1EC8; T; 0131 0309; # LATIN CAPITAL LETTER I WITH HOOK ABOVE
+###1EC9; T; 0131 0309; # LATIN SMALL LETTER I WITH HOOK ABOVE
 1ECA; C; 1ECB; # LATIN CAPITAL LETTER I WITH DOT BELOW
+###1ECA; T; 0131 0323; # LATIN CAPITAL LETTER I WITH DOT BELOW
+###1ECB; T; 0131 0323; # LATIN SMALL LETTER I WITH DOT BELOW
 1ECC; C; 1ECD; # LATIN CAPITAL LETTER O WITH DOT BELOW
 1ECE; C; 1ECF; # LATIN CAPITAL LETTER O WITH HOOK ABOVE
 1ED0; C; 1ED1; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE
@ -910,3 +944,5 @@ FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
 10423; C; 1044B; # DESERET CAPITAL LETTER EM
 10424; C; 1044C; # DESERET CAPITAL LETTER EN
 10425; C; 1044D; # DESERET CAPITAL LETTER ENG
+#10426; C; 1044E; # DESERET CAPITAL LETTER OI
+#10427; C; 1044F; # DESERET CAPITAL LETTER EW