ICU-1785 build skippables data; formatVersion 2.2

X-SVN-Rev: 10152
2025-04-06 22:15:31 +00:00 · 2002-11-05 00:56:25 +00:00 · 2002-11-05 00:56:25 +00:00 · 7f33a69caf
commit 7f33a69caf
parent c22abf74ad
2 changed files with 315 additions and 12 deletions
--- a/icu4c/source/common/unormimp.h
+++ b/icu4c/source/common/unormimp.h
@ -75,7 +75,8 @@ enum {
 /* value constants for auxTrie */
 enum {
    _NORM_AUX_COMP_EX_SHIFT=10,
-    _NORM_AUX_UNSAFE_SHIFT=11
+    _NORM_AUX_UNSAFE_SHIFT=11,
+    _NORM_AUX_NFC_SKIPPABLE_F_SHIFT=12
 };

 #define _NORM_AUX_MAX_FNC           ((int32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
@ -83,6 +84,7 @@ enum {
 #define _NORM_AUX_FNC_MASK          (uint32_t)(_NORM_AUX_MAX_FNC-1)
 #define _NORM_AUX_COMP_EX_MASK      ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
 #define _NORM_AUX_UNSAFE_MASK       ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
+#define _NORM_AUX_NFC_SKIP_F_MASK   ((uint32_t)1<<_NORM_AUX_NFC_SKIPPABLE_F_SHIFT)

 /* canonStartSets[0..31] contains indexes for what is in the array */
 enum {
@ -312,11 +314,27 @@ U_CAPI UBool U_EXPORT2
 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);

 /**
- * Description of the format of unorm.dat version 2.1.
+ * Is c an NF<mode>-skippable code point? See unormimp.h.
+ * @internal
+ */
+U_CAPI UBool U_EXPORT2
+unorm_isNFSkippable(UChar32 c, UNormalizationMode mode);
+
+/**
+ * Enumerate each normalization data trie and add the
+ * start of each range of same properties to the set.
+ * @internal
+ */
+U_CAPI void U_EXPORT2
+unorm_addPropertyStarts(USet *set);
+
+/**
+ * Description of the format of unorm.dat version 2.2.
 *
 * Main change from version 1 to version 2:
 * Use of new, common UTrie instead of normalization-specific tries.
 * Change to version 2.1: add third/auxiliary trie with associated data.
+ * Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK).
 *
 * For more details of how to use the data structures see the code
 * in unorm.cpp (runtime normalization code) and
@ -520,7 +538,8 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
 *
 * The auxiliary 16-bit trie contains data for additional properties.
 * Bits
- * 15..12   reserved (for skippable flags, see NormalizerTransliterator)
+ * 15..13   reserved
+ *     12   not NFC_Skippable (f) (formatVersion>=2.2)
 *     11   flag: not a safe starter for canonical closure
 *     10   composition exclusion
 *  9.. 0   index into extraData[] to FC_NFKC_Closure string
@ -541,6 +560,29 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
 *     ++s;
 *   }
 *
+ * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
+ * (used in NormalizerTransliterator)
+ *
+ * A skippable character is
+ * a) unassigned, or ALL of the following:
+ * b) of combining class 0.
+ * c) not decomposed by this normalization form.
+ * AND if NFC or NFKC,
+ * d) can never compose with a previous character.
+ * e) can never compose with a following character.
+ * f) can never change if another character is added.
+ *    Example: a-breve might satisfy all but f, but if you
+ *    add an ogonek it changes to a-ogonek + breve
+ *
+ * a)..e) must be tested from norm32.
+ * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
+ * into the auxiliary trie.
+ * The same bit is used for NFC and NFKC; (c) differs for them.
+ * As usual, we build the "not skippable" flags so that unassigned
+ * code points get a 0 bit.
+ * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
+ * Test Hangul LV syllables entirely in code.
+ *
 *
 * - structure inside canonStartSets[]
 *
--- a/icu4c/source/tools/gennorm/store.c
+++ b/icu4c/source/tools/gennorm/store.c
@ -55,8 +55,8 @@ static UDataInfo dataInfo={
    0,

    { 0x4e, 0x6f, 0x72, 0x6d },   /* dataFormat="Norm" */
-    { 2, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
-    { 3, 1, 0, 0 }                /* dataVersion (Unicode version) */
+    { 2, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
+    { 3, 2, 0, 0 }                /* dataVersion (Unicode version) */
 };

 extern void
@ -155,6 +155,7 @@ typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);

 static UNewTrie
    normTrie={ {0},0,0,0,0,0,0,0,0,{0} },
+    norm32Trie={ {0},0,0,0,0,0,0,0,0,{0} },
    fcdTrie={ {0},0,0,0,0,0,0,0,0,{0} },
    auxTrie={ {0},0,0,0,0,0,0,0,0,{0} };

@ -168,10 +169,29 @@ static Norm *norms;
 */
 static uint32_t haveSeenFlags[256];

+/* see addCombiningCP() for details */
 static uint32_t combiningCPs[2000];
+
+/*
+ * after processCombining() this contains for each code point in combiningCPs[]
+ * the runtime combining index
+ */
 static uint16_t combiningIndexes[2000];
+
+/* section limits for combiningCPs[], see addCombiningCP() */
 static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0;

+/**
+ * Structure for a triple of code points, stored in combiningTriplesMem.
+ * The lead and trail code points combine into the the combined one,
+ * i.e., there is a canonical decomposition of combined-> <lead, trail>.
+ *
+ * Before processCombining() is called, leadIndex and trailIndex are 0.
+ * After processCombining(), they contain the indexes of the lead and trail
+ * code point in the combiningCPs[] array.
+ * They are then sorted by leadIndex, then trailIndex.
+ * They are not sorted by code points.
+ */
 typedef struct CombiningTriple {
    uint16_t leadIndex, trailIndex;
    uint32_t lead, trail, combined;
@ -312,6 +332,24 @@ setHaveSeenString(const uint32_t *s, int32_t length) {

 /* handle combining data ---------------------------------------------------- */

+/*
+ * Insert an entry into combiningCPs[] for the new code point code with its flags.
+ * The flags indicate if code combines forward, backward, or both.
+ *
+ * combiningCPs[] contains three sections:
+ * 1. code points that combine forward
+ * 2. code points that combine forward and backward
+ * 3. code points that combine backward
+ *
+ * Search for code in the entire array.
+ * If it is found and already is in the right section (old flags==new flags)
+ * then we are done.
+ * If it is found but the flags are different, then remove it,
+ * union the old and new flags, and reinsert it into its correct section.
+ * If it is not found, then just insert it.
+ *
+ * Within each section, the code points are not sorted.
+ */
 static void
 addCombiningCP(uint32_t code, uint8_t flags) {
    uint32_t newEntry;
@ -370,6 +408,12 @@ addCombiningCP(uint32_t code, uint8_t flags) {
    ++combineBackTop;
 }

+/**
+ * Find the index in combiningCPs[] where code point code is stored.
+ * @param code code point to look for
+ * @param isLead is code a forward combining code point?
+ * @return index in combiningCPs[] where code is stored
+ */
 static uint16_t
 findCombiningCP(uint32_t code, UBool isLead) {
    uint16_t i, limit;
@ -1161,7 +1205,7 @@ makeAll32() {
        norms[i].value32=make32BitNorm(norms+i);
    }

-    pNormData=utrie_getData(&normTrie, &normLength);
+    pNormData=utrie_getData(&norm32Trie, &normLength);

    count=0;
    for(i=0; i<normLength; ++i) {
@ -1208,7 +1252,7 @@ makeFCD() {
 */
 static int32_t
 usetContainsOne(const USet* set) {
-    if (uset_size(set) == 1) {
+    if (uset_size(set) == 1) { /* ### faster to count ranges and check only range?! */
        UChar32 start, end;
        UErrorCode ec = U_ZERO_ERROR;
        int32_t len = uset_getItem(set, 0, &start, &end, NULL, 0, &ec);
@ -1225,7 +1269,7 @@ makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
        UErrorCode errorCode=U_ZERO_ERROR;

        /* does the set contain exactly one code point? */
-        c=usetContainsOne(norm->canonStart);
+        c=usetContainsOne(norm->canonStart); /* ### why? */

        /* add an entry to the BMP or supplementary search table */
        if(code<=0xffff) {
@ -1251,7 +1295,7 @@ makeCanonSetFn(void *context, uint32_t code, Norm *norm) {

            if(c>=0) {
                /* single-code point result for supplementary code point */
-                table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00));
+                table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00)); /* ### how does this work again? */
                table[tableLength++]=(uint16_t)c;
            } else {
                table[tableLength++]=(uint16_t)canonStartSetsTop;
@ -1281,6 +1325,219 @@ makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
    }
 }

+/* for getSkippableFlags ---------------------------------------------------- */
+
+/* combine the lead and trail code points; return <0 if they do not combine */
+static int32_t
+combine(uint32_t lead, uint32_t trail) {
+    CombiningTriple *triples;
+    uint32_t i, count;
+
+    /* search for all triples with c as lead code point */
+    triples=utm_getStart(combiningTriplesMem);
+    count=combiningTriplesMem->index;
+
+    /* triples are not sorted by code point but for each lead CP there is one contiguous block */
+    for(i=0; i<count && lead!=triples[i].lead; ++i) {}
+
+    /* check each triple for this code point */
+    for(; i<count && lead==triples[i].lead; ++i) {
+        if(trail==triples[i].trail) {
+            return (int32_t)triples[i].combined;
+        }
+    }
+
+    return -1;
+}
+
+/*
+ * Starting from the canonical decomposition s[0..length[ of a single code point,
+ * is the code point c consumed in an NFC/FCC recomposition?
+ *
+ * No need to handle discontiguous composition because that would not consume some
+ * intermediate character, so would not compose back to the original character.
+ * See comments in canChangeWithFollowing().
+ *
+ * No need to compose beyond where c canonically orders because if it is consumed
+ * then the result differs from the original anyway.
+ *
+ * Possible optimization:
+ * - Verify that there are no cases of the same combining mark stacking twice.
+ * - return FALSE right away if c inserts after a copy of itself
+ *   without attempting to recompose; will happen because each mark in
+ *   the decomposition will be enumerated and passed in as c.
+ *   More complicated and fragile though than it is already.
+ *
+ * markus 2002nov04
+ */
+static UBool
+doesComposeConsume(const uint32_t *s, int32_t length, uint32_t c, uint8_t cc) {
+    int32_t starter, i;
+
+    /* ignore trailing characters where cc<prevCC */
+    while(length>1 && cc<getCCFromCP(s[length-1])) {
+        --length;
+    }
+
+    /* start consuming/combining from the beginning */
+    starter=(int32_t)s[0];
+    for(i=1; i<length; ++i) {
+        starter=combine((uint32_t)starter, s[i]);
+        if(starter<0) {
+            fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04lx, %04lx, ...>[%ld], U+%04lx, %u)\n",
+                s[0], s[1], length, c, cc);
+            exit(U_INTERNAL_PROGRAM_ERROR);
+        }
+    }
+
+    /* try to combine/consume c, return TRUE if it is consumed */
+    return combine((uint32_t)starter, c)>=0;
+}
+
+/* does the starter s[0] combine forward with another char that is below trailCC? */
+static UBool
+canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
+    if(trailCC<=1) {
+        /* no character will combine ahead of the trailing char of the decomposition */
+        return FALSE;
+    }
+
+    /*
+     * We are only checking skippable condition (f).
+     * Therefore, the original character does not have quick check flag NFC_NO (c),
+     * i.e., the decomposition recomposes completely back into the original code point.
+     * So s[0] must be a true starter with cc==0 and
+     * combining with following code points.
+     *
+     * Similarly, length==1 is not possible because that would be a singleton
+     * decomposition which is marked with NFC_NO and does not pass (c).
+     *
+     * Only a character with cc<trailCC can change the composition.
+     * Reason: A char with cc>=trailCC would order after decomposition s[],
+     * composition would consume all of the decomposition, and here we know that
+     * the original char passed check d), i.e., it does not combine forward,
+     * therefore does not combine with anything after the decomposition is consumed.
+     *
+     * Now see if there is a character that
+     * 1. combines backward
+     * 2. has cc<trailCC
+     * 3. is consumed in recomposition
+     *
+     * length==2 is simple:
+     *
+     * Characters that fulfill these conditions are exactly the ones that combine directly
+     * with the starter c==s[0] because there is no intervening character after
+     * reordering.
+     * We can just enumerate all chars with which c combines (they all pass 1. and 3.)
+     * and see if one has cc<trailCC (passes 2.).
+     *
+     * length>2 is a little harder:
+     *
+     * Since we will get different starters during recomposition, we need to
+     * enumerate each backward-combining character (1.)
+     * with cc<trailCC (2.) and
+     * see if it gets consumed in recomposition. (3.)
+     * No need to enumerate both-ways combining characters because they must have cc==0.
+     */
+    if(length==2) {
+        /* enumerate all chars that combine with this one and check their cc */
+        CombiningTriple *triples;
+        uint32_t c, i, count;
+        uint8_t cc;
+
+        /* search for all triples with c as lead code point */
+        triples=utm_getStart(combiningTriplesMem);
+        count=combiningTriplesMem->index;
+        c=s[0];
+
+        /* triples are not sorted by code point but for each lead CP there is one contiguous block */
+        for(i=0; i<count && c!=triples[i].lead; ++i) {}
+
+        /* check each triple for this code point */
+        for(; i<count && c==triples[i].lead; ++i) {
+            cc=getCCFromCP(triples[i].trail);
+            if(cc>0 && cc<trailCC) {
+                /* this trail code point combines with c and has cc<trailCC */
+                return TRUE;
+            }
+        }
+    } else {
+        /* enumerate all chars that combine backward */
+        uint32_t c2;
+        uint16_t i;
+        uint8_t cc;
+
+        for(i=combineBothTop; i<combineBackTop; ++i) {
+            c2=combiningCPs[i]&0xffffff;
+            cc=getCCFromCP(c2);
+            /* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
+            if(cc>0 && cc<trailCC && doesComposeConsume(s, length-1, c2, cc)) {
+                return TRUE;
+            }
+        }
+    }
+
+    /* this decomposition is not modified by any appended character */
+    return FALSE;
+}
+
+/* see unormimp.h for details on NF*C Skippable flags */
+static uint32_t
+getSkippableFlags(const Norm *norm) {
+    /* ignore NF*D skippable properties because they are covered by norm32, test at runtime */
+
+    /* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
+    if(norm->specialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) {
+        return 0;
+    }
+
+    /* ### check other data generation functions whether they should & do ignore Hangul/Jamo specials */
+
+    /*
+     * Note:
+     * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
+     *
+     * This means that (a)..(e) must always be derived from the runtime norm32 value,
+     * and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
+     * the form is NF*C and there is a canonical decomposition (NFD_NO).
+     *
+     * (a) unassigned code points get "not skippable"==false because they
+     * don't have a Norm struct so they won't get here
+     */
+
+    /* (b) not skippable if cc!=0 */
+    if(norm->udataCC!=0) {
+        return 0; /* non-zero flag for (f) only */
+    }
+
+    /*
+     * not NFC_Skippable if
+     * (c) quick check flag == NO  or
+     * (d) combines forward  or
+     * (e) combines back or
+     * (f) can change if another character is added
+     *
+     * for (f):
+     * For NF*C: Get corresponding decomposition, get its last starter (cc==0),
+     *           check its composition list,
+     *           see if any of the second code points in the list
+     *           has cc less than the trailCC of the decomposition.
+     *
+     * For FCC: Test at runtime if the decomposition has a trailCC>1
+     *          -> there are characters with cc==1, they would order before the trail char
+     *          and prevent contiguous combination with the trail char.
+     */
+    if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 ||
+        (norm->combiningFlags&3)!=0) {
+        return 0; /* non-zero flag for (f) only */
+    }
+    if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) {
+        return _NORM_AUX_NFC_SKIP_F_MASK;
+    }
+
+    return 0; /* skippable */
+}
+
 static void
 makeAux() {
    Norm *norm;
@ -1302,6 +1559,8 @@ makeAux() {
        if(norm->unsafeStart || norm->udataCC!=0) {
            pData[i]|=_NORM_AUX_UNSAFE_MASK;
        }
+
+        pData[i]|=getSkippableFlags(norm);
    }
 }

@ -1430,8 +1689,9 @@ processData() {
    /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
    enumTrie(makeCanonSetFn, NULL);

-    /* clone the normalization trie to make the FCD trie */
-    if( NULL==utrie_clone(&fcdTrie, &normTrie, NULL, 0) ||
+    /* clone the normalization builder trie to make the final data tries */
+    if( NULL==utrie_clone(&norm32Trie, &normTrie, NULL, 0) ||
+        NULL==utrie_clone(&fcdTrie, &normTrie, NULL, 0) ||
        NULL==utrie_clone(&auxTrie, &normTrie, NULL, 0)
    ) {
        fprintf(stderr, "error: unable to clone the normalization trie\n");
@ -1469,7 +1729,7 @@ generateData(const char *dataDir) {
    UErrorCode errorCode=U_ZERO_ERROR;
    int32_t size, normTrieSize, fcdTrieSize, auxTrieSize, dataLength;

-    normTrieSize=utrie_serialize(&normTrie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
+    normTrieSize=utrie_serialize(&norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode));
        exit(errorCode);
@ -1595,6 +1855,7 @@ cleanUpData(void) {
    utm_close(extraMem);
    utm_close(combiningTriplesMem);
    utrie_close(&normTrie);
+    utrie_close(&norm32Trie);
    utrie_close(&fcdTrie);
    utrie_close(&auxTrie);
 }