diff --git a/icu4c/source/common/unormimp.h b/icu4c/source/common/unormimp.h index 999fc2195a8..966d1f42662 100644 --- a/icu4c/source/common/unormimp.h +++ b/icu4c/source/common/unormimp.h @@ -75,7 +75,8 @@ enum { /* value constants for auxTrie */ enum { _NORM_AUX_COMP_EX_SHIFT=10, - _NORM_AUX_UNSAFE_SHIFT=11 + _NORM_AUX_UNSAFE_SHIFT=11, + _NORM_AUX_NFC_SKIPPABLE_F_SHIFT=12 }; #define _NORM_AUX_MAX_FNC ((int32_t)1<<_NORM_AUX_COMP_EX_SHIFT) @@ -83,6 +84,7 @@ enum { #define _NORM_AUX_FNC_MASK (uint32_t)(_NORM_AUX_MAX_FNC-1) #define _NORM_AUX_COMP_EX_MASK ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT) #define _NORM_AUX_UNSAFE_MASK ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT) +#define _NORM_AUX_NFC_SKIP_F_MASK ((uint32_t)1<<_NORM_AUX_NFC_SKIPPABLE_F_SHIFT) /* canonStartSets[0..31] contains indexes for what is in the array */ enum { @@ -312,11 +314,27 @@ U_CAPI UBool U_EXPORT2 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet); /** - * Description of the format of unorm.dat version 2.1. + * Is c an NF-skippable code point? See unormimp.h. + * @internal + */ +U_CAPI UBool U_EXPORT2 +unorm_isNFSkippable(UChar32 c, UNormalizationMode mode); + +/** + * Enumerate each normalization data trie and add the + * start of each range of same properties to the set. + * @internal + */ +U_CAPI void U_EXPORT2 +unorm_addPropertyStarts(USet *set); + +/** + * Description of the format of unorm.dat version 2.2. * * Main change from version 1 to version 2: * Use of new, common UTrie instead of normalization-specific tries. * Change to version 2.1: add third/auxiliary trie with associated data. + * Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK). * * For more details of how to use the data structures see the code * in unorm.cpp (runtime normalization code) and @@ -520,7 +538,8 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet); * * The auxiliary 16-bit trie contains data for additional properties. 
* Bits - * 15..12 reserved (for skippable flags, see NormalizerTransliterator) + * 15..13 reserved + * 12 not NFC_Skippable (f) (formatVersion>=2.2) * 11 flag: not a safe starter for canonical closure * 10 composition exclusion * 9.. 0 index into extraData[] to FC_NFKC_Closure string @@ -541,6 +560,29 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet); * ++s; * } * + * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable: + * (used in NormalizerTransliterator) + * + * A skippable character is + * a) unassigned, or ALL of the following: + * b) of combining class 0. + * c) not decomposed by this normalization form. + * AND if NFC or NFKC, + * d) can never compose with a previous character. + * e) can never compose with a following character. + * f) can never change if another character is added. + * Example: a-breve might satisfy all but f, but if you + * add an ogonek it changes to a-ogonek + breve + * + * a)..e) must be tested from norm32. + * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built + * into the auxiliary trie. + * The same bit is used for NFC and NFKC; (c) differs for them. + * As usual, we build the "not skippable" flags so that unassigned + * code points get a 0 bit. + * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well. + * Test Hangul LV syllables entirely in code. 
+ * * - structure inside canonStartSets[] * diff --git a/icu4c/source/tools/gennorm/store.c b/icu4c/source/tools/gennorm/store.c index d4218373d31..ca710b2941c 100644 --- a/icu4c/source/tools/gennorm/store.c +++ b/icu4c/source/tools/gennorm/store.c @@ -55,8 +55,8 @@ static UDataInfo dataInfo={ 0, { 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */ - { 2, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ - { 3, 1, 0, 0 } /* dataVersion (Unicode version) */ + { 2, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ + { 3, 2, 0, 0 } /* dataVersion (Unicode version) */ }; extern void @@ -155,6 +155,7 @@ typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm); static UNewTrie normTrie={ {0},0,0,0,0,0,0,0,0,{0} }, + norm32Trie={ {0},0,0,0,0,0,0,0,0,{0} }, fcdTrie={ {0},0,0,0,0,0,0,0,0,{0} }, auxTrie={ {0},0,0,0,0,0,0,0,0,{0} }; @@ -168,10 +169,29 @@ static Norm *norms; */ static uint32_t haveSeenFlags[256]; +/* see addCombiningCP() for details */ static uint32_t combiningCPs[2000]; + +/* + * after processCombining() this contains for each code point in combiningCPs[] + * the runtime combining index + */ static uint16_t combiningIndexes[2000]; + +/* section limits for combiningCPs[], see addCombiningCP() */ static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0; +/** + * Structure for a triple of code points, stored in combiningTriplesMem. + * The lead and trail code points combine into the combined one, + * i.e., there is a canonical decomposition of combined-> <lead, trail>. + * + * Before processCombining() is called, leadIndex and trailIndex are 0. + * After processCombining(), they contain the indexes of the lead and trail + * code point in the combiningCPs[] array. + * The triples are then sorted by leadIndex, then trailIndex. + * They are not sorted by code points. 
+ */ typedef struct CombiningTriple { uint16_t leadIndex, trailIndex; uint32_t lead, trail, combined; @@ -312,6 +332,24 @@ setHaveSeenString(const uint32_t *s, int32_t length) { /* handle combining data ---------------------------------------------------- */ +/* + * Insert an entry into combiningCPs[] for the new code point code with its flags. + * The flags indicate if code combines forward, backward, or both. + * + * combiningCPs[] contains three sections: + * 1. code points that combine forward + * 2. code points that combine forward and backward + * 3. code points that combine backward + * + * Search for code in the entire array. + * If it is found and already is in the right section (old flags==new flags) + * then we are done. + * If it is found but the flags are different, then remove it, + * union the old and new flags, and reinsert it into its correct section. + * If it is not found, then just insert it. + * + * Within each section, the code points are not sorted. + */ static void addCombiningCP(uint32_t code, uint8_t flags) { uint32_t newEntry; @@ -370,6 +408,12 @@ addCombiningCP(uint32_t code, uint8_t flags) { ++combineBackTop; } +/** + * Find the index in combiningCPs[] where code point code is stored. + * @param code code point to look for + * @param isLead is code a forward combining code point? + * @return index in combiningCPs[] where code is stored + */ static uint16_t findCombiningCP(uint32_t code, UBool isLead) { uint16_t i, limit; @@ -1161,7 +1205,7 @@ makeAll32() { norms[i].value32=make32BitNorm(norms+i); } - pNormData=utrie_getData(&normTrie, &normLength); + pNormData=utrie_getData(&norm32Trie, &normLength); count=0; for(i=0; icanonStart); + c=usetContainsOne(norm->canonStart); /* ### why? 
*/ /* add an entry to the BMP or supplementary search table */ if(code<=0xffff) { @@ -1251,7 +1295,7 @@ makeCanonSetFn(void *context, uint32_t code, Norm *norm) { if(c>=0) { /* single-code point result for supplementary code point */ - table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00)); + table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00)); /* ### how does this work again? */ table[tableLength++]=(uint16_t)c; } else { table[tableLength++]=(uint16_t)canonStartSetsTop; @@ -1281,6 +1325,219 @@ makeCanonSetFn(void *context, uint32_t code, Norm *norm) { } } +/* for getSkippableFlags ---------------------------------------------------- */ + +/* combine the lead and trail code points; return <0 if they do not combine */ +static int32_t +combine(uint32_t lead, uint32_t trail) { + CombiningTriple *triples; + uint32_t i, count; + + /* search for all triples with c as lead code point */ + triples=utm_getStart(combiningTriplesMem); + count=combiningTriplesMem->index; + + /* triples are not sorted by code point but for each lead CP there is one contiguous block */ + for(i=0; i1 && cc[%ld], U+%04lx, %u)\n", + s[0], s[1], length, c, cc); + exit(U_INTERNAL_PROGRAM_ERROR); + } + } + + /* try to combine/consume c, return TRUE if it is consumed */ + return combine((uint32_t)starter, c)>=0; +} + +/* does the starter s[0] combine forward with another char that is below trailCC? */ +static UBool +canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) { + if(trailCC<=1) { + /* no character will combine ahead of the trailing char of the decomposition */ + return FALSE; + } + + /* + * We are only checking skippable condition (f). + * Therefore, the original character does not have quick check flag NFC_NO (c), + * i.e., the decomposition recomposes completely back into the original code point. + * So s[0] must be a true starter with cc==0 and + * combining with following code points. 
+ * + * Similarly, length==1 is not possible because that would be a singleton + * decomposition which is marked with NFC_NO and does not pass (c). + * + * Only a character with cc=trailCC would order after decomposition s[], + * composition would consume all of the decomposition, and here we know that + * the original char passed check d), i.e., it does not combine forward, + * therefore does not combine with anything after the decomposition is consumed. + * + * Now see if there is a character that + * 1. combines backward + * 2. has cc2 is a little harder: + * + * Since we will get different starters during recomposition, we need to + * enumerate each backward-combining character (1.) + * with ccindex; + c=s[0]; + + /* triples are not sorted by code point but for each lead CP there is one contiguous block */ + for(i=0; i0 && cc0 && ccspecialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) { + return 0; + } + + /* ### check other data generation functions whether they should & do ignore Hangul/Jamo specials */ + + /* + * Note: + * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not. + * + * This means that (a)..(e) must always be derived from the runtime norm32 value, + * and (f) be checked from the auxTrie if the character is skippable per (a)..(e), + * the form is NF*C and there is a canonical decomposition (NFD_NO). 
+ * + * (a) unassigned code points get "not skippable"==false because they + * don't have a Norm struct so they won't get here + */ + + /* (b) not skippable if cc!=0 */ + if(norm->udataCC!=0) { + return 0; /* non-zero flag for (f) only */ + } + + /* + * not NFC_Skippable if + * (c) quick check flag == NO or + * (d) combines forward or + * (e) combines back or + * (f) can change if another character is added + * + * for (f): + * For NF*C: Get corresponding decomposition, get its last starter (cc==0), + * check its composition list, + * see if any of the second code points in the list + * has cc less than the trailCC of the decomposition. + * + * For FCC: Test at runtime if the decomposition has a trailCC>1 + * -> there are characters with cc==1, they would order before the trail char + * and prevent contiguous combination with the trail char. + */ + if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 || + (norm->combiningFlags&3)!=0) { + return 0; /* non-zero flag for (f) only */ + } + if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) { + return _NORM_AUX_NFC_SKIP_F_MASK; + } + + return 0; /* skippable */ +} + static void makeAux() { Norm *norm; @@ -1302,6 +1559,8 @@ makeAux() { if(norm->unsafeStart || norm->udataCC!=0) { pData[i]|=_NORM_AUX_UNSAFE_MASK; } + + pData[i]|=getSkippableFlags(norm); } } @@ -1430,8 +1689,9 @@ processData() { /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) 
*/ enumTrie(makeCanonSetFn, NULL); - /* clone the normalization trie to make the FCD trie */ - if( NULL==utrie_clone(&fcdTrie, &normTrie, NULL, 0) || + /* clone the normalization builder trie to make the final data tries */ + if( NULL==utrie_clone(&norm32Trie, &normTrie, NULL, 0) || + NULL==utrie_clone(&fcdTrie, &normTrie, NULL, 0) || NULL==utrie_clone(&auxTrie, &normTrie, NULL, 0) ) { fprintf(stderr, "error: unable to clone the normalization trie\n"); @@ -1469,7 +1729,7 @@ generateData(const char *dataDir) { UErrorCode errorCode=U_ZERO_ERROR; int32_t size, normTrieSize, fcdTrieSize, auxTrieSize, dataLength; - normTrieSize=utrie_serialize(&normTrie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode); + normTrieSize=utrie_serialize(&norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode)); exit(errorCode); @@ -1595,6 +1855,7 @@ cleanUpData(void) { utm_close(extraMem); utm_close(combiningTriplesMem); utrie_close(&normTrie); + utrie_close(&norm32Trie); utrie_close(&fcdTrie); utrie_close(&auxTrie); }