ICU-1785 build skippables data; formatVersion 2.2

X-SVN-Rev: 10152
This commit is contained in:
Markus Scherer 2002-11-05 00:56:25 +00:00
parent c22abf74ad
commit 7f33a69caf
2 changed files with 315 additions and 12 deletions

View file

@ -75,7 +75,8 @@ enum {
/* value constants for auxTrie */
enum {
_NORM_AUX_COMP_EX_SHIFT=10,
_NORM_AUX_UNSAFE_SHIFT=11
_NORM_AUX_UNSAFE_SHIFT=11,
_NORM_AUX_NFC_SKIPPABLE_F_SHIFT=12
};
#define _NORM_AUX_MAX_FNC ((int32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
@ -83,6 +84,7 @@ enum {
#define _NORM_AUX_FNC_MASK (uint32_t)(_NORM_AUX_MAX_FNC-1)
#define _NORM_AUX_COMP_EX_MASK ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
#define _NORM_AUX_UNSAFE_MASK ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
#define _NORM_AUX_NFC_SKIP_F_MASK ((uint32_t)1<<_NORM_AUX_NFC_SKIPPABLE_F_SHIFT)
/* canonStartSets[0..31] contains indexes for what is in the array */
enum {
@ -312,11 +314,27 @@ U_CAPI UBool U_EXPORT2
unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
/**
* Description of the format of unorm.dat version 2.1.
* Is c an NF<mode>-skippable code point? See unormimp.h.
* @internal
*/
U_CAPI UBool U_EXPORT2
unorm_isNFSkippable(UChar32 c, UNormalizationMode mode);
/**
* Enumerate each normalization data trie and add the
* start of each range of same properties to the set.
* @internal
*/
U_CAPI void U_EXPORT2
unorm_addPropertyStarts(USet *set);
/**
* Description of the format of unorm.dat version 2.2.
*
* Main change from version 1 to version 2:
* Use of new, common UTrie instead of normalization-specific tries.
* Change to version 2.1: add third/auxiliary trie with associated data.
* Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK).
*
* For more details of how to use the data structures see the code
* in unorm.cpp (runtime normalization code) and
@ -520,7 +538,8 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
*
* The auxiliary 16-bit trie contains data for additional properties.
* Bits
* 15..12 reserved (for skippable flags, see NormalizerTransliterator)
* 15..13 reserved
* 12 not NFC_Skippable (f) (formatVersion>=2.2)
* 11 flag: not a safe starter for canonical closure
* 10 composition exclusion
* 9.. 0 index into extraData[] to FC_NFKC_Closure string
@ -541,6 +560,29 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
* ++s;
* }
*
* Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
* (used in NormalizerTransliterator)
*
* A skippable character is
* a) unassigned, or ALL of the following:
* b) of combining class 0.
* c) not decomposed by this normalization form.
* AND if NFC or NFKC,
* d) can never compose with a previous character.
* e) can never compose with a following character.
* f) can never change if another character is added.
* Example: a-breve might satisfy all but f, but if you
* add an ogonek it changes to a-ogonek + breve
*
* a)..e) must be tested from norm32.
* Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
* into the auxiliary trie.
* The same bit is used for NFC and NFKC; (c) differs for them.
* As usual, we build the "not skippable" flags so that unassigned
* code points get a 0 bit.
* This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
* Test Hangul LV syllables entirely in code.
*
*
* - structure inside canonStartSets[]
*

View file

@ -55,8 +55,8 @@ static UDataInfo dataInfo={
0,
{ 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
{ 2, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
{ 3, 1, 0, 0 } /* dataVersion (Unicode version) */
{ 2, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
{ 3, 2, 0, 0 } /* dataVersion (Unicode version) */
};
extern void
@ -155,6 +155,7 @@ typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
static UNewTrie
normTrie={ {0},0,0,0,0,0,0,0,0,{0} },
norm32Trie={ {0},0,0,0,0,0,0,0,0,{0} },
fcdTrie={ {0},0,0,0,0,0,0,0,0,{0} },
auxTrie={ {0},0,0,0,0,0,0,0,0,{0} };
@ -168,10 +169,29 @@ static Norm *norms;
*/
static uint32_t haveSeenFlags[256];
/* see addCombiningCP() for details */
static uint32_t combiningCPs[2000];
/*
* after processCombining() this contains for each code point in combiningCPs[]
* the runtime combining index
*/
static uint16_t combiningIndexes[2000];
/* section limits for combiningCPs[], see addCombiningCP() */
static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0;
/**
* Structure for a triple of code points, stored in combiningTriplesMem.
* The lead and trail code points combine into the combined one,
* i.e., there is a canonical decomposition of combined-> <lead, trail>.
*
* Before processCombining() is called, leadIndex and trailIndex are 0.
* After processCombining(), they contain the indexes of the lead and trail
* code point in the combiningCPs[] array.
* They are then sorted by leadIndex, then trailIndex.
* They are not sorted by code points.
*/
typedef struct CombiningTriple {
uint16_t leadIndex, trailIndex;
uint32_t lead, trail, combined;
@ -312,6 +332,24 @@ setHaveSeenString(const uint32_t *s, int32_t length) {
/* handle combining data ---------------------------------------------------- */
/*
* Insert an entry into combiningCPs[] for the new code point code with its flags.
* The flags indicate if code combines forward, backward, or both.
*
* combiningCPs[] contains three sections:
* 1. code points that combine forward
* 2. code points that combine forward and backward
* 3. code points that combine backward
*
* Search for code in the entire array.
* If it is found and already is in the right section (old flags==new flags)
* then we are done.
* If it is found but the flags are different, then remove it,
* union the old and new flags, and reinsert it into its correct section.
* If it is not found, then just insert it.
*
* Within each section, the code points are not sorted.
*/
static void
addCombiningCP(uint32_t code, uint8_t flags) {
uint32_t newEntry;
@ -370,6 +408,12 @@ addCombiningCP(uint32_t code, uint8_t flags) {
++combineBackTop;
}
/**
* Find the index in combiningCPs[] where code point code is stored.
* @param code code point to look for
* @param isLead is code a forward combining code point?
* @return index in combiningCPs[] where code is stored
*/
static uint16_t
findCombiningCP(uint32_t code, UBool isLead) {
uint16_t i, limit;
@ -1161,7 +1205,7 @@ makeAll32() {
norms[i].value32=make32BitNorm(norms+i);
}
pNormData=utrie_getData(&normTrie, &normLength);
pNormData=utrie_getData(&norm32Trie, &normLength);
count=0;
for(i=0; i<normLength; ++i) {
@ -1208,7 +1252,7 @@ makeFCD() {
*/
static int32_t
usetContainsOne(const USet* set) {
if (uset_size(set) == 1) {
if (uset_size(set) == 1) { /* ### faster to count ranges and check only range?! */
UChar32 start, end;
UErrorCode ec = U_ZERO_ERROR;
int32_t len = uset_getItem(set, 0, &start, &end, NULL, 0, &ec);
@ -1225,7 +1269,7 @@ makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
UErrorCode errorCode=U_ZERO_ERROR;
/* does the set contain exactly one code point? */
c=usetContainsOne(norm->canonStart);
c=usetContainsOne(norm->canonStart); /* ### why? */
/* add an entry to the BMP or supplementary search table */
if(code<=0xffff) {
@ -1251,7 +1295,7 @@ makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
if(c>=0) {
/* single-code point result for supplementary code point */
table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00));
table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00)); /* ### how does this work again? */
table[tableLength++]=(uint16_t)c;
} else {
table[tableLength++]=(uint16_t)canonStartSetsTop;
@ -1281,6 +1325,219 @@ makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
}
}
/* for getSkippableFlags ---------------------------------------------------- */
/*
 * Compose a lead code point with a trail code point using the combining triples.
 *
 * @param lead  code point that combines forward
 * @param trail code point that combines backward
 * @return the combined code point, or a negative value if the pair does not combine
 */
static int32_t
combine(uint32_t lead, uint32_t trail) {
    CombiningTriple *triple, *limit;
    uint32_t tripleCount;

    triple=utm_getStart(combiningTriplesMem);
    tripleCount=combiningTriplesMem->index;
    limit=triple+tripleCount;

    /*
     * The triples are not sorted by code point, but all triples that share
     * a lead code point form one contiguous block: skip ahead to that block.
     */
    while(triple<limit && lead!=triple->lead) {
        ++triple;
    }

    /* walk the block for this lead and look for the requested trail */
    while(triple<limit && lead==triple->lead) {
        if(trail==triple->trail) {
            return (int32_t)triple->combined;
        }
        ++triple;
    }

    /* lead and trail do not combine */
    return -1;
}
/*
 * Starting from the canonical decomposition s[0..length[ of a single code point,
 * is the code point c consumed in an NFC/FCC recomposition?
 *
 * No need to handle discontiguous composition because that would not consume some
 * intermediate character, so would not compose back to the original character.
 * See comments in canChangeWithFollowing().
 *
 * No need to compose beyond where c canonically orders because if it is consumed
 * then the result differs from the original anyway.
 *
 * Possible optimization:
 * - Verify that there are no cases of the same combining mark stacking twice.
 * - return FALSE right away if c inserts after a copy of itself
 *   without attempting to recompose; will happen because each mark in
 *   the decomposition will be enumerated and passed in as c.
 * More complicated and fragile though than it is already.
 *
 * markus 2002nov04
 *
 * @param s      canonical decomposition; s[0] is used as the initial starter
 * @param length number of code points in s[]
 * @param c      the code point that is tested for being consumed
 * @param cc     combining class of c
 * @return TRUE if c combines with the result of recomposing the part of s[]
 *         that orders before c
 *
 * Exits the program with U_INTERNAL_PROGRAM_ERROR if the decomposition itself
 * cannot be recomposed with combine().
 */
static UBool
doesComposeConsume(const uint32_t *s, int32_t length, uint32_t c, uint8_t cc) {
    int32_t starter, i;

    /* ignore trailing characters where cc<prevCC */
    while(length>1 && cc<getCCFromCP(s[length-1])) {
        --length;
    }

    /* start consuming/combining from the beginning */
    starter=(int32_t)s[0];
    for(i=1; i<length; ++i) {
        starter=combine((uint32_t)starter, s[i]);
        if(starter<0) {
            /*
             * The arguments are cast to the exact types that the %lx/%ld/%u
             * conversions expect: uint32_t/int32_t are not guaranteed to be
             * long-sized, and a mismatch is undefined behavior in a varargs call.
             * s[1] is valid here because the loop body only runs for length>=2.
             */
            fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04lx, %04lx, ...>[%ld], U+%04lx, %u)\n",
                    (unsigned long)s[0], (unsigned long)s[1], (long)length, (unsigned long)c, (unsigned int)cc);
            exit(U_INTERNAL_PROGRAM_ERROR);
        }
    }

    /* try to combine/consume c, return TRUE if it is consumed */
    return combine((uint32_t)starter, c)>=0;
}
/*
 * Skippable condition (f): can the recomposition of the canonical decomposition
 * s[0..length[ change when another character is appended?
 * In other words, does the starter s[0] combine forward with another character
 * whose combining class is below trailCC?
 *
 * @param s       canonical decomposition of the original code point
 * @param length  number of code points in s[]
 * @param trailCC combining class of the last code point of the decomposition
 * @return TRUE if some appended character can modify the composition
 *
 * Background:
 *
 * Only condition (f) is checked here. The original character therefore does not
 * have the quick check flag NFC_NO (c), i.e., its decomposition recomposes
 * completely back into the original code point. So s[0] must be a true starter
 * with cc==0 that combines with the following code points.
 *
 * For the same reason length==1 is not possible: that would be a singleton
 * decomposition, which is marked with NFC_NO and does not pass (c).
 *
 * Only a character with cc<trailCC can change the composition.
 * A character with cc>=trailCC would order after the decomposition s[],
 * composition would consume all of the decomposition, and the original character
 * passed check (d), i.e., it does not combine forward, so nothing combines
 * after the decomposition is consumed.
 *
 * So we look for a character that
 *   1. combines backward
 *   2. has cc<trailCC
 *   3. is consumed in recomposition
 *
 * length==2:
 *   Such characters are exactly the ones that combine directly with the starter
 *   s[0] because no character intervenes after reordering.
 *   Enumerate all characters with which s[0] combines (they all pass 1. and 3.)
 *   and see if one has cc<trailCC (passes 2.).
 *
 * length>2:
 *   Recomposition goes through different starters, so enumerate each
 *   backward-combining character (1.) with cc<trailCC (2.) and see if it
 *   gets consumed in recomposition (3.).
 *   Both-ways combining characters need not be enumerated because they must
 *   have cc==0.
 */
static UBool
canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
    CombiningTriple *allTriples;
    uint32_t starter, candidate, t, tripleCount;
    uint16_t k;
    uint8_t candidateCC;

    if(trailCC<=1) {
        /* no character will combine ahead of the trailing char of the decomposition */
        return FALSE;
    }

    if(length!=2) {
        /* enumerate the section of combiningCPs[] with backward-only combining chars */
        for(k=combineBothTop; k<combineBackTop; ++k) {
            candidate=combiningCPs[k]&0xffffff;
            candidateCC=getCCFromCP(candidate);
            if(candidateCC==0 || candidateCC>=trailCC) {
                continue;
            }

            /*
             * pass in length-1 because we already know that the candidate
             * will insert before the last character with trailCC
             */
            if(doesComposeConsume(s, length-1, candidate, candidateCC)) {
                return TRUE;
            }
        }

        /* this decomposition is not modified by any appended character */
        return FALSE;
    }

    /* length==2: check the cc of every char that combines with the starter */
    allTriples=utm_getStart(combiningTriplesMem);
    tripleCount=combiningTriplesMem->index;
    starter=s[0];

    /* triples are not sorted by code point but for each lead CP there is one contiguous block */
    t=0;
    while(t<tripleCount && starter!=allTriples[t].lead) {
        ++t;
    }

    /* inspect each triple that has the starter as its lead code point */
    while(t<tripleCount && starter==allTriples[t].lead) {
        candidateCC=getCCFromCP(allTriples[t].trail);
        if(candidateCC>0 && candidateCC<trailCC) {
            /* this trail code point combines with the starter and has cc<trailCC */
            return TRUE;
        }
        ++t;
    }

    /* this decomposition is not modified by any appended character */
    return FALSE;
}
/*
 * Compute the auxTrie "not NFC_Skippable (f)" flag for one code point's Norm data.
 * See unormimp.h for details on NF*C Skippable flags.
 *
 * @param norm the normalization properties of an assigned code point
 * @return _NORM_AUX_NFC_SKIP_F_MASK if conditions (b)..(e) indicate skippable
 *         but condition (f) does not; otherwise 0
 *
 * Notes:
 * - NF*D skippable properties are ignored here because they are covered by
 *   norm32 and are tested at runtime.
 * - Hangul is ignored here and tested at runtime (LV Hangul are not skippable).
 * - A non-zero flag is returned only if (a)..(e) indicate skippable but (f)
 *   does not. This means that (a)..(e) must always be derived from the runtime
 *   norm32 value, and (f) be checked from the auxTrie if the character is
 *   skippable per (a)..(e), the form is NF*C and there is a canonical
 *   decomposition (NFD_NO).
 * - (a) unassigned code points get "not skippable"==false because they
 *   don't have a Norm struct so they won't get here.
 *
 * A character is not NFC_Skippable if
 *   (b) its cc!=0 or
 *   (c) quick check flag == NO or
 *   (d) combines forward or
 *   (e) combines back or
 *   (f) can change if another character is added
 *
 * for (f):
 *   For NF*C: Get corresponding decomposition, get its last starter (cc==0),
 *             check its composition list,
 *             see if any of the second code points in the list
 *             has cc less than the trailCC of the decomposition.
 *
 *   For FCC: Test at runtime if the decomposition has a trailCC>1
 *            -> there are characters with cc==1, they would order before the
 *               trail char and prevent contiguous combination with the trail char.
 */
static uint32_t
getSkippableFlags(const Norm *norm) {
    /* ### check other data generation functions whether they should & do ignore Hangul/Jamo specials */

    /*
     * All of these cases are decided at runtime from norm32, or cannot have (f) set:
     * Hangul specials, (b), (c), (d)/(e), and code points without a canonical
     * decomposition. The flag stays 0 for them: it is non-zero for (f) only.
     */
    if( norm->specialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL ||
        norm->udataCC!=0 ||
        (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 ||
        (norm->combiningFlags&3)!=0 ||
        norm->lenNFD==0
    ) {
        return 0;
    }

    /* (f): the low byte of canonBothCCs is passed as the trailCC of the decomposition */
    if(canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) {
        return _NORM_AUX_NFC_SKIP_F_MASK;
    }

    return 0; /* skippable */
}
static void
makeAux() {
Norm *norm;
@ -1302,6 +1559,8 @@ makeAux() {
if(norm->unsafeStart || norm->udataCC!=0) {
pData[i]|=_NORM_AUX_UNSAFE_MASK;
}
pData[i]|=getSkippableFlags(norm);
}
}
@ -1430,8 +1689,9 @@ processData() {
/* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
enumTrie(makeCanonSetFn, NULL);
/* clone the normalization trie to make the FCD trie */
if( NULL==utrie_clone(&fcdTrie, &normTrie, NULL, 0) ||
/* clone the normalization builder trie to make the final data tries */
if( NULL==utrie_clone(&norm32Trie, &normTrie, NULL, 0) ||
NULL==utrie_clone(&fcdTrie, &normTrie, NULL, 0) ||
NULL==utrie_clone(&auxTrie, &normTrie, NULL, 0)
) {
fprintf(stderr, "error: unable to clone the normalization trie\n");
@ -1469,7 +1729,7 @@ generateData(const char *dataDir) {
UErrorCode errorCode=U_ZERO_ERROR;
int32_t size, normTrieSize, fcdTrieSize, auxTrieSize, dataLength;
normTrieSize=utrie_serialize(&normTrie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
normTrieSize=utrie_serialize(&norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode));
exit(errorCode);
@ -1595,6 +1855,7 @@ cleanUpData(void) {
utm_close(extraMem);
utm_close(combiningTriplesMem);
utrie_close(&normTrie);
utrie_close(&norm32Trie);
utrie_close(&fcdTrie);
utrie_close(&auxTrie);
}