mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-1754 store data for canonical equivalence more compactly
X-SVN-Rev: 8034
This commit is contained in:
parent
02228a0689
commit
facb0c82f7
3 changed files with 265 additions and 75 deletions
|
@ -154,14 +154,10 @@ getFoldingNormOffset(uint32_t norm32) {
|
|||
}
|
||||
}
|
||||
|
||||
/* auxTrie: if bit 31 is set, then the folding offset is in bits 29..20 of the 32-bit trie result */
|
||||
/* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
|
||||
static int32_t U_CALLCONV
|
||||
getFoldingAuxOffset(uint32_t data) {
|
||||
if((int32_t)data<0) {
|
||||
return (int32_t)(data&_NORM_AUX_FNC_MASK)>>(_NORM_AUX_FNC_SHIFT-UTRIE_SURROGATE_BLOCK_BITS);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS;
|
||||
}
|
||||
|
||||
static UBool U_CALLCONV
|
||||
|
@ -543,10 +539,10 @@ U_CAPI UBool U_EXPORT2
|
|||
unorm_internalIsFullCompositionExclusion(UChar32 c) {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
if(_haveData(errorCode) && formatVersion_2_1) {
|
||||
uint32_t aux32;
|
||||
uint16_t aux;
|
||||
|
||||
UTRIE_GET32(&auxTrie, c, aux32);
|
||||
return (UBool)((aux32&_NORM_AUX_COMP_EX_MASK)!=0);
|
||||
UTRIE_GET16(&auxTrie, c, aux);
|
||||
return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
|
||||
} else {
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -556,10 +552,10 @@ U_CAPI UBool U_EXPORT2
|
|||
unorm_isCanonSafeStart(UChar32 c) {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
if(_haveData(errorCode) && formatVersion_2_1) {
|
||||
uint32_t aux32;
|
||||
uint16_t aux;
|
||||
|
||||
UTRIE_GET32(&auxTrie, c, aux32);
|
||||
return (UBool)((aux32&_NORM_AUX_UNSAFE_MASK)==0);
|
||||
UTRIE_GET16(&auxTrie, c, aux);
|
||||
return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
|
||||
} else {
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -568,18 +564,94 @@ unorm_isCanonSafeStart(UChar32 c) {
|
|||
U_CAPI UBool U_EXPORT2
|
||||
unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
if(fillSet!=NULL && _haveData(errorCode) && canonStartSets!=NULL) {
|
||||
uint32_t aux32;
|
||||
if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
|
||||
_haveData(errorCode) && canonStartSets!=NULL
|
||||
) {
|
||||
const uint16_t *table;
|
||||
int32_t i, start, limit;
|
||||
|
||||
UTRIE_GET32(&auxTrie, c, aux32);
|
||||
aux32&=_NORM_AUX_CANON_SET_MASK;
|
||||
return aux32!=0 &&
|
||||
uset_getSerializedSet(fillSet,
|
||||
canonStartSets+aux32,
|
||||
indexes[_NORM_INDEX_CANON_SET_COUNT]-aux32);
|
||||
} else {
|
||||
return FALSE;
|
||||
/*
|
||||
* binary search for c
|
||||
*
|
||||
* There are two search tables,
|
||||
* one for BMP code points and one for supplementary ones.
|
||||
* See unormimp.h for details.
|
||||
*/
|
||||
if(c<=0xffff) {
|
||||
table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
|
||||
start=0;
|
||||
limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
|
||||
|
||||
/* each entry is a pair { c, result } */
|
||||
while(start<limit-2) {
|
||||
i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
|
||||
if(c<table[i]) {
|
||||
limit=i;
|
||||
} else {
|
||||
start=i;
|
||||
}
|
||||
}
|
||||
|
||||
/* found? */
|
||||
if(c==table[start]) {
|
||||
i=table[start+1];
|
||||
if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
|
||||
/* result 01xxxxxx xxxxxxxx contains index x to a USerializedSet */
|
||||
i&=(_NORM_MAX_CANON_SETS-1);
|
||||
return uset_getSerializedSet(fillSet,
|
||||
canonStartSets+i,
|
||||
canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
|
||||
} else {
|
||||
/* other result values are BMP code points for single-code point sets */
|
||||
uset_setSerializedToOne(fillSet, (UChar32)i);
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uint16_t high, low, h;
|
||||
|
||||
table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
|
||||
canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
|
||||
start=0;
|
||||
limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
|
||||
|
||||
high=(uint16_t)(c>>16);
|
||||
low=(uint16_t)c;
|
||||
|
||||
/* each entry is a triplet { high(c), low(c), result } */
|
||||
while(start<limit-3) {
|
||||
i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
|
||||
h=table[i]&0x1f; /* high word */
|
||||
if(high<h || (high==h && low<table[i+1])) {
|
||||
limit=i;
|
||||
} else {
|
||||
start=i;
|
||||
}
|
||||
}
|
||||
|
||||
/* found? */
|
||||
h=table[start];
|
||||
if(high==(h&0x1f) && low==table[start+1]) {
|
||||
i=table[start+2];
|
||||
if((h&0x8000)==0) {
|
||||
/* the result is an index to a USerializedSet */
|
||||
return uset_getSerializedSet(fillSet,
|
||||
canonStartSets+i,
|
||||
canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
|
||||
} else {
|
||||
/*
|
||||
* single-code point set {x} in
|
||||
* triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
|
||||
*/
|
||||
i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
|
||||
uset_setSerializedToOne(fillSet, (UChar32)i);
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return FALSE; /* not found */
|
||||
}
|
||||
|
||||
/* reorder UTF-16 in-place -------------------------------------------------- */
|
||||
|
|
|
@ -74,20 +74,33 @@ enum {
|
|||
|
||||
/* value constants for auxTrie */
|
||||
enum {
|
||||
_NORM_AUX_UNSAFE_SHIFT=14,
|
||||
_NORM_AUX_FNC_SHIFT=20,
|
||||
_NORM_AUX_COMP_EX_SHIFT=30,
|
||||
_NORM_AUX_IS_LEAD_SHIFT=31
|
||||
_NORM_AUX_COMP_EX_SHIFT=10,
|
||||
_NORM_AUX_UNSAFE_SHIFT=11
|
||||
};
|
||||
|
||||
#define _NORM_AUX_MAX_CANON_SET ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
|
||||
#define _NORM_AUX_MAX_FNC ((int32_t)1<<(_NORM_AUX_COMP_EX_SHIFT-_NORM_AUX_FNC_SHIFT))
|
||||
#define _NORM_AUX_MAX_FNC ((int32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
|
||||
|
||||
#define _NORM_AUX_CANON_SET_MASK (_NORM_AUX_MAX_CANON_SET-1)
|
||||
#define _NORM_AUX_UNSAFE_MASK ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
|
||||
#define _NORM_AUX_FNC_MASK ((uint32_t)(_NORM_AUX_MAX_FNC-1)<<_NORM_AUX_FNC_SHIFT)
|
||||
#define _NORM_AUX_FNC_MASK (uint32_t)(_NORM_AUX_MAX_FNC-1)
|
||||
#define _NORM_AUX_COMP_EX_MASK ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
|
||||
#define _NORM_AUX_IS_LEAD_MASK ((uint32_t)1<<_NORM_AUX_IS_LEAD_SHIFT)
|
||||
#define _NORM_AUX_UNSAFE_MASK ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
|
||||
|
||||
/* canonStartSets[0..31] contains indexes for what is in the array */
|
||||
enum {
|
||||
_NORM_SET_INDEX_CANON_SETS_LENGTH, /* number of uint16_t in canonical starter sets */
|
||||
_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH, /* number of uint16_t in the BMP search table (contains pairs) */
|
||||
_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH,/* number of uint16_t in the supplementary search table (contains triplets) */
|
||||
|
||||
_NORM_SET_INDEX_TOP=32 /* changing this requires a new formatVersion */
|
||||
};
|
||||
|
||||
/* more constants for canonical starter sets */
|
||||
|
||||
/* 14 bit indexes to canonical USerializedSets */
|
||||
#define _NORM_MAX_CANON_SETS 0x4000
|
||||
|
||||
/* single-code point BMP sets are encoded directly in the search table except if result=0x4000..0x7fff */
|
||||
#define _NORM_CANON_SET_BMP_MASK 0xc000
|
||||
#define _NORM_CANON_SET_BMP_IS_INDEX 0x4000
|
||||
|
||||
/* indexes[] value names */
|
||||
enum {
|
||||
|
@ -298,7 +311,7 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
|
|||
* UTrie auxTrie; -- size in bytes=indexes[_NORM_INDEX_AUX_TRIE_SIZE]
|
||||
*
|
||||
* uint16_t canonStartSets[canonStartSetsTop] -- canonStartSetsTop=indexes[_NORM_INDEX_CANON_SET_COUNT]
|
||||
* serialized USets, see uset.c
|
||||
* serialized USets and binary search tables, see below
|
||||
*
|
||||
*
|
||||
* The indexes array contains lengths and sizes of the following arrays and structures
|
||||
|
@ -470,19 +483,14 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
|
|||
*
|
||||
* - Auxiliary trie and data
|
||||
*
|
||||
* The auxiliary 32-bit trie contains data for additional properties.
|
||||
* The auxiliary 16-bit trie contains data for additional properties.
|
||||
* Bits
|
||||
* 31 set if lead surrogate offset
|
||||
* 30 composition exclusion
|
||||
* 29..20 index into extraData[] to FC_NFKC_Closure string (bit 31==0),
|
||||
* or lead surrogate offset (bit 31==1)
|
||||
* 19..16 skippable flags
|
||||
* 15 reserved
|
||||
* 14 flag: not a safe starter for canonical closure
|
||||
* 13.. 0 index to serialized USet for canonical closure
|
||||
* the set lists the code points whose decompositions start with
|
||||
* the one that this data is for
|
||||
* for how USets are serialized see uset.c
|
||||
* 15..12 reserved (for skippable flags, see NormalizerTransliterator)
|
||||
* 11 flag: not a safe starter for canonical closure
|
||||
* 10 composition exclusion
|
||||
* 9.. 0 index into extraData[] to FC_NFKC_Closure string
|
||||
* (not for lead surrogate),
|
||||
* or lead surrogate offset (for lead surrogate, if 9..0 not zero)
|
||||
*
|
||||
* - FC_NFKC_Closure strings in extraData[]
|
||||
*
|
||||
|
@ -497,6 +505,42 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
|
|||
* length=*s&0xff;
|
||||
* ++s;
|
||||
* }
|
||||
*
|
||||
*
|
||||
* - structure inside canonStartSets[]
|
||||
*
|
||||
* This array maps from code points c to sets of code points (USerializedSet).
|
||||
* The result sets are the code points whose canonical decompositions start
|
||||
* with c.
|
||||
*
|
||||
* canonStartSets[] contains the following sub-arrays:
|
||||
*
|
||||
* indexes[_NORM_SET_INDEX_TOP]
|
||||
* - contains lengths of sub-arrays etc.
|
||||
*
|
||||
* startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP]
|
||||
* - contains serialized sets (USerializedSet) of canonical starters for
|
||||
* enumerating canonically equivalent strings
|
||||
* indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP
|
||||
* for details about the structure see uset.c
|
||||
*
|
||||
* bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]]
|
||||
* - a sorted search table for BMP code points whose results are
|
||||
* either indexes to USerializedSets or single code points for
|
||||
* single-code point sets;
|
||||
* each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx
|
||||
* if yy==01 then there is a USerializedSet at canonStartSets+x
|
||||
* else build a USerializedSet with result as the single code point
|
||||
*
|
||||
* suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]]
|
||||
* - a sorted search table for supplementary code points whose results are
|
||||
* either indexes to USerializedSets or single code points for
|
||||
* single-code point sets;
|
||||
* each entry is a triplet of { high16(cp), low16(cp), result }
|
||||
* each code point's high-word may contain extra data in bits 15..5:
|
||||
* if the high word has bit 15 set, then build a set with a single code point
|
||||
* which is ((high16(cp)&0x1f00)<<8)|result;
|
||||
* else there is a USerializedSet at canonStartSets+result
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
|
|
@ -181,8 +181,10 @@ typedef struct CombiningTriple {
|
|||
static uint16_t combiningTable[0x8000];
|
||||
static uint16_t combiningTableTop=0;
|
||||
|
||||
static uint16_t canonStartSets[_NORM_AUX_MAX_CANON_SET]={ 0 };
|
||||
static int32_t canonStartSetsTop=1;
|
||||
#define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
|
||||
static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH];
|
||||
static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
|
||||
static int32_t canonSetsCount=0;
|
||||
|
||||
extern void
|
||||
init() {
|
||||
|
@ -218,6 +220,9 @@ init() {
|
|||
indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff;
|
||||
indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff;
|
||||
indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
|
||||
|
||||
/* preset the indexes portion of canonStartSets */
|
||||
uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1196,6 +1201,70 @@ makeFCD() {
|
|||
}
|
||||
}
|
||||
|
||||
static void
|
||||
makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
|
||||
if(!uset_isEmpty(norm->canonStart)) {
|
||||
uint16_t *table;
|
||||
int32_t c, tableLength;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
|
||||
/* does the set contain exactly one code point? */
|
||||
c=uset_containsOne(norm->canonStart);
|
||||
|
||||
/* add an entry to the BMP or supplementary search table */
|
||||
if(code<=0xffff) {
|
||||
table=canonStartSets+_NORM_MAX_CANON_SETS;
|
||||
tableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
|
||||
|
||||
table[tableLength++]=(uint16_t)code;
|
||||
|
||||
if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) {
|
||||
/* single-code point BMP result for BMP code point */
|
||||
table[tableLength++]=(uint16_t)c;
|
||||
} else {
|
||||
table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX|canonStartSetsTop);
|
||||
c=-1;
|
||||
}
|
||||
canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]=(uint16_t)tableLength;
|
||||
} else {
|
||||
table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH;
|
||||
tableLength=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
|
||||
|
||||
table[tableLength++]=(uint16_t)(code>>16);
|
||||
table[tableLength++]=(uint16_t)code;
|
||||
|
||||
if(c>=0) {
|
||||
/* single-code point result for supplementary code point */
|
||||
table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00));
|
||||
table[tableLength++]=(uint16_t)c;
|
||||
} else {
|
||||
table[tableLength++]=(uint16_t)canonStartSetsTop;
|
||||
}
|
||||
canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]=(uint16_t)tableLength;
|
||||
}
|
||||
|
||||
if(c<0) {
|
||||
/* write a USerializedSet */
|
||||
++canonSetsCount;
|
||||
canonStartSetsTop+=
|
||||
uset_serialize(norm->canonStart,
|
||||
canonStartSets+canonStartSetsTop,
|
||||
_NORM_MAX_CANON_SETS-canonStartSetsTop,
|
||||
&errorCode);
|
||||
}
|
||||
canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
|
||||
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), canonStartSetsTop);
|
||||
exit(errorCode);
|
||||
}
|
||||
if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) {
|
||||
fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n");
|
||||
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
makeAux() {
|
||||
Norm *norm;
|
||||
|
@ -1208,29 +1277,16 @@ makeAux() {
|
|||
for(i=0; i<length; ++i) {
|
||||
norm=norms+pData[i];
|
||||
/*
|
||||
* 32-bit auxiliary normalization properties
|
||||
* 16-bit auxiliary normalization properties
|
||||
* see unormimp.h
|
||||
*/
|
||||
pData[i]=
|
||||
((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))|
|
||||
(uint32_t)(norm->fncIndex<<_NORM_AUX_FNC_SHIFT);
|
||||
(uint32_t)norm->fncIndex;
|
||||
|
||||
if(norm->unsafeStart || norm->udataCC!=0) {
|
||||
pData[i]|=_NORM_AUX_UNSAFE_MASK;
|
||||
}
|
||||
|
||||
if(!uset_isEmpty(norm->canonStart)) {
|
||||
pData[i]|=(uint32_t)canonStartSetsTop;
|
||||
canonStartSetsTop+=
|
||||
uset_serialize(norm->canonStart,
|
||||
canonStartSets+canonStartSetsTop,
|
||||
_NORM_AUX_MAX_CANON_SET-canonStartSetsTop,
|
||||
&errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), canonStartSetsTop);
|
||||
exit(errorCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1299,9 +1355,9 @@ getFoldedFCDValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
|||
|
||||
/*
|
||||
* folding value for auxiliary data:
|
||||
* set bit 31 and store the offset in bits 29..20
|
||||
* if there is any non-0 entry
|
||||
* or together data bits 30 and 19..0 of all of the 1024 supplementary code points
|
||||
* store the non-zero offset in bits 9..0 (FNC bits)
|
||||
* if there is any non-0 entry;
|
||||
* "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
|
||||
*/
|
||||
static uint32_t U_CALLCONV
|
||||
getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
||||
|
@ -1322,18 +1378,13 @@ getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
|||
}
|
||||
|
||||
if(oredValues!=0) {
|
||||
/* reduce variation of oredValues */
|
||||
if(oredValues&_NORM_AUX_CANON_SET_MASK) {
|
||||
oredValues|=_NORM_AUX_CANON_SET_MASK;
|
||||
}
|
||||
|
||||
/* move the 10 significant offset bits into bits 29..20 */
|
||||
offset=offset<<(_NORM_AUX_FNC_SHIFT-UTRIE_SURROGATE_BLOCK_BITS);
|
||||
/* move the 10 significant offset bits into bits 9..0 */
|
||||
offset>>=UTRIE_SURROGATE_BLOCK_BITS;
|
||||
if(offset>_NORM_AUX_FNC_MASK) {
|
||||
fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n");
|
||||
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
||||
}
|
||||
return (uint32_t)offset|_NORM_AUX_IS_LEAD_MASK|(oredValues&~_NORM_AUX_FNC_MASK);
|
||||
return (uint32_t)offset|(oredValues&~_NORM_AUX_FNC_MASK);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
|
@ -1361,6 +1412,9 @@ processData() {
|
|||
/* add hangul/jamo specials */
|
||||
setHangulJamoSpecials();
|
||||
|
||||
/* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
|
||||
enumTrie(makeCanonSetFn, NULL);
|
||||
|
||||
/* clone the normalization trie to make the FCD trie */
|
||||
if( NULL==utrie_clone(&fcdTrie, &normTrie, NULL, 0) ||
|
||||
NULL==utrie_clone(&auxTrie, &normTrie, NULL, 0)
|
||||
|
@ -1412,12 +1466,27 @@ generateData(const char *dataDir) {
|
|||
exit(errorCode);
|
||||
}
|
||||
|
||||
auxTrieSize=utrie_serialize(&auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, FALSE, &errorCode);
|
||||
auxTrieSize=utrie_serialize(&auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
/* move the parts of canonStartSets[] together into a contiguous block */
|
||||
if(canonStartSetsTop<_NORM_MAX_CANON_SETS) {
|
||||
uprv_memmove(canonStartSets+canonStartSetsTop,
|
||||
canonStartSets+_NORM_MAX_CANON_SETS,
|
||||
canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2);
|
||||
}
|
||||
canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
|
||||
|
||||
if(canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH)) {
|
||||
uprv_memmove(canonStartSets+canonStartSetsTop,
|
||||
canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH,
|
||||
canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2);
|
||||
}
|
||||
canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
|
||||
|
||||
/* make sure that the FCD trie is 4-aligned */
|
||||
if((extraMem->index+combiningTableTop)&1) {
|
||||
combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
|
||||
|
@ -1444,7 +1513,12 @@ generateData(const char *dataDir) {
|
|||
printf("size of combining table %5lu uint16_t\n", combiningTableTop);
|
||||
printf("size of FCD trie %5lu bytes\n", fcdTrieSize);
|
||||
printf("size of auxiliary trie %5lu bytes\n", auxTrieSize);
|
||||
printf("size of canonStartSets %5lu uint16_t\n", canonStartSetsTop);
|
||||
printf("size of canonStartSets[] %5u uint16_t\n", canonStartSetsTop);
|
||||
printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP);
|
||||
printf(" size of sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP);
|
||||
printf(" number of sets %5ld\n", canonSetsCount);
|
||||
printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
|
||||
printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
|
||||
printf("size of " DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue