ICU-1754 store data for canonical equivalence more compactly

X-SVN-Rev: 8034
This commit is contained in:
Markus Scherer 2002-03-14 23:54:09 +00:00
parent 02228a0689
commit facb0c82f7
3 changed files with 265 additions and 75 deletions

View file

@ -154,14 +154,10 @@ getFoldingNormOffset(uint32_t norm32) {
}
}
/* auxTrie: if bit 31 is set, then the folding offset is in bits 29..20 of the 32-bit trie result */
/* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
static int32_t U_CALLCONV
getFoldingAuxOffset(uint32_t data) {
if((int32_t)data<0) {
return (int32_t)(data&_NORM_AUX_FNC_MASK)>>(_NORM_AUX_FNC_SHIFT-UTRIE_SURROGATE_BLOCK_BITS);
} else {
return 0;
}
return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS;
}
static UBool U_CALLCONV
@ -543,10 +539,10 @@ U_CAPI UBool U_EXPORT2
unorm_internalIsFullCompositionExclusion(UChar32 c) {
UErrorCode errorCode=U_ZERO_ERROR;
if(_haveData(errorCode) && formatVersion_2_1) {
uint32_t aux32;
uint16_t aux;
UTRIE_GET32(&auxTrie, c, aux32);
return (UBool)((aux32&_NORM_AUX_COMP_EX_MASK)!=0);
UTRIE_GET16(&auxTrie, c, aux);
return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
} else {
return FALSE;
}
@ -556,10 +552,10 @@ U_CAPI UBool U_EXPORT2
unorm_isCanonSafeStart(UChar32 c) {
UErrorCode errorCode=U_ZERO_ERROR;
if(_haveData(errorCode) && formatVersion_2_1) {
uint32_t aux32;
uint16_t aux;
UTRIE_GET32(&auxTrie, c, aux32);
return (UBool)((aux32&_NORM_AUX_UNSAFE_MASK)==0);
UTRIE_GET16(&auxTrie, c, aux);
return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
} else {
return FALSE;
}
@ -568,18 +564,94 @@ unorm_isCanonSafeStart(UChar32 c) {
U_CAPI UBool U_EXPORT2
unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
UErrorCode errorCode=U_ZERO_ERROR;
if(fillSet!=NULL && _haveData(errorCode) && canonStartSets!=NULL) {
uint32_t aux32;
if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
_haveData(errorCode) && canonStartSets!=NULL
) {
const uint16_t *table;
int32_t i, start, limit;
UTRIE_GET32(&auxTrie, c, aux32);
aux32&=_NORM_AUX_CANON_SET_MASK;
return aux32!=0 &&
uset_getSerializedSet(fillSet,
canonStartSets+aux32,
indexes[_NORM_INDEX_CANON_SET_COUNT]-aux32);
} else {
return FALSE;
/*
* binary search for c
*
* There are two search tables,
* one for BMP code points and one for supplementary ones.
* See unormimp.h for details.
*/
if(c<=0xffff) {
table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
start=0;
limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
/* each entry is a pair { c, result } */
while(start<limit-2) {
i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
if(c<table[i]) {
limit=i;
} else {
start=i;
}
}
/* found? */
if(c==table[start]) {
i=table[start+1];
if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
/* result 01xxxxxx xxxxxxxx contains the 14-bit index x to a USerializedSet */
i&=(_NORM_MAX_CANON_SETS-1);
return uset_getSerializedSet(fillSet,
canonStartSets+i,
canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
} else {
/* other result values are BMP code points for single-code point sets */
uset_setSerializedToOne(fillSet, (UChar32)i);
return TRUE;
}
}
} else {
uint16_t high, low, h;
table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
start=0;
limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
high=(uint16_t)(c>>16);
low=(uint16_t)c;
/* each entry is a triplet { high(c), low(c), result } */
while(start<limit-3) {
i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
h=table[i]&0x1f; /* high word */
if(high<h || (high==h && low<table[i+1])) {
limit=i;
} else {
start=i;
}
}
/* found? */
h=table[start];
if(high==(h&0x1f) && low==table[start+1]) {
i=table[start+2];
if((h&0x8000)==0) {
/* the result is an index to a USerializedSet */
return uset_getSerializedSet(fillSet,
canonStartSets+i,
canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
} else {
/*
* single-code point set {x} in
* triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
*/
i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
uset_setSerializedToOne(fillSet, (UChar32)i);
return TRUE;
}
}
}
}
return FALSE; /* not found */
}
/* reorder UTF-16 in-place -------------------------------------------------- */

View file

@ -74,20 +74,33 @@ enum {
/* value constants for auxTrie */
enum {
_NORM_AUX_UNSAFE_SHIFT=14,
_NORM_AUX_FNC_SHIFT=20,
_NORM_AUX_COMP_EX_SHIFT=30,
_NORM_AUX_IS_LEAD_SHIFT=31
_NORM_AUX_COMP_EX_SHIFT=10,
_NORM_AUX_UNSAFE_SHIFT=11
};
#define _NORM_AUX_MAX_CANON_SET ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
#define _NORM_AUX_MAX_FNC ((int32_t)1<<(_NORM_AUX_COMP_EX_SHIFT-_NORM_AUX_FNC_SHIFT))
#define _NORM_AUX_MAX_FNC ((int32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
#define _NORM_AUX_CANON_SET_MASK (_NORM_AUX_MAX_CANON_SET-1)
#define _NORM_AUX_UNSAFE_MASK ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
#define _NORM_AUX_FNC_MASK ((uint32_t)(_NORM_AUX_MAX_FNC-1)<<_NORM_AUX_FNC_SHIFT)
#define _NORM_AUX_FNC_MASK (uint32_t)(_NORM_AUX_MAX_FNC-1)
#define _NORM_AUX_COMP_EX_MASK ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
#define _NORM_AUX_IS_LEAD_MASK ((uint32_t)1<<_NORM_AUX_IS_LEAD_SHIFT)
#define _NORM_AUX_UNSAFE_MASK ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
/* canonStartSets[0..31] contains indexes for what is in the array */
enum {
    /* number of uint16_t in canonical starter sets */
    _NORM_SET_INDEX_CANON_SETS_LENGTH=0,

    /* number of uint16_t in the BMP search table (contains pairs) */
    _NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH=1,

    /* number of uint16_t in the supplementary search table (contains triplets) */
    _NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH=2,

    /* count of index slots; changing this requires a new formatVersion */
    _NORM_SET_INDEX_TOP=32
};
/* more constants for canonical starter sets */
/*
 * Indexes to canonical USerializedSets have 14 bits,
 * so a set must start below this offset in canonStartSets[].
 */
#define _NORM_MAX_CANON_SETS 0x4000
/*
 * Result words in the BMP search table:
 * the top two bits (_NORM_CANON_SET_BMP_MASK) tag the result.
 * If they equal _NORM_CANON_SET_BMP_IS_INDEX (binary 01), the low 14 bits are
 * an index to a USerializedSet; any other value is itself the single BMP code
 * point of a one-element set.
 * Consequently a single code point in 0x4000..0x7fff cannot be encoded
 * directly and must be stored as a serialized set instead.
 */
#define _NORM_CANON_SET_BMP_MASK 0xc000
#define _NORM_CANON_SET_BMP_IS_INDEX 0x4000
/* indexes[] value names */
enum {
@ -298,7 +311,7 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
* UTrie auxTrie; -- size in bytes=indexes[_NORM_INDEX_AUX_TRIE_SIZE]
*
* uint16_t canonStartSets[canonStartSetsTop] -- canonStartSetsTop=indexes[_NORM_INDEX_CANON_SET_COUNT]
* serialized USets, see uset.c
* serialized USets and binary search tables, see below
*
*
* The indexes array contains lengths and sizes of the following arrays and structures
@ -470,19 +483,14 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
*
* - Auxiliary trie and data
*
* The auxiliary 32-bit trie contains data for additional properties.
* The auxiliary 16-bit trie contains data for additional properties.
* Bits
* 31 set if lead surrogate offset
* 30 composition exclusion
* 29..20 index into extraData[] to FC_NFKC_Closure string (bit 31==0),
* or lead surrogate offset (bit 31==1)
* 19..16 skippable flags
* 15 reserved
* 14 flag: not a safe starter for canonical closure
* 13.. 0 index to serialized USet for canonical closure
* the set lists the code points whose decompositions start with
* the one that this data is for
* for how USets are serialized see uset.c
* 15..12 reserved (for skippable flags, see NormalizerTransliterator)
* 11 flag: not a safe starter for canonical closure
* 10 composition exclusion
* 9.. 0 index into extraData[] to FC_NFKC_Closure string
* (not for lead surrogate),
* or lead surrogate offset (for lead surrogate, if 9..0 not zero)
*
* - FC_NFKC_Closure strings in extraData[]
*
@ -497,6 +505,42 @@ unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
* length=*s&0xff;
* ++s;
* }
*
*
* - structure inside canonStartSets[]
*
* This array maps from code points c to sets of code points (USerializedSet).
* The result sets are the code points whose canonical decompositions start
* with c.
*
* canonStartSets[] contains the following sub-arrays:
*
* indexes[_NORM_SET_INDEX_TOP]
* - contains lengths of sub-arrays etc.
*
* startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP]
* - contains serialized sets (USerializedSet) of canonical starters for
* enumerating canonically equivalent strings
* indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP
* for details about the structure see uset.c
*
* bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]]
* - a sorted search table for BMP code points whose results are
* either indexes to USerializedSets or single code points for
* single-code point sets;
* each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx
* if yy==01 then there is a USerializedSet at canonStartSets+x
* else build a USerializedSet with result as the single code point
*
* suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]]
* - a sorted search table for supplementary code points whose results are
* either indexes to USerializedSets or single code points for
* single-code point sets;
* each entry is a triplet of { high16(cp), low16(cp), result }
* each code point's high-word may contain extra data in bits 15..5:
* if the high word has bit 15 set, then build a set with a single code point
* which is ((high16(cp)&0x1f00)<<8)|result;
* else there is a USerializedSet at canonStartSets+result
*/
#endif

View file

@ -181,8 +181,10 @@ typedef struct CombiningTriple {
static uint16_t combiningTable[0x8000];
static uint16_t combiningTableTop=0;
static uint16_t canonStartSets[_NORM_AUX_MAX_CANON_SET]={ 0 };
static int32_t canonStartSetsTop=1;
#define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH];
static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
static int32_t canonSetsCount=0;
extern void
init() {
@ -218,6 +220,9 @@ init() {
indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff;
indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff;
indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
/* preset the indexes portion of canonStartSets */
uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2);
}
/*
@ -1196,6 +1201,70 @@ makeFCD() {
}
}
/*
 * Trie enumeration callback: record the canonical starter set of one code point.
 *
 * Does nothing if norm->canonStart is empty. Otherwise appends one entry to
 * the BMP search table (a pair, for code<=0xffff) or to the supplementary
 * search table (a triplet). While the data is being built, the two tables are
 * staged inside canonStartSets[] at the fixed offsets _NORM_MAX_CANON_SETS and
 * _NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH.
 *
 * If the set has exactly one code point and the entry format can hold it,
 * that code point is stored directly in the entry. Otherwise the set is
 * serialized at canonStartSets+canonStartSetsTop and the entry stores that index.
 *
 * Exits the program if the search table has no room for another entry
 * or if uset_serialize() reports a failure.
 *
 * @param context unused
 * @param code    the code point whose data is being enumerated
 * @param norm    the data for code; only norm->canonStart is read here
 */
static void
makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
    uint16_t *table;
    int32_t c, tableLength, entryLength, lengthIndex;
    UErrorCode errorCode=U_ZERO_ERROR;

    (void)context;

    if(uset_isEmpty(norm->canonStart)) {
        return;
    }

    /* select the BMP (pairs) or supplementary (triplets) search table */
    if(code<=0xffff) {
        table=canonStartSets+_NORM_MAX_CANON_SETS;
        lengthIndex=_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH;
        entryLength=2;
    } else {
        table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH;
        lengthIndex=_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH;
        entryLength=3;
    }
    tableLength=canonStartSets[lengthIndex];

    /*
     * Check the capacity before writing the entry:
     * writing first would run past the staging area of this table
     * (and, for the supplementary table, past the end of canonStartSets[]).
     */
    if(tableLength+entryLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) {
        fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n");
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }

    /* does the set contain exactly one code point? (c>=0 is treated as that code point) */
    c=uset_containsOne(norm->canonStart);

    if(code<=0xffff) {
        /* pair { code, result } */
        table[tableLength++]=(uint16_t)code;
        if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) {
            /* single-code point BMP result for BMP code point */
            table[tableLength++]=(uint16_t)c;
        } else {
            /* result tagged as an index; the set itself is serialized below */
            table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX|canonStartSetsTop);
            c=-1;
        }
    } else {
        /* triplet { high(code) with flags, low(code), result } */
        uint16_t high=(uint16_t)(code>>16);

        if(c>=0) {
            /*
             * single-code point result for supplementary code point:
             * set bit 15 and keep the upper bits of c in bits 12..8 of the high word
             */
            high|=(uint16_t)(0x8000|((c>>8)&0x1f00));
        }
        table[tableLength++]=high;
        table[tableLength++]=(uint16_t)code;
        if(c>=0) {
            table[tableLength++]=(uint16_t)c;
        } else {
            table[tableLength++]=(uint16_t)canonStartSetsTop;
        }
    }
    canonStartSets[lengthIndex]=(uint16_t)tableLength;

    if(c<0) {
        /* write a USerializedSet */
        ++canonSetsCount;
        canonStartSetsTop+=
            uset_serialize(norm->canonStart,
                           canonStartSets+canonStartSetsTop,
                           _NORM_MAX_CANON_SETS-canonStartSetsTop,
                           &errorCode);
    }
    canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;

    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), canonStartSetsTop);
        exit(errorCode);
    }
}
static void
makeAux() {
Norm *norm;
@ -1208,29 +1277,16 @@ makeAux() {
for(i=0; i<length; ++i) {
norm=norms+pData[i];
/*
* 32-bit auxiliary normalization properties
* 16-bit auxiliary normalization properties
* see unormimp.h
*/
pData[i]=
((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))|
(uint32_t)(norm->fncIndex<<_NORM_AUX_FNC_SHIFT);
(uint32_t)norm->fncIndex;
if(norm->unsafeStart || norm->udataCC!=0) {
pData[i]|=_NORM_AUX_UNSAFE_MASK;
}
if(!uset_isEmpty(norm->canonStart)) {
pData[i]|=(uint32_t)canonStartSetsTop;
canonStartSetsTop+=
uset_serialize(norm->canonStart,
canonStartSets+canonStartSetsTop,
_NORM_AUX_MAX_CANON_SET-canonStartSetsTop,
&errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), canonStartSetsTop);
exit(errorCode);
}
}
}
}
@ -1299,9 +1355,9 @@ getFoldedFCDValue(UNewTrie *trie, UChar32 start, int32_t offset) {
/*
* folding value for auxiliary data:
* set bit 31 and store the offset in bits 29..20
* if there is any non-0 entry
* or together data bits 30 and 19..0 of all of the 1024 supplementary code points
* store the non-zero offset in bits 9..0 (FNC bits)
* if there is any non-0 entry;
* "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
*/
static uint32_t U_CALLCONV
getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
@ -1322,18 +1378,13 @@ getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
}
if(oredValues!=0) {
/* reduce variation of oredValues */
if(oredValues&_NORM_AUX_CANON_SET_MASK) {
oredValues|=_NORM_AUX_CANON_SET_MASK;
}
/* move the 10 significant offset bits into bits 29..20 */
offset=offset<<(_NORM_AUX_FNC_SHIFT-UTRIE_SURROGATE_BLOCK_BITS);
/* move the 10 significant offset bits into bits 9..0 */
offset>>=UTRIE_SURROGATE_BLOCK_BITS;
if(offset>_NORM_AUX_FNC_MASK) {
fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n");
exit(U_INDEX_OUTOFBOUNDS_ERROR);
}
return (uint32_t)offset|_NORM_AUX_IS_LEAD_MASK|(oredValues&~_NORM_AUX_FNC_MASK);
return (uint32_t)offset|(oredValues&~_NORM_AUX_FNC_MASK);
} else {
return 0;
}
@ -1361,6 +1412,9 @@ processData() {
/* add hangul/jamo specials */
setHangulJamoSpecials();
/* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
enumTrie(makeCanonSetFn, NULL);
/* clone the normalization trie to make the FCD trie */
if( NULL==utrie_clone(&fcdTrie, &normTrie, NULL, 0) ||
NULL==utrie_clone(&auxTrie, &normTrie, NULL, 0)
@ -1412,12 +1466,27 @@ generateData(const char *dataDir) {
exit(errorCode);
}
auxTrieSize=utrie_serialize(&auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, FALSE, &errorCode);
auxTrieSize=utrie_serialize(&auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode));
exit(errorCode);
}
/* move the parts of canonStartSets[] together into a contiguous block */
if(canonStartSetsTop<_NORM_MAX_CANON_SETS) {
uprv_memmove(canonStartSets+canonStartSetsTop,
canonStartSets+_NORM_MAX_CANON_SETS,
canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2);
}
canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
if(canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH)) {
uprv_memmove(canonStartSets+canonStartSetsTop,
canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH,
canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2);
}
canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
/* make sure that the FCD trie is 4-aligned */
if((extraMem->index+combiningTableTop)&1) {
combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
@ -1444,7 +1513,12 @@ generateData(const char *dataDir) {
printf("size of combining table %5lu uint16_t\n", combiningTableTop);
printf("size of FCD trie %5lu bytes\n", fcdTrieSize);
printf("size of auxiliary trie %5lu bytes\n", auxTrieSize);
printf("size of canonStartSets %5lu uint16_t\n", canonStartSetsTop);
printf("size of canonStartSets[] %5u uint16_t\n", canonStartSetsTop);
printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP);
printf(" size of sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP);
printf(" number of sets %5ld\n", canonSetsCount);
printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
printf("size of " DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
}