ICU-1754 parse/store in unorm.dat Full_Composition_Exclusion and FC_NFKC_Closure

X-SVN-Rev: 7905
This commit is contained in:
Markus Scherer 2002-03-07 19:56:30 +00:00
parent 3da93e7247
commit efe9c23d9f
2 changed files with 119 additions and 18 deletions

View file

@ -110,7 +110,7 @@ static UErrorCode dataErrorCode=U_ZERO_ERROR;
static int8_t haveNormData=0;
static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 };
static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };
/*
* pointers into the memory-mapped unorm.dat
@ -118,6 +118,9 @@ static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 };
static const uint16_t *extraData=NULL,
*combiningTable=NULL;
static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
static UBool formatVersion_2_1=FALSE;
/* the Unicode version of the normalization data */
static UVersionInfo dataVersion={ 3, 1, 0, 0 };
@ -135,6 +138,29 @@ unorm_cleanup() {
return TRUE;
}
/*
 * normTrie folding-offset callback.
 *
 * The 32-bit trie result for a lead surrogate may contain a special
 * extraData index that carries the folding offset.
 * For such a value, the offset field is moved down so that it ends up
 * scaled by 1<<UTRIE_SURROGATE_BLOCK_BITS, limited to 10 bits (0x3ff),
 * and added to UTRIE_BMP_INDEX_LENGTH.
 * For any other value, 0 is returned.
 */
static int32_t U_CALLCONV
getFoldingNormOffset(uint32_t norm32) {
    /* distance between the extra-data field and its position in the scaled offset */
    const int32_t offsetShift=_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS;
    /* 10 offset bits, already in their scaled position */
    const int32_t offsetMask=0x3ff<<UTRIE_SURROGATE_BLOCK_BITS;

    if(!isNorm32LeadSurrogate(norm32)) {
        return 0;
    }

    return UTRIE_BMP_INDEX_LENGTH+(((int32_t)norm32>>offsetShift)&offsetMask);
}
/*
 * auxTrie folding-offset callback.
 *
 * If bit 31 of the 32-bit trie result is set, then the folding offset is
 * stored in bits 29..20 (_NORM_AUX_FNC_MASK). It is returned shifted down
 * so that it is scaled by 1<<UTRIE_SURROGATE_BLOCK_BITS.
 * If bit 31 is not set, 0 is returned.
 */
static int32_t U_CALLCONV
getFoldingAuxOffset(uint32_t data) {
    const int32_t offsetShift=_NORM_AUX_FNC_SHIFT-UTRIE_SURROGATE_BLOCK_BITS;

    /* bit 31 clear: the value seen as a signed integer is not negative */
    if((int32_t)data>=0) {
        return 0;
    }

    return (int32_t)(data&_NORM_AUX_FNC_MASK)>>offsetShift;
}
static UBool U_CALLCONV
isAcceptable(void * /* context */,
const char * /* type */, const char * /* name */,
@ -151,6 +177,7 @@ isAcceptable(void * /* context */,
pInfo->formatVersion[2]==UTRIE_SHIFT &&
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
) {
uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
return TRUE;
} else {
@ -164,9 +191,10 @@ static int8_t
loadNormData(UErrorCode &errorCode) {
/* load Unicode normalization data from file */
if(haveNormData==0) {
UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 };
UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
UDataMemory *data;
const int32_t *p=NULL;
const uint8_t *pb;
if(&errorCode==NULL || U_FAILURE(errorCode)) {
return 0;
@ -180,13 +208,19 @@ loadNormData(UErrorCode &errorCode) {
}
p=(const int32_t *)udata_getMemory(data);
pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
_normTrie.getFoldingOffset=getFoldingNormOffset;
pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
_auxTrie.getFoldingOffset=getFoldingAuxOffset;
}
utrie_unserialize(&_normTrie, (uint8_t *)(p+_NORM_INDEX_TOP), p[_NORM_INDEX_TRIE_SIZE], &errorCode);
utrie_unserialize(
&_fcdTrie,
(uint8_t *)(p+_NORM_INDEX_TOP)+p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2,
p[_NORM_INDEX_FCD_TRIE_SIZE],
&errorCode);
if(U_FAILURE(errorCode)) {
dataErrorCode=errorCode;
udata_close(data);
@ -202,6 +236,7 @@ loadNormData(UErrorCode &errorCode) {
uprv_memcpy(&indexes, p, sizeof(indexes));
uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
} else {
p=(const int32_t *)udata_getMemory(normData);
}
@ -210,6 +245,7 @@ loadNormData(UErrorCode &errorCode) {
/* initialize some variables */
extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
haveNormData=1;
/* if a different thread set it first, then close the extra data */
@ -488,20 +524,26 @@ u_getCombiningClass(UChar32 c) {
if(_haveData(errorCode)) {
uint32_t norm32;
if((uint32_t)c<=0xffff) {
norm32=_getNorm32((UChar)c);
} else {
norm32=_getNorm32(UTF16_LEAD(c));
if((norm32&_NORM_CC_MASK)!=0) {
norm32=_getNorm32FromSurrogatePair(norm32, UTF16_TRAIL(c));
}
}
UTRIE_GET32(&normTrie, c, norm32);
return (uint8_t)(norm32>>_NORM_CC_SHIFT);
} else {
return 0;
}
}
/*
 * Look up the composition exclusion bit for c in the auxiliary trie.
 *
 * Returns TRUE if the auxTrie value for c has _NORM_AUX_COMP_EX_MASK set.
 * Returns FALSE if the normalization data is not available or if its
 * format version is older than 2.1 (formatVersion_2_1 is FALSE).
 */
U_CAPI UBool U_EXPORT2
unorm_internalIsFullCompositionExclusion(UChar32 c) {
    UErrorCode errorCode=U_ZERO_ERROR;
    uint32_t auxValue;

    /* check for the data first; the version flag is only read once the data is there */
    if(!_haveData(errorCode) || !formatVersion_2_1) {
        return FALSE;
    }

    UTRIE_GET32(&auxTrie, c, auxValue);
    return (UBool)((auxValue&_NORM_AUX_COMP_EX_MASK)!=0);
}
/* reorder UTF-16 in-place -------------------------------------------------- */
/*

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001, International Business Machines
* Copyright (C) 2001-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -71,6 +71,20 @@ enum {
#define _NORM_MIN_JAMO_V 0xfff20000
#define _NORM_JAMO_V_TOP 0xfff30000
/* value constants for auxTrie */
enum {
_NORM_AUX_CANON_FLAG_SHIFT=11,
_NORM_AUX_FNC_SHIFT=20,
_NORM_AUX_COMP_EX_SHIFT=30,
_NORM_AUX_IS_LEAD_SHIFT=31
};
#define _NORM_AUX_MAX_FNC ((int32_t)1<<(_NORM_AUX_COMP_EX_SHIFT-_NORM_AUX_FNC_SHIFT))
#define _NORM_AUX_CANON_SET_MASK (((uint32_t)1<<_NORM_AUX_CANON_FLAG_SHIFT)-1)
#define _NORM_AUX_FNC_MASK ((uint32_t)(_NORM_AUX_MAX_FNC-1)<<_NORM_AUX_FNC_SHIFT)
#define _NORM_AUX_COMP_EX_MASK ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
#define _NORM_AUX_IS_LEAD_MASK ((uint32_t)1<<_NORM_AUX_IS_LEAD_SHIFT)
/* indexes[] value names */
enum {
@ -89,6 +103,9 @@ enum {
_NORM_INDEX_FCD_TRIE_SIZE, /* number of bytes in FCD trie */
_NORM_INDEX_AUX_TRIE_SIZE, /* number of bytes in the auxiliary trie */
_NORM_INDEX_UNICODE_SET_COUNT, /* number of int32_t in the UnicodeSet array */
_NORM_INDEX_TOP=32 /* changing this requires a new formatVersion */
};
@ -223,10 +240,18 @@ U_NAMESPACE_END
#endif
/**
* Description of the format of unorm.dat version 2.0.
* internal API, used by uprops.cpp
* @internal
*/
U_CAPI UBool U_EXPORT2
unorm_internalIsFullCompositionExclusion(UChar32 c);
/**
* Description of the format of unorm.dat version 2.1.
*
* Main change from version 1 to version 2:
* Use of new, common UTrie instead of normalization-specific tries.
* Change to version 2.1: add third/auxiliary trie with associated data.
*
* For more details of how to use the data structures see the code
* in unorm.cpp (runtime normalization code) and
@ -244,6 +269,8 @@ U_NAMESPACE_END
* UTrie normTrie; -- size in bytes=indexes[_NORM_INDEX_TRIE_SIZE]
*
* uint16_t extraData[extraDataTop]; -- extraDataTop=indexes[_NORM_INDEX_UCHAR_COUNT]
* extraData[0] contains the number of units for
* FC_NFKC_Closure (formatVersion>=2.1)
*
* uint16_t combiningTable[combiningTableTop]; -- combiningTableTop=indexes[_NORM_INDEX_COMBINE_DATA_COUNT]
* combiningTableTop may include one 16-bit padding unit
@ -251,6 +278,10 @@ U_NAMESPACE_END
*
* UTrie fcdTrie; -- size in bytes=indexes[_NORM_INDEX_FCD_TRIE_SIZE]
*
* UTrie auxTrie; -- size in bytes=indexes[_NORM_INDEX_AUX_TRIE_SIZE]
*
* int32_t unicodeSets[unicodeSetsTop] -- unicodeSetsTop=indexes[_NORM_INDEX_UNICODE_SET_COUNT]
*
*
* The indexes array contains lengths and sizes of the following arrays and structures
* as well as the following values:
@ -417,6 +448,34 @@ U_NAMESPACE_END
* This is done only if the 16-bit data word is not zero.
* If the code unit is a leading surrogate and the data word is not zero,
* then instead of cc's it contains the offset for the second trie lookup.
*
*
* - Auxiliary trie and data
*
* The auxiliary 32-bit trie contains data for additional properties.
* Bits
* 31 set if lead surrogate offset
* 30 composition exclusion
* 29..20 index into extraData[] to FC_NFKC_Closure string (bit 31==0),
* or lead surrogate offset (bit 31==1)
* 19..16 skippable flags
* 15..13 reserved
* 11 flag: not a safe starter for canonical closure
* 10.. 0 index to UnicodeSet for canonical closure
*
* - FC_NFKC_Closure strings in extraData[]
*
* Strings are stored either as a single code unit (a unit below 0xff00)
* or as a length unit (0xff00 or above, with the length in its low 8 bits)
* followed by that many units.
* const UChar *s=extraData+(index from auxTrie data bits 29..20);
* int32_t length;
* if(*s<0xff00) {
* // s points to the single-unit string
* length=1;
* } else {
* length=*s&0xff;
* ++s;
* }
*/
#endif