mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-19 11:45:45 +00:00
ICU-1586 use new UTrie in normalization
X-SVN-Rev: 7354
This commit is contained in:
parent
81befd9d61
commit
14e0d7c46c
4 changed files with 236 additions and 514 deletions
|
@ -109,22 +109,14 @@ static UDataMemory *normData=NULL;
|
|||
static UErrorCode dataErrorCode=U_ZERO_ERROR;
|
||||
static int8_t haveNormData=0;
|
||||
|
||||
static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
|
||||
static UTrie normTrie={ 0 }, fcdTrie={ 0 };
|
||||
|
||||
/*
|
||||
* pointers into the memory-mapped unorm.dat
|
||||
*/
|
||||
static const uint16_t *indexes=NULL,
|
||||
*normTrieIndex=NULL, *extraData=NULL,
|
||||
*combiningTable=NULL,
|
||||
*fcdTrieIndex=NULL;
|
||||
|
||||
/*
|
||||
* note that there is no uint32_t *normTrieData:
|
||||
* the indexes in the trie are adjusted so that they point to the data based on
|
||||
* (uint32_t *)normTrieIndex - this saves one variable at runtime
|
||||
*/
|
||||
#define normTrieData ((uint32_t *)normTrieIndex)
|
||||
|
||||
/* similarly for the FCD trie index and data - but both are uint16_t * */
|
||||
static const uint16_t *extraData=NULL,
|
||||
*combiningTable=NULL;
|
||||
|
||||
/* the Unicode version of the normalization data */
|
||||
static UVersionInfo dataVersion={ 3, 1, 0, 0 };
|
||||
|
@ -155,8 +147,9 @@ isAcceptable(void * /* context */,
|
|||
pInfo->dataFormat[1]==0x6f &&
|
||||
pInfo->dataFormat[2]==0x72 &&
|
||||
pInfo->dataFormat[3]==0x6d &&
|
||||
pInfo->formatVersion[0]==1 &&
|
||||
pInfo->formatVersion[3]==_NORM_TRIE_SHIFT
|
||||
pInfo->formatVersion[0]==2 &&
|
||||
pInfo->formatVersion[2]==UTRIE_SHIFT &&
|
||||
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
|
||||
) {
|
||||
uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
|
||||
return TRUE;
|
||||
|
@ -171,8 +164,9 @@ static int8_t
|
|||
loadNormData(UErrorCode &errorCode) {
|
||||
/* load Unicode normalization data from file */
|
||||
if(haveNormData==0) {
|
||||
UTrie _normTrie={ 0 }, _fcdTrie={ 0 };
|
||||
UDataMemory *data;
|
||||
const uint16_t *p=NULL;
|
||||
const int32_t *p=NULL;
|
||||
|
||||
if(&errorCode==NULL || U_FAILURE(errorCode)) {
|
||||
return 0;
|
||||
|
@ -185,23 +179,37 @@ loadNormData(UErrorCode &errorCode) {
|
|||
return haveNormData=-1;
|
||||
}
|
||||
|
||||
p=(const uint16_t *)udata_getMemory(data);
|
||||
p=(const int32_t *)udata_getMemory(data);
|
||||
|
||||
utrie_unserialize(&_normTrie, (uint8_t *)(p+_NORM_INDEX_TOP), p[_NORM_INDEX_TRIE_SIZE], &errorCode);
|
||||
utrie_unserialize(
|
||||
&_fcdTrie,
|
||||
(uint8_t *)(p+_NORM_INDEX_TOP)+p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2,
|
||||
p[_NORM_INDEX_FCD_TRIE_SIZE],
|
||||
&errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
dataErrorCode=errorCode;
|
||||
udata_close(data);
|
||||
return haveNormData=-1;
|
||||
}
|
||||
|
||||
/* in the mutex block, set the data for this process */
|
||||
umtx_lock(NULL);
|
||||
if(normData==NULL) {
|
||||
normData=data;
|
||||
data=NULL;
|
||||
indexes=p;
|
||||
p=NULL;
|
||||
|
||||
uprv_memcpy(&indexes, p, sizeof(indexes));
|
||||
uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
|
||||
uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
|
||||
} else {
|
||||
p=(const int32_t *)udata_getMemory(normData);
|
||||
}
|
||||
umtx_unlock(NULL);
|
||||
|
||||
/* initialize some variables */
|
||||
normTrieIndex=indexes+indexes[_NORM_INDEX_COUNT];
|
||||
extraData=normTrieIndex+indexes[_NORM_INDEX_TRIE_INDEX_COUNT]+2*indexes[_NORM_INDEX_TRIE_DATA_COUNT];
|
||||
extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
|
||||
combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
|
||||
fcdTrieIndex=combiningTable+indexes[_NORM_INDEX_COMBINE_DATA_COUNT];
|
||||
haveNormData=1;
|
||||
|
||||
/* if a different thread set it first, then close the extra data */
|
||||
|
@ -231,7 +239,7 @@ unorm_haveData(UErrorCode *pErrorCode) {
|
|||
U_CAPI const uint16_t * U_EXPORT2
|
||||
unorm_getFCDTrie(UErrorCode *pErrorCode) {
|
||||
if(_haveData(*pErrorCode)) {
|
||||
return fcdTrieIndex;
|
||||
return fcdTrie.index;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
|
@ -241,29 +249,20 @@ unorm_getFCDTrie(UErrorCode *pErrorCode) {
|
|||
|
||||
static inline uint32_t
|
||||
_getNorm32(UChar c) {
|
||||
return
|
||||
normTrieData[
|
||||
normTrieIndex[
|
||||
c>>_NORM_TRIE_SHIFT
|
||||
]+
|
||||
(c&_NORM_STAGE_2_MASK)
|
||||
];
|
||||
return UTRIE_GET32_FROM_LEAD(&normTrie, c);
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
_getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) {
|
||||
/* the surrogate index in norm32 is an offset over the BMP top of stage 1 */
|
||||
uint32_t c=
|
||||
((norm32>>(_NORM_EXTRA_SHIFT-10))&0xffc00)|
|
||||
(c2&0x3ff);
|
||||
return
|
||||
normTrieData[
|
||||
normTrieIndex[
|
||||
_NORM_STAGE_1_BMP_COUNT+
|
||||
(c>>_NORM_TRIE_SHIFT)
|
||||
]+
|
||||
(c&_NORM_STAGE_2_MASK)
|
||||
];
|
||||
/*
|
||||
* the surrogate index in norm32 stores only the number of the surrogate index block
|
||||
* see gennorm/store.c/getFoldedNormValue()
|
||||
*/
|
||||
norm32=
|
||||
UTRIE_BMP_INDEX_LENGTH+
|
||||
((norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
|
||||
(0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
|
||||
return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie, norm32, c2);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -282,28 +281,13 @@ _getNorm32(const UChar *p, uint32_t mask) {
|
|||
|
||||
static inline uint16_t
|
||||
_getFCD16(UChar c) {
|
||||
return
|
||||
fcdTrieIndex[
|
||||
fcdTrieIndex[
|
||||
c>>_NORM_TRIE_SHIFT
|
||||
]+
|
||||
(c&_NORM_STAGE_2_MASK)
|
||||
];
|
||||
return UTRIE_GET16_FROM_LEAD(&fcdTrie, c);
|
||||
}
|
||||
|
||||
static inline uint16_t
|
||||
_getFCD16FromSurrogatePair(uint16_t fcd16, UChar c2) {
|
||||
/* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
|
||||
uint32_t c=
|
||||
((uint32_t)fcd16<<10)|
|
||||
(c2&0x3ff);
|
||||
return
|
||||
fcdTrieIndex[
|
||||
fcdTrieIndex[
|
||||
c>>_NORM_TRIE_SHIFT
|
||||
]+
|
||||
(c&_NORM_STAGE_2_MASK)
|
||||
];
|
||||
return UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie, fcd16, c2);
|
||||
}
|
||||
|
||||
static inline const uint16_t *
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#ifdef XP_CPLUSPLUS
|
||||
# include "unicode/chariter.h"
|
||||
#endif
|
||||
#include "utrie.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
/*
|
||||
|
@ -30,27 +31,6 @@
|
|||
* The format of that file is described at the end of this file.
|
||||
*/
|
||||
|
||||
/* trie constants */
|
||||
enum {
|
||||
/**
|
||||
* Normalization trie table shift value.
|
||||
* This value must be <=10:
|
||||
* above 10, a lead surrogate's block is smaller than a stage 2 block
|
||||
*/
|
||||
_NORM_TRIE_SHIFT=5,
|
||||
|
||||
_NORM_STAGE_2_BLOCK_COUNT=1<<_NORM_TRIE_SHIFT,
|
||||
_NORM_STAGE_2_MASK=_NORM_STAGE_2_BLOCK_COUNT-1,
|
||||
|
||||
_NORM_STAGE_1_BMP_COUNT=(1<<(16-_NORM_TRIE_SHIFT)),
|
||||
|
||||
_NORM_SURROGATE_BLOCK_BITS=10-_NORM_TRIE_SHIFT,
|
||||
_NORM_SURROGATE_BLOCK_COUNT=(1<<_NORM_SURROGATE_BLOCK_BITS)
|
||||
};
|
||||
|
||||
/* this may be >0xffff and may not work as an enum */
|
||||
#define _NORM_STAGE_1_MAX_COUNT (0x110000>>_NORM_TRIE_SHIFT)
|
||||
|
||||
/* norm32 value constants */
|
||||
enum {
|
||||
/* quick check flags 0..3 set mean "no" for their forms */
|
||||
|
@ -96,26 +76,22 @@ enum {
|
|||
|
||||
/* indexes[] value names */
|
||||
enum {
|
||||
_NORM_INDEX_COUNT,
|
||||
_NORM_INDEX_TRIE_SHIFT,
|
||||
_NORM_INDEX_TRIE_INDEX_COUNT,
|
||||
_NORM_INDEX_TRIE_DATA_COUNT,
|
||||
_NORM_INDEX_UCHAR_COUNT,
|
||||
_NORM_INDEX_TRIE_SIZE, /* number of bytes in normalization trie */
|
||||
_NORM_INDEX_UCHAR_COUNT, /* number of UChars in extra data */
|
||||
|
||||
_NORM_INDEX_COMBINE_DATA_COUNT,
|
||||
_NORM_INDEX_COMBINE_FWD_COUNT,
|
||||
_NORM_INDEX_COMBINE_BOTH_COUNT,
|
||||
_NORM_INDEX_COMBINE_BACK_COUNT,
|
||||
_NORM_INDEX_COMBINE_DATA_COUNT, /* number of uint16_t words for combining data */
|
||||
_NORM_INDEX_COMBINE_FWD_COUNT, /* number of code points that combine forward */
|
||||
_NORM_INDEX_COMBINE_BOTH_COUNT, /* number of code points that combine forward and backward */
|
||||
_NORM_INDEX_COMBINE_BACK_COUNT, /* number of code points that combine backward */
|
||||
|
||||
_NORM_INDEX_MIN_NFC_NO_MAYBE,
|
||||
_NORM_INDEX_MIN_NFKC_NO_MAYBE,
|
||||
_NORM_INDEX_MIN_NFD_NO_MAYBE,
|
||||
_NORM_INDEX_MIN_NFKD_NO_MAYBE,
|
||||
_NORM_INDEX_MIN_NFC_NO_MAYBE, /* first code point with quick check NFC NO/MAYBE */
|
||||
_NORM_INDEX_MIN_NFKC_NO_MAYBE, /* first code point with quick check NFKC NO/MAYBE */
|
||||
_NORM_INDEX_MIN_NFD_NO_MAYBE, /* first code point with quick check NFD NO/MAYBE */
|
||||
_NORM_INDEX_MIN_NFKD_NO_MAYBE, /* first code point with quick check NFKD NO/MAYBE */
|
||||
|
||||
_NORM_INDEX_FCD_TRIE_INDEX_COUNT,
|
||||
_NORM_INDEX_FCD_TRIE_DATA_COUNT,
|
||||
_NORM_INDEX_FCD_TRIE_SIZE, /* number of bytes in FCD trie */
|
||||
|
||||
_NORM_INDEX_TOP=16
|
||||
_NORM_INDEX_TOP=32 /* changing this requires a new formatVersion */
|
||||
};
|
||||
|
||||
enum {
|
||||
|
@ -215,10 +191,10 @@ inline uint16_t
|
|||
unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
|
||||
return
|
||||
fcdTrieIndex[
|
||||
fcdTrieIndex[
|
||||
c>>_NORM_TRIE_SHIFT
|
||||
]+
|
||||
(c&_NORM_STAGE_2_MASK)
|
||||
(fcdTrieIndex[
|
||||
c>>UTRIE_SHIFT
|
||||
]<<UTRIE_INDEX_SHIFT)+
|
||||
(c&UTRIE_MASK)
|
||||
];
|
||||
}
|
||||
|
||||
|
@ -235,16 +211,12 @@ unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
|
|||
*/
|
||||
inline uint16_t
|
||||
unorm_getFCD16FromSurrogatePair(const uint16_t *fcdTrieIndex, uint16_t fcd16, UChar c2) {
|
||||
/* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
|
||||
uint32_t c=
|
||||
((uint32_t)fcd16<<10)|
|
||||
(c2&0x3ff);
|
||||
return
|
||||
fcdTrieIndex[
|
||||
fcdTrieIndex[
|
||||
c>>_NORM_TRIE_SHIFT
|
||||
]+
|
||||
(c&_NORM_STAGE_2_MASK)
|
||||
(fcdTrieIndex[
|
||||
(int32_t)fcd16+((c2&0x3ff)>>UTRIE_SHIFT)
|
||||
]<<UTRIE_INDEX_SHIFT)+
|
||||
(c2&UTRIE_MASK)
|
||||
];
|
||||
}
|
||||
|
||||
|
@ -386,7 +358,7 @@ unorm_previousNormalize(UChar *dest, int32_t destCapacity,
|
|||
U_CDECL_END
|
||||
|
||||
/**
|
||||
* Description of the format of unorm.dat.
|
||||
* Description of the format of unorm.dat. ### TODO describe 2.0 with UTrie
|
||||
*
|
||||
* For more details of how to use the data structures see the code
|
||||
* in unorm.cpp (runtime normalization code) and
|
||||
|
|
|
@ -32,6 +32,7 @@ typedef struct Norm {
|
|||
uint8_t qcFlags, combiningFlags;
|
||||
uint16_t canonBothCCs, compatBothCCs, combiningIndex, specialTag;
|
||||
uint32_t *nfd, *nfkd;
|
||||
uint32_t value32; /* temporary variable for generating runtime norm32 and fcd values */
|
||||
} Norm;
|
||||
|
||||
/* global flags */
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
#include "cstring.h"
|
||||
#include "filestrm.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "utrie.h"
|
||||
#include "unewdata.h"
|
||||
#include "unormimp.h"
|
||||
#include "gennorm.h"
|
||||
|
@ -53,8 +54,8 @@ static UDataInfo dataInfo={
|
|||
0,
|
||||
|
||||
{ 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
|
||||
{1, 0, 0, _NORM_TRIE_SHIFT}, /* formatVersion - [3] contains the trie shift! */
|
||||
{3, 1, 0, 0} /* dataVersion (Unicode version) */
|
||||
{ 2, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
|
||||
{ 3, 1, 0, 0 } /* dataVersion (Unicode version) */
|
||||
};
|
||||
|
||||
extern void
|
||||
|
@ -64,7 +65,7 @@ setUnicodeVersion(const char *v) {
|
|||
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
||||
}
|
||||
|
||||
static uint16_t indexes[_NORM_INDEX_TOP]={ 0 };
|
||||
static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
|
||||
|
||||
/* tool memory helper ------------------------------------------------------- */
|
||||
|
||||
|
@ -102,7 +103,6 @@ utm_open(const char *name, uint32_t count, uint32_t size) {
|
|||
return mem;
|
||||
}
|
||||
|
||||
/* we don't use this - we don't clean up memory here... */
|
||||
static void
|
||||
utm_close(UToolMemory *mem) {
|
||||
if(mem!=NULL) {
|
||||
|
@ -147,10 +147,9 @@ utm_allocN(UToolMemory *mem, int32_t n) {
|
|||
|
||||
typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
|
||||
|
||||
static UToolMemory *stage2Mem, *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
|
||||
static UNewTrie normTrie={ 0 }, fcdTrie={ 0 };
|
||||
|
||||
static uint16_t stage1[_NORM_STAGE_1_MAX_COUNT], fcdStage1[_NORM_STAGE_1_MAX_COUNT];
|
||||
static uint16_t *stage2;
|
||||
static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
|
||||
|
||||
static Norm *norms;
|
||||
|
||||
|
@ -173,21 +172,13 @@ typedef struct CombiningTriple {
|
|||
static uint16_t combiningTable[0x8000];
|
||||
static uint16_t combiningTableTop=0;
|
||||
|
||||
/* stage 2 table after turning Norm structs into 32-bit words */
|
||||
static uint32_t *norm32Table=NULL, *fcdTable=NULL;
|
||||
|
||||
/* number of units used in stage 1 and norm32Table, and same for FCD */
|
||||
static uint16_t stage1Top, fcdStage1Top,
|
||||
norm32TableTop, fcdTableTop;
|
||||
|
||||
extern void
|
||||
init() {
|
||||
/* reset stage 1 of the trie */
|
||||
uprv_memset(stage1, 0, sizeof(stage1));
|
||||
|
||||
/* allocate stage 2 of the trie and reset the first block */
|
||||
stage2Mem=utm_open("gennorm trie stage 2", 30000, sizeof(*stage2));
|
||||
stage2=utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT);
|
||||
/* initialize the two tries */
|
||||
if(NULL==utrie_open(&normTrie, NULL, 30000, 0, FALSE)) {
|
||||
fprintf(stderr, "error: failed to initialize tries\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
|
||||
/* allocate Norm structures and reset the first one */
|
||||
normMem=utm_open("gennorm normalization structs", 20000, sizeof(Norm));
|
||||
|
@ -212,24 +203,6 @@ init() {
|
|||
indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
|
||||
}
|
||||
|
||||
/* get or create a block in stage 2 of the trie */
|
||||
static uint16_t
|
||||
createStage2Block(uint32_t code) {
|
||||
uint32_t i;
|
||||
uint16_t j;
|
||||
|
||||
i=code>>_NORM_TRIE_SHIFT;
|
||||
j=stage1[i];
|
||||
if(j==0) {
|
||||
/* allocate a stage 2 block */
|
||||
uint16_t *p;
|
||||
|
||||
p=(uint16_t *)utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT);
|
||||
stage1[i]=j=(uint16_t)(p-stage2);
|
||||
}
|
||||
return j;
|
||||
}
|
||||
|
||||
/*
|
||||
* get or create a Norm unit;
|
||||
* get or create the intermediate trie entries for it as well
|
||||
|
@ -237,16 +210,18 @@ createStage2Block(uint32_t code) {
|
|||
static Norm *
|
||||
createNorm(uint32_t code) {
|
||||
Norm *p;
|
||||
uint16_t stage2Block, k;
|
||||
uint32_t i;
|
||||
|
||||
stage2Block=createStage2Block(code);
|
||||
k=(uint16_t)(stage2Block+(code&_NORM_STAGE_2_MASK));
|
||||
if(stage2[k]==0) {
|
||||
i=utrie_get32(&normTrie, (UChar32)code, NULL);
|
||||
if(i!=0) {
|
||||
p=norms+i;
|
||||
} else {
|
||||
/* allocate Norm */
|
||||
p=(Norm *)utm_alloc(normMem);
|
||||
stage2[k]=(uint16_t)(p-norms);
|
||||
} else {
|
||||
p=norms+stage2[k];
|
||||
if(!utrie_set32(&normTrie, (UChar32)code, (uint32_t)(p-norms))) {
|
||||
fprintf(stderr, "error: too many normalization entries\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
@ -255,23 +230,12 @@ createNorm(uint32_t code) {
|
|||
static Norm *
|
||||
getNorm(uint32_t code) {
|
||||
uint32_t i;
|
||||
uint16_t j;
|
||||
|
||||
/* access stage 1 and get the stage 2 block start index */
|
||||
i=code>>_NORM_TRIE_SHIFT;
|
||||
j=stage1[i];
|
||||
if(j==0) {
|
||||
i=utrie_get32(&normTrie, (UChar32)code, NULL);
|
||||
if(i==0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* access stage 2 and get the Norm unit */
|
||||
i=(uint16_t)(j+(code&_NORM_STAGE_2_MASK));
|
||||
j=stage2[i];
|
||||
if(j==0) {
|
||||
return NULL;
|
||||
} else {
|
||||
return norms+j;
|
||||
}
|
||||
return norms+i;
|
||||
}
|
||||
|
||||
/* get the canonical combining class of a character */
|
||||
|
@ -291,24 +255,20 @@ getCCFromCP(uint32_t code) {
|
|||
*/
|
||||
static uint32_t
|
||||
enumTrie(EnumTrieFn *fn, void *context) {
|
||||
uint32_t code, count, i;
|
||||
uint16_t j, k, l;
|
||||
uint32_t count, i;
|
||||
UChar32 code;
|
||||
UBool isInBlockZero;
|
||||
|
||||
code=0;
|
||||
count=0;
|
||||
for(i=0; i<_NORM_STAGE_1_MAX_COUNT; ++i) {
|
||||
j=stage1[i];
|
||||
if(j!=0) {
|
||||
for(k=0; k<_NORM_STAGE_2_BLOCK_COUNT; ++k) {
|
||||
l=stage2[j+k];
|
||||
if(l!=0) {
|
||||
fn(context, code, norms+l);
|
||||
++count;
|
||||
}
|
||||
++code;
|
||||
}
|
||||
for(code=0; code<=0x10ffff;) {
|
||||
i=utrie_get32(&normTrie, code, &isInBlockZero);
|
||||
if(isInBlockZero) {
|
||||
code+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else {
|
||||
code+=_NORM_STAGE_2_BLOCK_COUNT;
|
||||
if(i!=0) {
|
||||
fn(context, (uint32_t)code, norms+i);
|
||||
++count;
|
||||
}
|
||||
++code;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
|
@ -806,9 +766,7 @@ setCompositionExclusion(uint32_t code) {
|
|||
static void
|
||||
setHangulJamoSpecials() {
|
||||
Norm *norm;
|
||||
uint16_t *pStage2Block;
|
||||
uint32_t c;
|
||||
uint16_t i;
|
||||
|
||||
/*
|
||||
* Hangul syllables are algorithmically decomposed into Jamos,
|
||||
|
@ -842,34 +800,9 @@ setHangulJamoSpecials() {
|
|||
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL;
|
||||
norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD;
|
||||
|
||||
/* set one complete stage 2 block with this Hangul information */
|
||||
pStage2Block=(uint16_t *)utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT);
|
||||
for(i=0; i<_NORM_STAGE_2_BLOCK_COUNT; ++i) {
|
||||
pStage2Block[i]=(uint16_t)(norm-norms);
|
||||
}
|
||||
|
||||
/* set these data for U+ac00..U+d7a3 */
|
||||
c=0xac00;
|
||||
|
||||
/* set a partial stage 2 block before pStage2Block can be repeated */
|
||||
if(c&_NORM_STAGE_2_MASK) {
|
||||
i=(uint16_t)(createStage2Block(c)+(c&_NORM_STAGE_2_MASK));
|
||||
do {
|
||||
stage2[i++]=(uint16_t)(norm-norms);
|
||||
} while(++c&_NORM_STAGE_2_MASK);
|
||||
}
|
||||
|
||||
/* set full stage 1 blocks to the common stage 2 block */
|
||||
while(c<(0xd7a3&~_NORM_STAGE_2_MASK)) {
|
||||
stage1[c>>_NORM_TRIE_SHIFT]=(uint16_t)(pStage2Block-stage2);
|
||||
c+=_NORM_STAGE_2_BLOCK_COUNT;
|
||||
}
|
||||
|
||||
/* set a partial stage 2 block after the repetition */
|
||||
i=createStage2Block(c);
|
||||
while(c<=0xd7a3) {
|
||||
stage2[i++]=(uint16_t)(norm-norms);
|
||||
++c;
|
||||
if(!utrie_setRange32(&normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) {
|
||||
fprintf(stderr, "error: too many normalization entries (setting Hangul)\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -938,10 +871,10 @@ postParseFn(void *context, uint32_t code, Norm *norm) {
|
|||
|
||||
/* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
|
||||
if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) {
|
||||
printf("U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags);
|
||||
fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags);
|
||||
}
|
||||
if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) {
|
||||
printf("U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
|
||||
fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
|
||||
}
|
||||
|
||||
/* ### see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
|
||||
|
@ -950,21 +883,21 @@ postParseFn(void *context, uint32_t code, Norm *norm) {
|
|||
if(norm->combiningFlags&1) {
|
||||
if(norm->udataCC!=0) {
|
||||
/* illegal - data-derivable composition exclusion */
|
||||
printf("U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC);
|
||||
fprintf(stderr, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC);
|
||||
}
|
||||
}
|
||||
if(norm->combiningFlags&2) {
|
||||
if((norm->qcFlags&0x11)==0) {
|
||||
printf("U+%04lx combines backward but qcNF?C==0\n", (long)code);
|
||||
fprintf(stderr, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code);
|
||||
}
|
||||
#if 0
|
||||
/* occurs sometimes */
|
||||
/* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
|
||||
if(norm->udataCC==0) {
|
||||
printf("U+%04lx combines backward but udataCC==0\n", (long)code);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
if((norm->combiningFlags&3)==3) {
|
||||
if((norm->combiningFlags&3)==3 && beVerbose) {
|
||||
printf("U+%04lx combines both ways\n", (long)code);
|
||||
}
|
||||
}
|
||||
|
@ -1104,38 +1037,31 @@ make32BitNorm(Norm *norm) {
|
|||
/* turn all Norm structs into corresponding 32-bit norm values */
|
||||
static void
|
||||
makeAll32() {
|
||||
uint16_t i, count;
|
||||
uint32_t *pNormData;
|
||||
uint32_t n;
|
||||
int32_t i, normLength, count;
|
||||
|
||||
/*
|
||||
* allocate and fill the table of 32-bit normalization data
|
||||
* leave space for data for the up to 1024 lead surrogates
|
||||
*/
|
||||
norm32TableTop=(uint16_t)stage2Mem->index;
|
||||
norm32Table=(uint32_t *)uprv_malloc((norm32TableTop+1024)*4);
|
||||
if(norm32Table==NULL) {
|
||||
fprintf(stderr, "error: gennorm - unable to allocate %ld 32-bit words for norm32Table\n",
|
||||
(long)(norm32TableTop+1024));
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
count=(int32_t)normMem->index;
|
||||
for(i=0; i<count; ++i) {
|
||||
norms[i].value32=make32BitNorm(norms+i);
|
||||
}
|
||||
|
||||
/* reset all entries */
|
||||
uprv_memset(norm32Table, 0, (norm32TableTop+1024)*4);
|
||||
pNormData=utrie_getData(&normTrie, &normLength);
|
||||
|
||||
count=0;
|
||||
|
||||
/* skip the first, all-empty block */
|
||||
for(i=_NORM_STAGE_2_BLOCK_COUNT; i<norm32TableTop; ++i) {
|
||||
if(stage2[i]!=0) {
|
||||
if(0!=(norm32Table[i]=make32BitNorm(norms+stage2[i]))) {
|
||||
++count;
|
||||
}
|
||||
for(i=0; i<normLength; ++i) {
|
||||
n=pNormData[i];
|
||||
if(0!=(pNormData[i]=norms[n].value32)) {
|
||||
++count;
|
||||
}
|
||||
}
|
||||
|
||||
printf("count of 16-bit extra data: %lu\n", (long)extraMem->index);
|
||||
printf("count of (uncompacted) non-zero 32-bit words: %lu\n", (long)count);
|
||||
printf("count CC frequencies: same %lu trail %lu two %lu\n",
|
||||
(long)countCCSame, (long)countCCTrail, (long)countCCTwo);
|
||||
if(beVerbose) {
|
||||
printf("count of 16-bit extra data: %lu\n", (long)extraMem->index);
|
||||
printf("count of (uncompacted) non-zero 32-bit words: %lu\n", (long)count);
|
||||
printf("count CC frequencies: same %lu trail %lu two %lu\n",
|
||||
(long)countCCSame, (long)countCCTrail, (long)countCCTwo);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1144,232 +1070,91 @@ makeAll32() {
|
|||
*/
|
||||
static void
|
||||
makeFCD() {
|
||||
static uint16_t map[0x10000>>_NORM_TRIE_SHIFT];
|
||||
Norm *norm;
|
||||
uint32_t i, oredValues;
|
||||
uint16_t bothCCs, delta;
|
||||
uint32_t *pFCDData;
|
||||
uint32_t n;
|
||||
int32_t i, count, fcdLength;
|
||||
uint16_t bothCCs;
|
||||
|
||||
/*
|
||||
* allocate and fill the table of 32-bit normalization data
|
||||
* leave space for data for the up to 1024 lead surrogates
|
||||
*/
|
||||
fcdTableTop=(uint16_t)stage2Mem->index;
|
||||
fcdTable=(uint32_t *)uprv_malloc((fcdTableTop+1024)*4);
|
||||
if(fcdTable==NULL) {
|
||||
fprintf(stderr, "error: gennorm - unable to allocate %ld 32-bit words for fcdTable\n",
|
||||
(long)(fcdTableTop+1024));
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
|
||||
/* reset all entries */
|
||||
uprv_memset(fcdTable, 0, (fcdTableTop+1024)*4);
|
||||
|
||||
/* compact out the all-zero stage 2 blocks */
|
||||
map[0]=0;
|
||||
delta=0;
|
||||
|
||||
/* oredValues detects all-zero stage 2 blocks that will be removed from fcdStage1 */
|
||||
oredValues=0;
|
||||
|
||||
/* skip the first, all-empty block */
|
||||
for(i=_NORM_STAGE_2_BLOCK_COUNT; i<fcdTableTop; ++i) {
|
||||
if(stage2[i]!=0) {
|
||||
norm=norms+stage2[i];
|
||||
bothCCs=norm->canonBothCCs;
|
||||
if(bothCCs==0) {
|
||||
/* if there are no decomposition cc's then use the udataCC twice */
|
||||
bothCCs=norm->udataCC;
|
||||
bothCCs|=bothCCs<<8;
|
||||
}
|
||||
oredValues|=fcdTable[i-delta]=bothCCs;
|
||||
}
|
||||
|
||||
if((i&_NORM_STAGE_2_MASK)==_NORM_STAGE_2_MASK) {
|
||||
/* at the end of a stage 2 block, check if there are any non-zero entries */
|
||||
if(oredValues==0) {
|
||||
/* all zero: skip this block */
|
||||
delta+=_NORM_STAGE_2_BLOCK_COUNT;
|
||||
map[i>>_NORM_TRIE_SHIFT]=(uint16_t)0;
|
||||
} else {
|
||||
/* keep this block */
|
||||
map[i>>_NORM_TRIE_SHIFT]=(uint16_t)((i&~_NORM_STAGE_2_MASK)-delta);
|
||||
oredValues=0;
|
||||
}
|
||||
count=(int32_t)normMem->index;
|
||||
for(i=0; i<count; ++i) {
|
||||
bothCCs=norms[i].canonBothCCs;
|
||||
if(bothCCs==0) {
|
||||
/* if there are no decomposition cc's then use the udataCC twice */
|
||||
bothCCs=norms[i].udataCC;
|
||||
bothCCs|=bothCCs<<8;
|
||||
}
|
||||
norms[i].value32=bothCCs;
|
||||
}
|
||||
|
||||
/* now adjust stage 1 */
|
||||
for(i=0; i<_NORM_STAGE_1_MAX_COUNT; ++i) {
|
||||
fcdStage1[i]=map[fcdStage1[i]>>_NORM_TRIE_SHIFT];
|
||||
pFCDData=utrie_getData(&fcdTrie, &fcdLength);
|
||||
|
||||
for(i=0; i<fcdLength; ++i) {
|
||||
n=pFCDData[i];
|
||||
pFCDData[i]=norms[n].value32;
|
||||
}
|
||||
|
||||
printf("FCD: omitted %u stage 2 entries in all-zero blocks\n", delta);
|
||||
|
||||
/* adjust the table top */
|
||||
fcdTableTop-=delta;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold the supplementary code point data for one lead surrogate.
|
||||
*/
|
||||
static uint16_t
|
||||
foldLeadSurrogate(uint16_t *parent, uint16_t parentCount,
|
||||
uint32_t *stage, uint16_t *pStageCount,
|
||||
uint32_t base,
|
||||
UBool isNorm32) {
|
||||
uint32_t leadNorm32=0;
|
||||
uint32_t i, j, s2;
|
||||
uint32_t leadSurrogate=0xd7c0+(base>>10);
|
||||
/* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
|
||||
static uint32_t U_CALLCONV
|
||||
getFoldedNormValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
||||
uint32_t value, leadNorm32=0;
|
||||
UChar32 limit;
|
||||
UBool inBlockZero;
|
||||
|
||||
printf("supplementary data for lead surrogate U+%04lx\n", (long)leadSurrogate);
|
||||
|
||||
/* calculate the 32-bit data word for the lead surrogate */
|
||||
for(i=0; i<_NORM_SURROGATE_BLOCK_COUNT; ++i) {
|
||||
s2=parent[(base>>_NORM_TRIE_SHIFT)+i];
|
||||
if(s2!=0) {
|
||||
for(j=0; j<_NORM_STAGE_2_BLOCK_COUNT; ++j) {
|
||||
/* basically, or all 32-bit data into the one for the lead surrogate */
|
||||
leadNorm32|=stage[s2+j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(isNorm32) {
|
||||
/* turn multi-bit fields into the worst-case value */
|
||||
if(leadNorm32&_NORM_CC_MASK) {
|
||||
leadNorm32|=_NORM_CC_MASK;
|
||||
}
|
||||
|
||||
/* clean up unnecessarily ored bit fields */
|
||||
leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);
|
||||
|
||||
if(leadNorm32==0) {
|
||||
/* nothing to do (only composition exclusions?) */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
|
||||
leadNorm32|=(
|
||||
(uint32_t)_NORM_EXTRA_INDEX_TOP+
|
||||
(uint32_t)((parentCount-_NORM_STAGE_1_BMP_COUNT)>>_NORM_SURROGATE_BLOCK_BITS)
|
||||
)<<_NORM_EXTRA_SHIFT;
|
||||
} else {
|
||||
if(leadNorm32==0) {
|
||||
/* FCD: nothing to do */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* For FCD, replace the entire combined value by the surrogate index
|
||||
* and make sure that it is not 0 (by not offsetting it by the BMP top,
|
||||
* since here we have enough bits for this);
|
||||
* lead surrogates are tested at runtime on the character code itself
|
||||
* instead on special values of the trie data -
|
||||
* this is because 16 bits in the FCD trie data do not allow for anything
|
||||
* but the two leading and trailing combining classes of the canonical decomposition.
|
||||
*/
|
||||
leadNorm32=parentCount>>_NORM_SURROGATE_BLOCK_BITS;
|
||||
}
|
||||
|
||||
/* enter the lead surrogate's data */
|
||||
s2=parent[leadSurrogate>>_NORM_TRIE_SHIFT];
|
||||
if(s2==0) {
|
||||
/* allocate a new stage 2 block in stage (the memory is there from makeAll32()/makeFCD()) */
|
||||
s2=parent[leadSurrogate>>_NORM_TRIE_SHIFT]=*pStageCount;
|
||||
*pStageCount+=_NORM_STAGE_2_BLOCK_COUNT;
|
||||
}
|
||||
stage[s2+(leadSurrogate&_NORM_STAGE_2_MASK)]=leadNorm32;
|
||||
|
||||
/* move the actual stage 1 indexes from the supplementary position to the new one */
|
||||
uprv_memmove(parent+parentCount, parent+(base>>_NORM_TRIE_SHIFT), _NORM_SURROGATE_BLOCK_COUNT*2);
|
||||
|
||||
/* increment stage 1 top */
|
||||
return _NORM_SURROGATE_BLOCK_COUNT;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold the normalization data for supplementary code points into
|
||||
* a compact area on top of the BMP-part of the trie index,
|
||||
* with the lead surrogates indexing this compact area.
|
||||
*
|
||||
* Use after makeAll32().
|
||||
*/
|
||||
static uint16_t
|
||||
foldSupplementary(uint16_t *parent, uint16_t parentCount,
|
||||
uint32_t *stage, uint16_t *pStageCount,
|
||||
UBool isNorm32) {
|
||||
uint32_t c;
|
||||
uint16_t i;
|
||||
|
||||
/* search for any stage 1 entries for supplementary code points */
|
||||
for(c=0x10000; c<0x110000;) {
|
||||
i=parent[c>>_NORM_TRIE_SHIFT];
|
||||
if(i!=0) {
|
||||
/* there is data, treat the full block for a lead surrogate */
|
||||
c&=~0x3ff;
|
||||
parentCount+=foldLeadSurrogate(parent, parentCount, stage, pStageCount, c, isNorm32);
|
||||
c+=0x400;
|
||||
limit=start+0x400;
|
||||
while(start<limit) {
|
||||
value=utrie_get32(trie, start, &inBlockZero);
|
||||
if(inBlockZero) {
|
||||
start+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else {
|
||||
c+=_NORM_STAGE_2_BLOCK_COUNT;
|
||||
if(value!=0) {
|
||||
leadNorm32|=value;
|
||||
}
|
||||
++start;
|
||||
}
|
||||
}
|
||||
|
||||
printf("trie index count: BMP %u all Unicode %lu folded %u\n",
|
||||
_NORM_STAGE_1_BMP_COUNT, (long)_NORM_STAGE_1_MAX_COUNT, parentCount);
|
||||
return parentCount;
|
||||
/* turn multi-bit fields into the worst-case value */
|
||||
if(leadNorm32&_NORM_CC_MASK) {
|
||||
leadNorm32|=_NORM_CC_MASK;
|
||||
}
|
||||
|
||||
/* clean up unnecessarily ored bit fields */
|
||||
leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);
|
||||
|
||||
if(leadNorm32==0) {
|
||||
/* nothing to do (only composition exclusions?) */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
|
||||
leadNorm32|=(
|
||||
(uint32_t)_NORM_EXTRA_INDEX_TOP+
|
||||
(uint32_t)((offset-UTRIE_BMP_INDEX_LENGTH)>>UTRIE_SURROGATE_BLOCK_BITS)
|
||||
)<<_NORM_EXTRA_SHIFT;
|
||||
|
||||
return leadNorm32;
|
||||
}
|
||||
|
||||
static uint16_t
|
||||
compact(uint16_t *parent, uint16_t parentCount,
|
||||
uint32_t *stage, uint16_t stageCount) {
|
||||
/*
|
||||
* This function is the common implementation for compacting
|
||||
* the stage 2 tables of 32-bit values.
|
||||
* It is a copy of genprops/store.c's compactStage() adapted for the 32-bit stage 2 tables.
|
||||
*/
|
||||
static uint16_t map[0x10000>>_NORM_TRIE_SHIFT];
|
||||
uint32_t x;
|
||||
uint16_t i, start, prevEnd, newStart;
|
||||
/* folding value for FCD: just store the offset (16 bits) if there is any non-0 entry */
|
||||
static uint32_t U_CALLCONV
|
||||
getFoldedFCDValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
||||
uint32_t value;
|
||||
UChar32 limit;
|
||||
UBool inBlockZero;
|
||||
|
||||
map[0]=0;
|
||||
newStart=_NORM_STAGE_2_BLOCK_COUNT;
|
||||
for(start=newStart; start<stageCount;) {
|
||||
prevEnd=(uint16_t)(newStart-1);
|
||||
x=stage[start];
|
||||
if(x==stage[prevEnd]) {
|
||||
/* overlap by at least one */
|
||||
for(i=1; i<_NORM_STAGE_2_BLOCK_COUNT && x==stage[start+i] && x==stage[prevEnd-i]; ++i) {}
|
||||
|
||||
/* overlap by i */
|
||||
map[start>>_NORM_TRIE_SHIFT]=(uint16_t)(newStart-i);
|
||||
|
||||
/* move the non-overlapping indexes to their new positions */
|
||||
start+=i;
|
||||
for(i=(uint16_t)(_NORM_STAGE_2_BLOCK_COUNT-i); i>0; --i) {
|
||||
stage[newStart++]=stage[start++];
|
||||
}
|
||||
} else if(newStart<start) {
|
||||
/* move the indexes to their new positions */
|
||||
map[start>>_NORM_TRIE_SHIFT]=newStart;
|
||||
for(i=_NORM_STAGE_2_BLOCK_COUNT; i>0; --i) {
|
||||
stage[newStart++]=stage[start++];
|
||||
}
|
||||
} else /* no overlap && newStart==start */ {
|
||||
map[start>>_NORM_TRIE_SHIFT]=start;
|
||||
newStart+=_NORM_STAGE_2_BLOCK_COUNT;
|
||||
start=newStart;
|
||||
limit=start+0x400;
|
||||
while(start<limit) {
|
||||
value=utrie_get32(trie, start, &inBlockZero);
|
||||
if(inBlockZero) {
|
||||
start+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else if(value!=0) {
|
||||
return (uint32_t)offset;
|
||||
} else {
|
||||
++start;
|
||||
}
|
||||
}
|
||||
|
||||
/* now adjust the parent table */
|
||||
for(i=0; i<parentCount; ++i) {
|
||||
parent[i]=map[parent[i]>>_NORM_TRIE_SHIFT];
|
||||
}
|
||||
|
||||
/* we saved some space */
|
||||
printf("compacting trie: count of 32-bit words %lu->%lu\n",
|
||||
(long)stageCount, (long)newStart);
|
||||
return newStart;
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern void
|
||||
|
@ -1394,63 +1179,64 @@ processData() {
|
|||
/* add hangul/jamo specials */
|
||||
setHangulJamoSpecials();
|
||||
|
||||
/* copy stage 1 for the FCD trie */
|
||||
uprv_memcpy(fcdStage1, stage1, sizeof(stage1));
|
||||
/* clone the normalization trie to make the FCD trie */
|
||||
if(NULL==utrie_clone(&fcdTrie, &normTrie, NULL, 0)) {
|
||||
fprintf(stderr, "error: unable to clone the normalization trie\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
|
||||
/* --- finalize data for quick checks & normalization: stage1/norm32Table --- */
|
||||
/* --- finalize data for quick checks & normalization --- */
|
||||
|
||||
/* turn the Norm structs (stage2, norms) into 32-bit data words (norm32Table) */
|
||||
makeAll32();
|
||||
|
||||
/* fold supplementary code points into lead surrogates */
|
||||
stage1Top=foldSupplementary(stage1, _NORM_STAGE_1_BMP_COUNT, norm32Table, &norm32TableTop, TRUE);
|
||||
|
||||
/* compact stage 2 */
|
||||
norm32TableTop=compact(stage1, stage1Top, norm32Table, norm32TableTop);
|
||||
|
||||
/* --- finalize data for FCD checks: fcdStage1/fcdTable --- */
|
||||
|
||||
/* FCD data: take Norm.canonBothCCs and store them in the FCD table */
|
||||
makeFCD();
|
||||
|
||||
/* FCD: fold supplementary code points into lead surrogates */
|
||||
fcdStage1Top=foldSupplementary(fcdStage1, _NORM_STAGE_1_BMP_COUNT, fcdTable, &fcdTableTop, FALSE);
|
||||
|
||||
/* FCD: compact stage 2 */
|
||||
fcdTableTop=compact(fcdStage1, fcdStage1Top, fcdTable, fcdTableTop);
|
||||
|
||||
/* ### debug output */
|
||||
if(beVerbose) {
|
||||
#if 0
|
||||
printf("number of stage 2 entries: %ld\n", stage2Mem->index);
|
||||
printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2);
|
||||
printf("number of stage 2 entries: %ld\n", stage2Mem->index);
|
||||
printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2);
|
||||
#endif
|
||||
printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop, combineBothTop, combineBackTop);
|
||||
printf("combining table count: %u\n", combiningTableTop);
|
||||
printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop, combineBothTop, combineBackTop);
|
||||
printf("combining table count: %u\n", combiningTableTop);
|
||||
}
|
||||
}
|
||||
|
||||
extern void
|
||||
generateData(const char *dataDir) {
|
||||
static uint8_t normTrieBlock[100000], fcdTrieBlock[100000];
|
||||
|
||||
UNewDataMemory *pData;
|
||||
uint16_t *p16;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
uint32_t size, dataLength;
|
||||
uint16_t i;
|
||||
int32_t size, normTrieSize, fcdTrieSize, dataLength;
|
||||
|
||||
normTrieSize=utrie_serialize(&normTrie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
fcdTrieSize=utrie_serialize(&fcdTrie, fcdTrieBlock, sizeof(fcdTrieBlock), getFoldedFCDValue, TRUE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
size=
|
||||
_NORM_INDEX_TOP*2+
|
||||
stage1Top*2+
|
||||
norm32TableTop*4+
|
||||
_NORM_INDEX_TOP*4+
|
||||
normTrieSize+
|
||||
extraMem->index*2+
|
||||
combiningTableTop*2+
|
||||
fcdStage1Top*2+
|
||||
fcdTableTop*2;
|
||||
fcdTrieSize;
|
||||
|
||||
printf("size of " DATA_NAME "." DATA_TYPE " contents: %lu bytes\n", (long)size);
|
||||
if(beVerbose) {
|
||||
printf("size of " DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
|
||||
}
|
||||
|
||||
indexes[_NORM_INDEX_COUNT]=_NORM_INDEX_TOP;
|
||||
indexes[_NORM_INDEX_TRIE_SHIFT]=_NORM_TRIE_SHIFT;
|
||||
indexes[_NORM_INDEX_TRIE_INDEX_COUNT]=stage1Top;
|
||||
indexes[_NORM_INDEX_TRIE_DATA_COUNT]=norm32TableTop;
|
||||
indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
|
||||
indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)extraMem->index;
|
||||
|
||||
indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
|
||||
|
@ -1458,26 +1244,9 @@ generateData(const char *dataDir) {
|
|||
indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop);
|
||||
indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop);
|
||||
|
||||
indexes[_NORM_INDEX_FCD_TRIE_INDEX_COUNT]=fcdStage1Top;
|
||||
indexes[_NORM_INDEX_FCD_TRIE_DATA_COUNT]=fcdTableTop;
|
||||
/* the quick check minimum code points are already set */
|
||||
|
||||
/* adjust the stage 1 indexes to offset stage 2 from the beginning of stage 1 */
|
||||
|
||||
/* stage1/norm32Table */
|
||||
for(i=0; i<stage1Top; ++i) {
|
||||
stage1[i]+=stage1Top/2; /* stage 2 is 32-bit indexed */
|
||||
}
|
||||
|
||||
/* fcdStage1/fcdTable */
|
||||
for(i=0; i<fcdStage1Top; ++i) {
|
||||
fcdStage1[i]+=fcdStage1Top; /* FCD stage 2 is 16-bit indexed */
|
||||
}
|
||||
|
||||
/* reduce the contents of fcdTable from 32-bit values to 16-bit values, in-place (destructive!) */
|
||||
p16=(uint16_t *)fcdTable;
|
||||
for(i=0; i<fcdTableTop; ++i) {
|
||||
p16[i]=(uint16_t)fcdTable[i];
|
||||
}
|
||||
indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize;
|
||||
|
||||
/* write the data */
|
||||
pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
|
||||
|
@ -1488,12 +1257,10 @@ generateData(const char *dataDir) {
|
|||
}
|
||||
|
||||
udata_writeBlock(pData, indexes, sizeof(indexes));
|
||||
udata_writeBlock(pData, stage1, stage1Top*2);
|
||||
udata_writeBlock(pData, norm32Table, norm32TableTop*4);
|
||||
udata_writeBlock(pData, normTrieBlock, normTrieSize);
|
||||
udata_writeBlock(pData, utm_getStart(extraMem), extraMem->index*2);
|
||||
udata_writeBlock(pData, combiningTable, combiningTableTop*2);
|
||||
udata_writeBlock(pData, fcdStage1, fcdStage1Top*2);
|
||||
udata_writeBlock(pData, fcdTable, fcdTableTop*2);
|
||||
udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
|
||||
|
||||
/* finish up */
|
||||
dataLength=udata_finish(pData, &errorCode);
|
||||
|
@ -1503,7 +1270,7 @@ generateData(const char *dataDir) {
|
|||
}
|
||||
|
||||
if(dataLength!=size) {
|
||||
fprintf(stderr, "gennorm: data length %lu != calculated size %lu\n",
|
||||
fprintf(stderr, "gennorm error: data length %ld != calculated size %ld\n",
|
||||
(long)dataLength, (long)size);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
|
@ -1511,14 +1278,12 @@ generateData(const char *dataDir) {
|
|||
|
||||
extern void
|
||||
cleanUpData(void) {
|
||||
uprv_free(norm32Table);
|
||||
uprv_free(fcdTable);
|
||||
|
||||
utm_close(stage2Mem);
|
||||
utm_close(normMem);
|
||||
utm_close(utf32Mem);
|
||||
utm_close(extraMem);
|
||||
utm_close(combiningTriplesMem);
|
||||
utrie_close(&normTrie);
|
||||
utrie_close(&fcdTrie);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
Loading…
Add table
Reference in a new issue