mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 00:43:32 +00:00
ICU-3858 remove data for properties that were moved to ucase.icu and ubidi.icu, and simplify remaining structure
X-SVN-Rev: 17074
This commit is contained in:
parent
1f69d77027
commit
aa6cd66256
6 changed files with 351 additions and 1454 deletions
icu4c/source
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
********************************************************************************
|
||||
* Copyright (C) 1996-2004, International Business Machines
|
||||
* Copyright (C) 1996-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
********************************************************************************
|
||||
*
|
||||
|
@ -49,8 +49,7 @@ static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
|
|||
static UVersionInfo dataVersion={ 0, 0, 0, 0 };
|
||||
|
||||
static UTrie propsTrie={ 0 }, propsVectorsTrie={ 0 };
|
||||
static const uint32_t *pData32=NULL, *props32Table=NULL, *exceptionsTable=NULL, *propsVectors=NULL;
|
||||
static const UChar *ucharsTable=NULL;
|
||||
static const uint32_t *pData32=NULL, *propsVectors=NULL;
|
||||
static int32_t countPropsVectors=0, propsVectorsColumns=0;
|
||||
|
||||
static int8_t havePropsData=0; /* == 0 -> Data has not been loaded.
|
||||
|
@ -61,16 +60,6 @@ static int8_t havePropsData=0; /* == 0 -> Data has not been loaded.
|
|||
/* index values loaded from uprops.dat */
|
||||
static int32_t indexes[UPROPS_INDEX_COUNT];
|
||||
|
||||
/* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
|
||||
static int32_t U_CALLCONV
|
||||
getFoldingPropsOffset(uint32_t data) {
|
||||
if(data&0x8000) {
|
||||
return (int32_t)(data&0x7fff);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static UBool U_CALLCONV
|
||||
isAcceptable(void *context,
|
||||
const char *type, const char *name,
|
||||
|
@ -83,7 +72,7 @@ isAcceptable(void *context,
|
|||
pInfo->dataFormat[1]==0x50 &&
|
||||
pInfo->dataFormat[2]==0x72 &&
|
||||
pInfo->dataFormat[3]==0x6f &&
|
||||
pInfo->formatVersion[0]==3 &&
|
||||
pInfo->formatVersion[0]==4 &&
|
||||
pInfo->formatVersion[2]==UTRIE_SHIFT &&
|
||||
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
|
||||
) {
|
||||
|
@ -102,9 +91,6 @@ static UBool U_CALLCONV uchar_cleanup(void)
|
|||
propsData=NULL;
|
||||
}
|
||||
pData32=NULL;
|
||||
props32Table=NULL;
|
||||
exceptionsTable=NULL;
|
||||
ucharsTable=NULL;
|
||||
propsVectors=NULL;
|
||||
countPropsVectors=0;
|
||||
dataErrorCode=U_ZERO_ERROR;
|
||||
|
@ -139,19 +125,12 @@ _openProps(UCharProps *ucp, UErrorCode *pErrorCode) {
|
|||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
ucp->propsTrie.getFoldingOffset=getFoldingPropsOffset;
|
||||
|
||||
/* unserialize the properties vectors trie, if any */
|
||||
if( p[UPROPS_ADDITIONAL_TRIE_INDEX]!=0 &&
|
||||
p[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0
|
||||
) {
|
||||
length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4;
|
||||
length=utrie_unserialize(&ucp->propsVectorsTrie, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
uprv_memset(&ucp->propsVectorsTrie, 0, sizeof(ucp->propsVectorsTrie));
|
||||
} else {
|
||||
ucp->propsVectorsTrie.getFoldingOffset=getFoldingPropsOffset;
|
||||
}
|
||||
/* unserialize the properties vectors trie */
|
||||
length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4;
|
||||
length=utrie_unserialize(&ucp->propsVectorsTrie, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
uprv_memset(&ucp->propsVectorsTrie, 0, sizeof(ucp->propsVectorsTrie));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -190,9 +169,6 @@ uprv_loadPropsData(UErrorCode *pErrorCode) {
|
|||
|
||||
/* initialize some variables */
|
||||
uprv_memcpy(indexes, pData32, sizeof(indexes));
|
||||
props32Table=pData32+indexes[UPROPS_PROPS32_INDEX];
|
||||
exceptionsTable=pData32+indexes[UPROPS_EXCEPTIONS_INDEX];
|
||||
ucharsTable=(const UChar *)(pData32+indexes[UPROPS_EXCEPTIONS_TOP_INDEX]);
|
||||
|
||||
/* additional properties */
|
||||
if(indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0) {
|
||||
|
@ -250,7 +226,7 @@ uprops_swap(const UDataSwapper *ds,
|
|||
pInfo->dataFormat[1]==0x50 &&
|
||||
pInfo->dataFormat[2]==0x72 &&
|
||||
pInfo->dataFormat[3]==0x6f &&
|
||||
pInfo->formatVersion[0]==3 &&
|
||||
(pInfo->formatVersion[0]==3 || pInfo->formatVersion[0]==4) &&
|
||||
pInfo->formatVersion[2]==UTRIE_SHIFT &&
|
||||
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
|
||||
)) {
|
||||
|
@ -360,10 +336,8 @@ uprops_swap(const UDataSwapper *ds,
|
|||
|
||||
/* getting a uint32_t properties word from the data */
|
||||
#define HAVE_DATA (havePropsData>0 || loadPropsData()>0)
|
||||
#define VALIDATE(c) (((uint32_t)(c))<=0x10ffff && HAVE_DATA)
|
||||
#define GET_PROPS_UNSAFE(c, result) \
|
||||
UTRIE_GET16(&propsTrie, c, result); \
|
||||
(result)=props32Table[(result)]
|
||||
UTRIE_GET16(&propsTrie, c, result);
|
||||
#define GET_PROPS(c, result) \
|
||||
if(HAVE_DATA) { \
|
||||
GET_PROPS_UNSAFE(c, result); \
|
||||
|
@ -371,39 +345,6 @@ uprops_swap(const UDataSwapper *ds,
|
|||
(result)=0; \
|
||||
}
|
||||
|
||||
/* finding an exception value */
|
||||
#define HAVE_EXCEPTION_VALUE(flags, index) ((flags)&(1UL<<(index)))
|
||||
|
||||
/* number of bits in an 8-bit integer value */
|
||||
#define EXC_GROUP 8
|
||||
static const uint8_t flagsOffset[256]={
|
||||
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
|
||||
};
|
||||
|
||||
#define ADD_EXCEPTION_OFFSET(flags, index, offset) { \
|
||||
if((index)>=EXC_GROUP) { \
|
||||
(offset)+=flagsOffset[(flags)&((1<<EXC_GROUP)-1)]; \
|
||||
(flags)>>=EXC_GROUP; \
|
||||
(index)-=EXC_GROUP; \
|
||||
} \
|
||||
(offset)+=flagsOffset[(flags)&((1<<(index))-1)]; \
|
||||
}
|
||||
|
||||
U_CFUNC UBool
|
||||
uprv_haveProperties(UErrorCode *pErrorCode) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
|
@ -437,8 +378,7 @@ struct _EnumTypeCallback {
|
|||
|
||||
static uint32_t U_CALLCONV
|
||||
_enumTypeValue(const void *context, uint32_t value) {
|
||||
/* access the general category from the 32-bit properties, and those from the 16-bit trie value */
|
||||
return GET_CATEGORY(props32Table[value]);
|
||||
return GET_CATEGORY(value);
|
||||
}
|
||||
|
||||
static UBool U_CALLCONV
|
||||
|
@ -695,114 +635,82 @@ u_isJavaIDPart(UChar32 c) {
|
|||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_charDigitValue(UChar32 c) {
|
||||
uint32_t props, numericType;
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
numericType=GET_NUMERIC_TYPE(props);
|
||||
|
||||
if(numericType==1) {
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
return GET_SIGNED_VALUE(props);
|
||||
} else {
|
||||
const uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
uint32_t firstExceptionValue=*pe;
|
||||
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_NUMERIC_VALUE)) {
|
||||
int i=EXC_NUMERIC_VALUE;
|
||||
++pe;
|
||||
ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
|
||||
return (int32_t)*pe;
|
||||
}
|
||||
}
|
||||
if(GET_NUMERIC_TYPE(props)==1) {
|
||||
return GET_NUMERIC_VALUE(props);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
U_CAPI double U_EXPORT2
|
||||
u_getNumericValue(UChar32 c) {
|
||||
uint32_t props, numericType;
|
||||
uint32_t props, numericType, numericValue;
|
||||
GET_PROPS(c, props);
|
||||
numericType=GET_NUMERIC_TYPE(props);
|
||||
|
||||
if(numericType==0 || numericType>=(int32_t)U_NT_COUNT) {
|
||||
if(numericType==0 || numericType>=UPROPS_NT_COUNT) {
|
||||
return U_NO_NUMERIC_VALUE;
|
||||
} else {
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
return GET_SIGNED_VALUE(props);
|
||||
} else {
|
||||
const uint32_t *pe;
|
||||
uint32_t firstExceptionValue;
|
||||
}
|
||||
|
||||
double numValue;
|
||||
uint32_t denominator;
|
||||
numericValue=GET_NUMERIC_VALUE(props);
|
||||
|
||||
pe=GET_EXCEPTIONS(props);
|
||||
firstExceptionValue=*pe++;
|
||||
if(numericType<U_NT_COUNT) {
|
||||
/* normal type, the value is stored directly */
|
||||
return numericValue;
|
||||
} else if(numericType==UPROPS_NT_FRACTION) {
|
||||
/* fraction value */
|
||||
int32_t numerator;
|
||||
uint32_t denominator;
|
||||
|
||||
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_NUMERIC_VALUE)) {
|
||||
uint32_t flags=firstExceptionValue;
|
||||
int i=EXC_NUMERIC_VALUE;
|
||||
const uint32_t *p=pe;
|
||||
int32_t numerator;
|
||||
numerator=(int32_t)numericValue>>UPROPS_FRACTION_NUM_SHIFT;
|
||||
denominator=(numericValue&UPROPS_FRACTION_DEN_MASK)+UPROPS_FRACTION_DEN_OFFSET;
|
||||
|
||||
ADD_EXCEPTION_OFFSET(flags, i, p);
|
||||
numerator=(int32_t)*p;
|
||||
|
||||
/*
|
||||
* There are special values for huge numbers that are powers of ten.
|
||||
* genprops/store.c documents:
|
||||
* if numericValue=0x7fffff00+x then numericValue=10^x
|
||||
*/
|
||||
if(numerator<0x7fffff00) {
|
||||
numValue=(double)numerator;
|
||||
} else {
|
||||
numerator&=0xff;
|
||||
|
||||
/* 10^x without math.h */
|
||||
numValue=1.;
|
||||
while(numerator>=4) {
|
||||
numValue*=10000.;
|
||||
numerator-=4;
|
||||
}
|
||||
switch(numerator) {
|
||||
case 3:
|
||||
numValue*=1000.;
|
||||
break;
|
||||
case 2:
|
||||
numValue*=100.;
|
||||
break;
|
||||
case 1:
|
||||
numValue*=10.;
|
||||
break;
|
||||
case 0:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
numValue=0.;
|
||||
}
|
||||
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_DENOMINATOR_VALUE)) {
|
||||
uint32_t flags=firstExceptionValue;
|
||||
int i=EXC_DENOMINATOR_VALUE;
|
||||
const uint32_t *p=pe;
|
||||
ADD_EXCEPTION_OFFSET(flags, i, p);
|
||||
denominator=*p;
|
||||
} else {
|
||||
denominator=0;
|
||||
}
|
||||
|
||||
switch(firstExceptionValue&((1UL<<EXC_NUMERIC_VALUE)|(1UL<<EXC_DENOMINATOR_VALUE))) {
|
||||
case 1UL<<EXC_NUMERIC_VALUE:
|
||||
return numValue;
|
||||
case 1UL<<EXC_DENOMINATOR_VALUE:
|
||||
return (double)1./(double)denominator;
|
||||
case (1UL<<EXC_NUMERIC_VALUE)|(1UL<<EXC_DENOMINATOR_VALUE):
|
||||
return numValue/(double)denominator;
|
||||
case 0: /* none (should not occur with numericType>0) */
|
||||
default:
|
||||
return U_NO_NUMERIC_VALUE;
|
||||
}
|
||||
if(numerator==0) {
|
||||
numerator=-1;
|
||||
}
|
||||
return (double)numerator/(double)denominator;
|
||||
} else /* numericType==UPROPS_NT_LARGE */ {
|
||||
/* large value with exponent */
|
||||
double numValue;
|
||||
int32_t mant, exp;
|
||||
|
||||
mant=(int32_t)numericValue>>UPROPS_LARGE_MANT_SHIFT;
|
||||
exp=(int32_t)numericValue&UPROPS_LARGE_EXP_MASK;
|
||||
if(mant==0) {
|
||||
mant=1;
|
||||
exp+=UPROPS_LARGE_EXP_OFFSET_EXTRA;
|
||||
} else if(mant>9) {
|
||||
return U_NO_NUMERIC_VALUE; /* reserved mantissa value */
|
||||
} else {
|
||||
exp+=UPROPS_LARGE_EXP_OFFSET;
|
||||
}
|
||||
|
||||
numValue=mant;
|
||||
|
||||
/* multiply by 10^exp without math.h */
|
||||
while(exp>=4) {
|
||||
numValue*=10000.;
|
||||
exp-=4;
|
||||
}
|
||||
switch(exp) {
|
||||
case 3:
|
||||
numValue*=1000.;
|
||||
break;
|
||||
case 2:
|
||||
numValue*=100.;
|
||||
break;
|
||||
case 1:
|
||||
numValue*=10.;
|
||||
break;
|
||||
case 0:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return numValue;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -866,7 +774,6 @@ u_getUnicodeProperties(UChar32 c, int32_t column) {
|
|||
GET_PROPS(c, props);
|
||||
return props;
|
||||
} else if( !HAVE_DATA || countPropsVectors==0 ||
|
||||
(uint32_t)c>0x10ffff ||
|
||||
column<0 || column>=propsVectorsColumns
|
||||
) {
|
||||
return 0;
|
||||
|
@ -1069,18 +976,6 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
|
|||
USET_ADD_CP_AND_NEXT(sa, FIGURESP);
|
||||
USET_ADD_CP_AND_NEXT(sa, NNBSP);
|
||||
|
||||
/* add for u_charDigitValue() */
|
||||
USET_ADD_CP_AND_NEXT(sa, 0x3007);
|
||||
USET_ADD_CP_AND_NEXT(sa, 0x4e00);
|
||||
USET_ADD_CP_AND_NEXT(sa, 0x4e8c);
|
||||
USET_ADD_CP_AND_NEXT(sa, 0x4e09);
|
||||
USET_ADD_CP_AND_NEXT(sa, 0x56db);
|
||||
USET_ADD_CP_AND_NEXT(sa, 0x4e94);
|
||||
USET_ADD_CP_AND_NEXT(sa, 0x516d);
|
||||
USET_ADD_CP_AND_NEXT(sa, 0x4e03);
|
||||
USET_ADD_CP_AND_NEXT(sa, 0x516b);
|
||||
USET_ADD_CP_AND_NEXT(sa, 0x4e5d);
|
||||
|
||||
/* add for u_digit() */
|
||||
sa->add(sa->set, U_a);
|
||||
sa->add(sa->set, U_z+1);
|
||||
|
@ -1096,8 +991,4 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
|
|||
|
||||
/* add for UCHAR_GRAPHEME_BASE and others */
|
||||
USET_ADD_CP_AND_NEXT(sa, CGJ);
|
||||
|
||||
/* add for UCHAR_JOINING_TYPE */
|
||||
sa->add(sa->set, ZWNJ); /* range ZWNJ..ZWJ */
|
||||
sa->add(sa->set, ZWJ+1);
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2004, International Business Machines
|
||||
* Copyright (C) 2002-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -14,7 +14,7 @@
|
|||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Constants for mostly non-core Unicode character properties
|
||||
* stored in uprops.dat.
|
||||
* stored in uprops.icu.
|
||||
*/
|
||||
|
||||
#ifndef __UPROPS_H__
|
||||
|
@ -23,7 +23,6 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "uset_imp.h"
|
||||
#include "ucase.h"
|
||||
#include "udataswp.h"
|
||||
|
||||
/* indexes[] entries */
|
||||
|
@ -49,42 +48,44 @@ enum {
|
|||
/* definitions for the main properties words */
|
||||
enum {
|
||||
/* general category shift==0 0 (5 bits) */
|
||||
UPROPS_EXCEPTION_SHIFT=5, /* 5 (1 bit) */
|
||||
UPROPS_BIDI_SHIFT, /* 6 (5 bits) */
|
||||
UPROPS_MIRROR_SHIFT=UPROPS_BIDI_SHIFT+5, /* 11 (1 bit) */
|
||||
UPROPS_NUMERIC_TYPE_SHIFT, /* 12 (3 bits) */
|
||||
UPROPS_CASE_SENSITIVE_SHIFT=UPROPS_NUMERIC_TYPE_SHIFT+3,/* 15 (1 bit) format version 3.2 */
|
||||
UPROPS_RESERVED_SHIFT, /* 16 (4 bits) */
|
||||
UPROPS_VALUE_SHIFT=20, /* 20 */
|
||||
|
||||
UPROPS_EXCEPTION_BIT=1UL<<UPROPS_EXCEPTION_SHIFT,
|
||||
UPROPS_VALUE_BITS=32-UPROPS_VALUE_SHIFT,
|
||||
|
||||
UPROPS_MIN_VALUE=-(1L<<(UPROPS_VALUE_BITS-1)),
|
||||
UPROPS_MAX_VALUE=(1L<<(UPROPS_VALUE_BITS-1))-1,
|
||||
UPROPS_MAX_EXCEPTIONS_COUNT=1L<<UPROPS_VALUE_BITS
|
||||
UPROPS_NUMERIC_TYPE_SHIFT=5, /* 5 (3 bits) */
|
||||
UPROPS_NUMERIC_VALUE_SHIFT=8 /* 8 (8 bits) */
|
||||
};
|
||||
|
||||
#define PROPS_VALUE_IS_EXCEPTION(props) ((props)&UPROPS_EXCEPTION_BIT)
|
||||
#define GET_CATEGORY(props) ((props)&0x1f)
|
||||
#define GET_BIDI_CLASS(props) ((props>>UPROPS_BIDI_SHIFT)&0x1f)
|
||||
#define GET_NUMERIC_TYPE(props) (((props)>>UPROPS_NUMERIC_TYPE_SHIFT)&7)
|
||||
#define GET_UNSIGNED_VALUE(props) ((props)>>UPROPS_VALUE_SHIFT)
|
||||
#define GET_SIGNED_VALUE(props) ((int32_t)(props)>>UPROPS_VALUE_SHIFT)
|
||||
#define GET_EXCEPTIONS(props) (exceptionsTable+GET_UNSIGNED_VALUE(props))
|
||||
|
||||
#define CAT_MASK(props) U_MASK(GET_CATEGORY(props))
|
||||
|
||||
#define GET_NUMERIC_TYPE(props) (((props)>>UPROPS_NUMERIC_TYPE_SHIFT)&7)
|
||||
#define GET_NUMERIC_VALUE(props) (((props)>>UPROPS_NUMERIC_VALUE_SHIFT)&0xff)
|
||||
|
||||
/* internal numeric pseudo-types for special encodings of numeric values */
|
||||
enum {
|
||||
EXC_UPPERCASE,
|
||||
EXC_LOWERCASE,
|
||||
EXC_TITLECASE,
|
||||
EXC_UNUSED,
|
||||
EXC_NUMERIC_VALUE,
|
||||
EXC_DENOMINATOR_VALUE,
|
||||
EXC_MIRROR_MAPPING,
|
||||
EXC_SPECIAL_CASING,
|
||||
EXC_CASE_FOLDING
|
||||
UPROPS_NT_FRACTION=4, /* ==U_NT_COUNT, must not change unless binary format version changes */
|
||||
UPROPS_NT_LARGE,
|
||||
UPROPS_NT_COUNT
|
||||
};
|
||||
|
||||
/* encoding of fractional and large numbers */
|
||||
enum {
|
||||
UPROPS_MAX_SMALL_NUMBER=0xff,
|
||||
|
||||
UPROPS_FRACTION_NUM_SHIFT=3, /* numerator: bits 7..3 */
|
||||
UPROPS_FRACTION_DEN_MASK=7, /* denominator: bits 2..0 */
|
||||
|
||||
UPROPS_FRACTION_MAX_NUM=31,
|
||||
UPROPS_FRACTION_DEN_OFFSET=2, /* denominator values are 2..9 */
|
||||
|
||||
UPROPS_FRACTION_MIN_DEN=UPROPS_FRACTION_DEN_OFFSET,
|
||||
UPROPS_FRACTION_MAX_DEN=UPROPS_FRACTION_MIN_DEN+UPROPS_FRACTION_DEN_MASK,
|
||||
|
||||
UPROPS_LARGE_MANT_SHIFT=4, /* mantissa: bits 7..4 */
|
||||
UPROPS_LARGE_EXP_MASK=0xf, /* exponent: bits 3..0 */
|
||||
UPROPS_LARGE_EXP_OFFSET=2, /* regular exponents 2..17 */
|
||||
UPROPS_LARGE_EXP_OFFSET_EXTRA=18, /* extra large exponents 18..33 */
|
||||
|
||||
UPROPS_LARGE_MIN_EXP=UPROPS_LARGE_EXP_OFFSET,
|
||||
UPROPS_LARGE_MAX_EXP=UPROPS_LARGE_MIN_EXP+UPROPS_LARGE_EXP_MASK,
|
||||
UPROPS_LARGE_MAX_EXP_EXTRA=UPROPS_LARGE_EXP_OFFSET_EXTRA+UPROPS_LARGE_EXP_MASK
|
||||
};
|
||||
|
||||
/* number of properties vector words */
|
||||
|
@ -129,8 +130,8 @@ enum {
|
|||
*/
|
||||
enum {
|
||||
UPROPS_WHITE_SPACE,
|
||||
UPROPS_BIDI_CONTROL,
|
||||
UPROPS_JOIN_CONTROL,
|
||||
UPROPS_WAS_BIDI_CONTROL, /* reserved, was used in format version 3 */
|
||||
UPROPS_WAS_JOIN_CONTROL,
|
||||
UPROPS_DASH,
|
||||
UPROPS_HYPHEN,
|
||||
UPROPS_QUOTATION_MARK,
|
||||
|
@ -142,8 +143,8 @@ enum {
|
|||
UPROPS_IDEOGRAPHIC,
|
||||
UPROPS_DIACRITIC,
|
||||
UPROPS_EXTENDER,
|
||||
UPROPS_LOWERCASE,
|
||||
UPROPS_UPPERCASE,
|
||||
UPROPS_WAS_LOWERCASE, /* reserved, was used in format version 3 */
|
||||
UPROPS_WAS_UPPERCASE,
|
||||
UPROPS_NONCHARACTER_CODE_POINT,
|
||||
UPROPS_GRAPHEME_EXTEND,
|
||||
UPROPS_GRAPHEME_LINK,
|
||||
|
@ -153,7 +154,7 @@ enum {
|
|||
UPROPS_UNIFIED_IDEOGRAPH,
|
||||
UPROPS_DEFAULT_IGNORABLE_CODE_POINT,
|
||||
UPROPS_DEPRECATED,
|
||||
UPROPS_SOFT_DOTTED,
|
||||
UPROPS_WAS_SOFT_DOTTED, /* reserved, was used in format version 3 */
|
||||
UPROPS_LOGICAL_ORDER_EXCEPTION,
|
||||
UPROPS_XID_START,
|
||||
UPROPS_XID_CONTINUE,
|
||||
|
@ -167,15 +168,15 @@ enum {
|
|||
* Properties in vector word 2
|
||||
* Bits
|
||||
* 31..24 More binary properties
|
||||
* 13..11 Joining Type
|
||||
* 10.. 5 Joining Group
|
||||
* 13..11 reserved, was Joining Type in format version 3
|
||||
* 10.. 5 reserved, was Joining Group in format version 3
|
||||
* 4.. 0 Decomposition Type
|
||||
*/
|
||||
#define UPROPS_JT_MASK 0x00003800
|
||||
#define UPROPS_JT_SHIFT 11
|
||||
#define UPROPS_WAS_JT_MASK 0x00003800
|
||||
#define UPROPS_WAS_JT_SHIFT 11
|
||||
|
||||
#define UPROPS_JG_MASK 0x000007e0
|
||||
#define UPROPS_JG_SHIFT 5
|
||||
#define UPROPS_WAS_JG_MASK 0x000007e0
|
||||
#define UPROPS_WAS_JG_SHIFT 5
|
||||
|
||||
#define UPROPS_DT_MASK 0x0000001f
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2003, International Business Machines
|
||||
* Copyright (C) 1999-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -24,7 +24,6 @@
|
|||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/uclean.h"
|
||||
#include "cmemory.h"
|
||||
|
@ -43,31 +42,13 @@ U_CDECL_END
|
|||
|
||||
UBool beVerbose=FALSE, haveCopyright=TRUE;
|
||||
|
||||
/*
|
||||
* Unicode set collecting the case-sensitive characters;
|
||||
* see uchar.h UCHAR_CASE_SENSITIVE.
|
||||
* Add code points from case mappings/foldings in
|
||||
* the root locale and with default options.
|
||||
*/
|
||||
static USet *caseSensitive;
|
||||
|
||||
/* prototypes --------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
parseBidiMirroring(const char *filename, UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
parseDB(const char *filename, UErrorCode *pErrorCode);
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
enum
|
||||
{
|
||||
HELP_H,
|
||||
|
@ -174,19 +155,6 @@ main(int argc, char* argv[]) {
|
|||
|
||||
/* initialize */
|
||||
initStore();
|
||||
caseSensitive=uset_open(1, 0); /* empty set (start>end) */
|
||||
|
||||
/* process BidiMirroring.txt */
|
||||
writeUCDFilename(basename, "BidiMirroring", suffix);
|
||||
parseBidiMirroring(filename, &errorCode);
|
||||
|
||||
/* process SpecialCasing.txt */
|
||||
writeUCDFilename(basename, "SpecialCasing", suffix);
|
||||
parseSpecialCasing(filename, &errorCode);
|
||||
|
||||
/* process CaseFolding.txt */
|
||||
writeUCDFilename(basename, "CaseFolding", suffix);
|
||||
parseCaseFolding(filename, &errorCode);
|
||||
|
||||
/* process UnicodeData.txt */
|
||||
writeUCDFilename(basename, "UnicodeData", suffix);
|
||||
|
@ -202,6 +170,7 @@ main(int argc, char* argv[]) {
|
|||
generateData(destDir);
|
||||
}
|
||||
|
||||
exitStore();
|
||||
u_cleanup();
|
||||
return errorCode;
|
||||
}
|
||||
|
@ -270,301 +239,6 @@ getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
|
|||
return -1;
|
||||
}
|
||||
|
||||
static void
|
||||
_set_addAll(USet *set, const UChar *s, int32_t length) {
|
||||
UChar32 c;
|
||||
int32_t i;
|
||||
|
||||
/* needs length>=0 */
|
||||
for(i=0; i<length; /* U16_NEXT advances i */) {
|
||||
U16_NEXT(s, i, length, c);
|
||||
uset_add(set, c);
|
||||
}
|
||||
}
|
||||
|
||||
/* parser for BidiMirroring.txt --------------------------------------------- */
|
||||
|
||||
#define MAX_MIRROR_COUNT 2000
|
||||
|
||||
static uint32_t mirrorMappings[MAX_MIRROR_COUNT][2];
|
||||
static int32_t mirrorCount=0;
|
||||
|
||||
static void U_CALLCONV
|
||||
mirrorLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
char *end;
|
||||
static uint32_t prevCode=0;
|
||||
|
||||
mirrorMappings[mirrorCount][0]=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
|
||||
if(end<=fields[0][0] || end!=fields[0][1]) {
|
||||
fprintf(stderr, "genprops: syntax error in BidiMirroring.txt field 0 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
mirrorMappings[mirrorCount][1]=(uint32_t)uprv_strtoul(fields[1][0], &end, 16);
|
||||
if(end<=fields[1][0] || end!=fields[1][1]) {
|
||||
fprintf(stderr, "genprops: syntax error in BidiMirroring.txt field 1 at %s\n", fields[1][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* check that the code points (mirrorMappings[mirrorCount][0]) are in ascending order */
|
||||
if(mirrorMappings[mirrorCount][0]<=prevCode && mirrorMappings[mirrorCount][0]>0) {
|
||||
fprintf(stderr, "genprops: error - BidiMirroring entries out of order, U+%04lx after U+%04lx\n",
|
||||
(unsigned long)mirrorMappings[mirrorCount][0],
|
||||
(unsigned long)prevCode);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
prevCode=mirrorMappings[mirrorCount][0];
|
||||
|
||||
if(++mirrorCount==MAX_MIRROR_COUNT) {
|
||||
fprintf(stderr, "genprops: too many mirror mappings\n");
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseBidiMirroring(const char *filename, UErrorCode *pErrorCode) {
|
||||
char *fields[2][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 2, mirrorLineFn, NULL, pErrorCode);
|
||||
}
|
||||
|
||||
/* parser for SpecialCasing.txt --------------------------------------------- */
|
||||
|
||||
#define MAX_SPECIAL_CASING_COUNT 500
|
||||
|
||||
static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
|
||||
static int32_t specialCasingCount=0;
|
||||
|
||||
static void U_CALLCONV
|
||||
specialCasingLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
char *end;
|
||||
|
||||
/* get code point */
|
||||
specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
|
||||
end=(char *)u_skipWhitespace(end);
|
||||
if(end<=fields[0][0] || end!=fields[0][1]) {
|
||||
fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* is this a complex mapping? */
|
||||
if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
|
||||
/* there is some condition text in the fifth field */
|
||||
specialCasings[specialCasingCount].isComplex=TRUE;
|
||||
|
||||
/* do not store any actual mappings for this */
|
||||
specialCasings[specialCasingCount].lowerCase[0]=0;
|
||||
specialCasings[specialCasingCount].upperCase[0]=0;
|
||||
specialCasings[specialCasingCount].titleCase[0]=0;
|
||||
} else {
|
||||
/* just set the "complex" flag and get the case mappings */
|
||||
specialCasings[specialCasingCount].isComplex=FALSE;
|
||||
specialCasings[specialCasingCount].lowerCase[0]=
|
||||
(UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
|
||||
specialCasings[specialCasingCount].upperCase[0]=
|
||||
(UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
|
||||
specialCasings[specialCasingCount].titleCase[0]=
|
||||
(UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genprops: error parsing special casing at %s\n", fields[0][0]);
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
|
||||
_set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
|
||||
_set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
|
||||
_set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
|
||||
}
|
||||
|
||||
if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
|
||||
fprintf(stderr, "genprops: too many special casing mappings\n");
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
compareSpecialCasings(const void *left, const void *right) {
|
||||
return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
|
||||
}
|
||||
|
||||
static void
|
||||
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
|
||||
char *fields[5][2];
|
||||
int32_t i, j;
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
|
||||
|
||||
/* sort the special casing entries by code point */
|
||||
if(specialCasingCount>0) {
|
||||
qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings);
|
||||
}
|
||||
|
||||
/* replace multiple entries for any code point by one "complex" one */
|
||||
j=0;
|
||||
for(i=1; i<specialCasingCount; ++i) {
|
||||
if(specialCasings[i-1].code==specialCasings[i].code) {
|
||||
/* there is a duplicate code point */
|
||||
specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following qsort */
|
||||
specialCasings[i].isComplex=TRUE; /* make the following one complex */
|
||||
specialCasings[i].lowerCase[0]=0;
|
||||
specialCasings[i].upperCase[0]=0;
|
||||
specialCasings[i].titleCase[0]=0;
|
||||
++j;
|
||||
}
|
||||
}
|
||||
|
||||
/* if some entries just were removed, then re-sort */
|
||||
if(j>0) {
|
||||
qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings);
|
||||
specialCasingCount-=j;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add one complex mapping to caseSensitive that was filtered out above:
|
||||
* Greek final Sigma has a conditional mapping but not locale-sensitive,
|
||||
* and it is taken when lowercasing just U+03A3 alone.
|
||||
* 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
|
||||
*/
|
||||
uset_add(caseSensitive, 0x3c2);
|
||||
}
|
||||
|
||||
/* parser for CaseFolding.txt ----------------------------------------------- */
|
||||
|
||||
#define MAX_CASE_FOLDING_COUNT 2000
|
||||
|
||||
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
|
||||
static int32_t caseFoldingCount=0;
|
||||
|
||||
static void U_CALLCONV
|
||||
caseFoldingLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
char *end;
|
||||
static uint32_t prevCode=0;
|
||||
int32_t count;
|
||||
char status;
|
||||
|
||||
/* get code point */
|
||||
caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
|
||||
end=(char *)u_skipWhitespace(end);
|
||||
if(end<=fields[0][0] || end!=fields[0][1]) {
|
||||
fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* get the status of this mapping */
|
||||
caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
|
||||
if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
|
||||
fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
|
||||
if(status=='L') {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the mapping */
|
||||
count=caseFoldings[caseFoldingCount].full[0]=
|
||||
(UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, &caseFoldings[caseFoldingCount].simple, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genprops: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
/* there is a simple mapping only if there is exactly one code point (count is in UChars) */
|
||||
if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
|
||||
caseFoldings[caseFoldingCount].simple=0;
|
||||
}
|
||||
|
||||
/* update the case-sensitive set */
|
||||
if(status!='T') {
|
||||
uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
|
||||
_set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
|
||||
}
|
||||
|
||||
/* check the status */
|
||||
if(status=='S') {
|
||||
/* check if there was a full mapping for this code point before */
|
||||
if( caseFoldingCount>0 &&
|
||||
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
|
||||
caseFoldings[caseFoldingCount-1].status=='F'
|
||||
) {
|
||||
/* merge the two entries */
|
||||
caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
|
||||
return;
|
||||
}
|
||||
} else if(status=='F') {
|
||||
/* check if there was a simple mapping for this code point before */
|
||||
if( caseFoldingCount>0 &&
|
||||
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
|
||||
caseFoldings[caseFoldingCount-1].status=='S'
|
||||
) {
|
||||
/* merge the two entries */
|
||||
uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
|
||||
return;
|
||||
}
|
||||
} else if(status=='I' || status=='T') {
|
||||
/* check if there was a default mapping for this code point before (remove it) */
|
||||
while(caseFoldingCount>0 &&
|
||||
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
|
||||
) {
|
||||
prevCode=0;
|
||||
--caseFoldingCount;
|
||||
}
|
||||
/* store only a marker for special handling for cases like dotless i */
|
||||
caseFoldings[caseFoldingCount].simple=0;
|
||||
caseFoldings[caseFoldingCount].full[0]=0;
|
||||
}
|
||||
|
||||
/* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
|
||||
if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
|
||||
fprintf(stderr, "genprops: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
|
||||
(unsigned long)caseFoldings[caseFoldingCount].code,
|
||||
(unsigned long)prevCode);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
prevCode=caseFoldings[caseFoldingCount].code;
|
||||
|
||||
if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
|
||||
fprintf(stderr, "genprops: too many case folding mappings\n");
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
|
||||
char *fields[3][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
|
||||
}
|
||||
|
||||
/* parser for UnicodeData.txt ----------------------------------------------- */
|
||||
|
||||
/* general categories */
|
||||
|
@ -580,12 +254,6 @@ genCategoryNames[U_CHAR_CATEGORY_COUNT]={
|
|||
"Pi", "Pf"
|
||||
};
|
||||
|
||||
const char *const
|
||||
bidiNames[U_CHAR_DIRECTION_COUNT]={
|
||||
"L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S",
|
||||
"WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
|
||||
};
|
||||
|
||||
const char *const
|
||||
decompositionTypeNames[U_DT_COUNT]={
|
||||
NULL,
|
||||
|
@ -613,7 +281,7 @@ static struct {
|
|||
char name[80];
|
||||
} unicodeAreas[32];
|
||||
|
||||
static int32_t unicodeAreaIndex=0, mirrorIndex=0, specialCasingIndex=0, caseFoldingIndex=0;
|
||||
static int32_t unicodeAreaIndex=0;
|
||||
|
||||
static void U_CALLCONV
|
||||
unicodeDataLineFn(void *context,
|
||||
|
@ -647,17 +315,6 @@ unicodeDataLineFn(void *context,
|
|||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* get BiDi category, field 4 */
|
||||
i=getTokenIndex(bidiNames, U_CHAR_DIRECTION_COUNT, fields[4][0]);
|
||||
if(i>=0) {
|
||||
p.bidi=(uint8_t)i;
|
||||
} else {
|
||||
fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n",
|
||||
fields[4][0], (unsigned long)p.code);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* get decomposition type, field 5 */
|
||||
if(fields[5][0]<fields[5][1]) {
|
||||
/* there is some decomposition */
|
||||
|
@ -771,80 +428,6 @@ unicodeDataLineFn(void *context,
|
|||
}
|
||||
}
|
||||
|
||||
/* get Mirrored flag, field 9 */
|
||||
if(*fields[9][0]=='Y') {
|
||||
p.isMirrored=1;
|
||||
} else if(fields[9][1]-fields[9][0]!=1 || *fields[9][0]!='N') {
|
||||
fprintf(stderr, "genprops: syntax error in field 9 at code 0x%lx\n",
|
||||
(unsigned long)p.code);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* get uppercase mapping, field 12 */
|
||||
value=(uint32_t)uprv_strtoul(fields[12][0], &end, 16);
|
||||
if(end!=fields[12][1]) {
|
||||
fprintf(stderr, "genprops: syntax error in field 12 at code 0x%lx\n",
|
||||
(unsigned long)p.code);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
if(value!=0 && value!=p.code) {
|
||||
p.upperCase=value;
|
||||
uset_add(caseSensitive, (UChar32)p.code);
|
||||
uset_add(caseSensitive, (UChar32)value);
|
||||
}
|
||||
|
||||
/* get lowercase value, field 13 */
|
||||
value=(uint32_t)uprv_strtoul(fields[13][0], &end, 16);
|
||||
if(end!=fields[13][1]) {
|
||||
fprintf(stderr, "genprops: syntax error in field 13 at code 0x%lx\n",
|
||||
(unsigned long)p.code);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
if(value!=0 && value!=p.code) {
|
||||
p.lowerCase=value;
|
||||
uset_add(caseSensitive, (UChar32)p.code);
|
||||
uset_add(caseSensitive, (UChar32)value);
|
||||
}
|
||||
|
||||
/* get titlecase value, field 14 */
|
||||
value=(uint32_t)uprv_strtoul(fields[14][0], &end, 16);
|
||||
if(end!=fields[14][1]) {
|
||||
fprintf(stderr, "genprops: syntax error in field 14 at code 0x%lx\n",
|
||||
(unsigned long)p.code);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
if(value!=0 && value!=p.code) {
|
||||
p.titleCase=value;
|
||||
uset_add(caseSensitive, (UChar32)p.code);
|
||||
uset_add(caseSensitive, (UChar32)value);
|
||||
}
|
||||
|
||||
/* set additional properties from previously parsed files */
|
||||
if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) {
|
||||
p.mirrorMapping=mirrorMappings[mirrorIndex++][1];
|
||||
}
|
||||
if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
|
||||
p.specialCasing=specialCasings+specialCasingIndex++;
|
||||
} else {
|
||||
p.specialCasing=NULL;
|
||||
}
|
||||
if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
|
||||
p.caseFolding=caseFoldings+caseFoldingIndex++;
|
||||
|
||||
/* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
|
||||
if( p.caseFolding->status=='C' &&
|
||||
p.caseFolding->simple==p.lowerCase
|
||||
) {
|
||||
p.caseFolding=NULL;
|
||||
}
|
||||
} else {
|
||||
p.caseFolding=NULL;
|
||||
}
|
||||
|
||||
value=makeProps(&p);
|
||||
|
||||
if(*fields[1][0]=='<') {
|
||||
|
@ -966,41 +549,12 @@ repeatAreaProps() {
|
|||
|
||||
static void
|
||||
parseDB(const char *filename, UErrorCode *pErrorCode) {
|
||||
/* default Bidi classes for unassigned code points */
|
||||
static const uint32_t defaultBidi[][2]={ /* { limit, class } */
|
||||
{ 0x0590, U_LEFT_TO_RIGHT },
|
||||
{ 0x0600, U_RIGHT_TO_LEFT },
|
||||
{ 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
|
||||
{ 0xFB1D, U_LEFT_TO_RIGHT },
|
||||
{ 0xFB50, U_RIGHT_TO_LEFT },
|
||||
{ 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
|
||||
{ 0xFE70, U_LEFT_TO_RIGHT },
|
||||
{ 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
|
||||
{ 0x110000, U_LEFT_TO_RIGHT }
|
||||
};
|
||||
|
||||
char *fields[15][2];
|
||||
UChar32 start, end;
|
||||
uint32_t prev;
|
||||
int32_t i;
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set default Bidi classes for unassigned code points.
|
||||
* See table 3-7 "Bidirectional Character Types" in UAX #9.
|
||||
* http://www.unicode.org/reports/tr9/
|
||||
*/
|
||||
prev=0;
|
||||
for(i=0; i<LENGTHOF(defaultBidi); ++i) {
|
||||
if(defaultBidi[i][1]!=0) {
|
||||
repeatProps(prev, defaultBidi[i][0]-1, defaultBidi[i][1]<<UPROPS_BIDI_SHIFT);
|
||||
}
|
||||
prev=defaultBidi[i][0];
|
||||
}
|
||||
|
||||
/* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
|
||||
unicodeAreas[0].first=0xffffffff;
|
||||
|
||||
|
@ -1016,36 +570,9 @@ parseDB(const char *filename, UErrorCode *pErrorCode) {
|
|||
|
||||
repeatAreaProps();
|
||||
|
||||
/* are all sub-properties consumed? */
|
||||
if(mirrorIndex<mirrorCount) {
|
||||
fprintf(stderr, "genprops: error - some code points in BidiMirroring.txt are missing from UnicodeData.txt\n");
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
if(specialCasingIndex<specialCasingCount) {
|
||||
fprintf(stderr, "genprops: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
if(caseFoldingIndex<caseFoldingCount) {
|
||||
fprintf(stderr, "genprops: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for(i=0;
|
||||
0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
|
||||
++i
|
||||
) {
|
||||
addCaseSensitive(start, end);
|
||||
}
|
||||
if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1056,4 +583,3 @@ parseDB(const char *filename, UErrorCode *pErrorCode) {
|
|||
* End:
|
||||
*
|
||||
*/
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2004, International Business Machines
|
||||
* Copyright (C) 1999-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -24,37 +24,17 @@
|
|||
#define DATA_NAME "uprops"
|
||||
#define DATA_TYPE "icu"
|
||||
|
||||
/* special casing data */
|
||||
typedef struct {
|
||||
uint32_t code;
|
||||
UBool isComplex;
|
||||
UChar lowerCase[32], upperCase[32], titleCase[32];
|
||||
} SpecialCasing;
|
||||
|
||||
/* case folding data */
|
||||
typedef struct {
|
||||
uint32_t code, simple;
|
||||
char status;
|
||||
UChar full[32];
|
||||
} CaseFolding;
|
||||
|
||||
/* character properties */
|
||||
typedef struct {
|
||||
uint32_t code, lowerCase, upperCase, titleCase, mirrorMapping;
|
||||
uint32_t code;
|
||||
int32_t numericValue; /* see numericType */
|
||||
uint32_t denominator; /* 0: no value */
|
||||
uint8_t generalCategory, bidi, isMirrored, numericType;
|
||||
SpecialCasing *specialCasing;
|
||||
CaseFolding *caseFolding;
|
||||
uint8_t generalCategory, numericType, exponent;
|
||||
} Props;
|
||||
|
||||
/* global flags */
|
||||
extern UBool beVerbose, haveCopyright;
|
||||
|
||||
/* name tables */
|
||||
extern const char *const
|
||||
bidiNames[];
|
||||
|
||||
extern const char *const
|
||||
genCategoryNames[];
|
||||
|
||||
|
@ -77,6 +57,9 @@ setUnicodeVersion(const char *v);
|
|||
extern void
|
||||
initStore(void);
|
||||
|
||||
extern void
|
||||
exitStore();
|
||||
|
||||
extern uint32_t
|
||||
makeProps(Props *p);
|
||||
|
||||
|
@ -89,12 +72,6 @@ getProps(uint32_t c);
|
|||
extern void
|
||||
repeatProps(uint32_t first, uint32_t last, uint32_t props);
|
||||
|
||||
U_CFUNC uint32_t U_EXPORT2
|
||||
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset);
|
||||
|
||||
extern void
|
||||
addCaseSensitive(UChar32 first, UChar32 last);
|
||||
|
||||
extern void
|
||||
generateData(const char *dataDir);
|
||||
|
||||
|
@ -102,6 +79,9 @@ generateData(const char *dataDir);
|
|||
U_CFUNC void
|
||||
initAdditionalProperties(void);
|
||||
|
||||
U_CFUNC void
|
||||
exitAdditionalProperties();
|
||||
|
||||
U_CFUNC void
|
||||
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode);
|
||||
|
||||
|
@ -109,4 +89,3 @@ U_CFUNC int32_t
|
|||
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[16]);
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2004, International Business Machines
|
||||
* Copyright (C) 2002-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -102,11 +102,6 @@ numericLineFn(void *context,
|
|||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
static void U_CALLCONV
|
||||
bidiClassLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/* parse files with single enumerated properties ---------------------------- */
|
||||
|
||||
struct SingleEnum {
|
||||
|
@ -146,18 +141,6 @@ static const SingleEnum eawSingleEnum={
|
|||
0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
|
||||
};
|
||||
|
||||
static const SingleEnum jtSingleEnum={
|
||||
"DerivedJoiningType", "joining type",
|
||||
UCHAR_JOINING_TYPE,
|
||||
2, UPROPS_JT_SHIFT, UPROPS_JT_MASK
|
||||
};
|
||||
|
||||
static const SingleEnum jgSingleEnum={
|
||||
"DerivedJoiningGroup", "joining group",
|
||||
UCHAR_JOINING_GROUP,
|
||||
2, UPROPS_JG_SHIFT, UPROPS_JG_MASK
|
||||
};
|
||||
|
||||
static void U_CALLCONV
|
||||
singleEnumLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
|
@ -246,8 +229,6 @@ typedef struct Binaries Binaries;
|
|||
static const Binary
|
||||
propListNames[]={
|
||||
{ "White_Space", 1, UPROPS_WHITE_SPACE },
|
||||
{ "Bidi_Control", 1, UPROPS_BIDI_CONTROL },
|
||||
{ "Join_Control", 1, UPROPS_JOIN_CONTROL },
|
||||
{ "Dash", 1, UPROPS_DASH },
|
||||
{ "Hyphen", 1, UPROPS_HYPHEN },
|
||||
{ "Quotation_Mark", 1, UPROPS_QUOTATION_MARK },
|
||||
|
@ -264,7 +245,6 @@ propListNames[]={
|
|||
{ "Radical", 1, UPROPS_RADICAL },
|
||||
{ "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH },
|
||||
{ "Deprecated", 1, UPROPS_DEPRECATED },
|
||||
{ "Soft_Dotted", 1, UPROPS_SOFT_DOTTED },
|
||||
{ "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
|
||||
|
||||
/* new properties in Unicode 4.0.1 */
|
||||
|
@ -285,8 +265,6 @@ derCorePropsNames[]={
|
|||
/* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
|
||||
{ "Math", 1, UPROPS_MATH },
|
||||
{ "Alphabetic", 1, UPROPS_ALPHABETIC },
|
||||
{ "Lowercase", 1, UPROPS_LOWERCASE },
|
||||
{ "Uppercase", 1, UPROPS_UPPERCASE },
|
||||
{ "Grapheme_Extend", 1, UPROPS_GRAPHEME_EXTEND },
|
||||
{ "Default_Ignorable_Code_Point", 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
|
||||
|
||||
|
@ -340,7 +318,9 @@ binariesLineFn(void *context,
|
|||
for(i=0;; ++i) {
|
||||
if(i==bin->binariesCount) {
|
||||
/* ignore unrecognized properties */
|
||||
addIgnoredProp(s, fields[1][1]);
|
||||
if(beVerbose) {
|
||||
addIgnoredProp(s, fields[1][1]);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if(isToken(bin->binaries[i].propName, s)) {
|
||||
|
@ -382,8 +362,10 @@ parseBinariesFile(char *filename, char *basename, const char *suffix,
|
|||
fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
|
||||
}
|
||||
|
||||
for(i=0; i<ignoredPropsCount; ++i) {
|
||||
printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
|
||||
if(beVerbose) {
|
||||
for(i=0; i<ignoredPropsCount; ++i) {
|
||||
printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -394,6 +376,12 @@ initAdditionalProperties() {
|
|||
pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
|
||||
}
|
||||
|
||||
U_CFUNC void
|
||||
exitAdditionalProperties() {
|
||||
utrie_close(trie);
|
||||
upvec_close(pv);
|
||||
}
|
||||
|
||||
U_CFUNC void
|
||||
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
|
||||
char *basename;
|
||||
|
@ -405,9 +393,6 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
|
|||
/* add Han numeric types & values */
|
||||
parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
|
||||
|
||||
/* set proper bidi class for unassigned code points (Cn) */
|
||||
parseTwoFieldFile(filename, basename, "DerivedBidiClass", suffix, bidiClassLineFn, pErrorCode);
|
||||
|
||||
parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
|
||||
|
||||
/*
|
||||
|
@ -441,10 +426,6 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
|
|||
*/
|
||||
parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);
|
||||
|
||||
parseSingleEnumFile(filename, basename, suffix, &jtSingleEnum, pErrorCode);
|
||||
|
||||
parseSingleEnumFile(filename, basename, suffix, &jgSingleEnum, pErrorCode);
|
||||
|
||||
/*
|
||||
* Preset East Asian Width defaults:
|
||||
*
|
||||
|
@ -481,7 +462,7 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
|
|||
return;
|
||||
}
|
||||
|
||||
pvCount=upvec_toTrie(pv, trie, pErrorCode);
|
||||
pvCount=upvec_compact(pv, upvec_compactToTrieHandler, trie, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
|
@ -538,7 +519,7 @@ static void U_CALLCONV
|
|||
numericLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
Props newProps;
|
||||
Props newProps={ 0 };
|
||||
char *s, *end;
|
||||
uint32_t start, limit, value, oldProps32;
|
||||
int32_t oldType;
|
||||
|
@ -575,11 +556,14 @@ numericLineFn(void *context,
|
|||
/* try large powers of 10 first, may otherwise overflow strtoul() */
|
||||
if(0==uprv_strncmp(s, "10000000000", 11)) {
|
||||
/* large powers of 10 are encoded in a special way, see store.c */
|
||||
value=0x7fffff00;
|
||||
uint8_t exp=0;
|
||||
|
||||
end=s;
|
||||
while(*(++end)=='0') {
|
||||
++value;
|
||||
++exp;
|
||||
}
|
||||
value=1;
|
||||
newProps.exponent=exp;
|
||||
} else {
|
||||
/* normal number parsing */
|
||||
value=(uint32_t)uprv_strtoul(s, &end, 10);
|
||||
|
@ -599,108 +583,51 @@ numericLineFn(void *context,
|
|||
* specific properties for single characters.
|
||||
*/
|
||||
|
||||
/* set the new numeric type and value */
|
||||
newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
|
||||
newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
|
||||
/* the exponent may have been set above */
|
||||
value=makeProps(&newProps);
|
||||
|
||||
for(; start<limit; ++start) {
|
||||
oldProps32=getProps(start);
|
||||
oldType=(int32_t)GET_NUMERIC_TYPE(oldProps32);
|
||||
if(oldType!=0) {
|
||||
/* this code point was already listed with its numeric value in UnicodeData.txt */
|
||||
continue;
|
||||
|
||||
if(isFraction) {
|
||||
if(oldType!=0) {
|
||||
/* this code point was already listed with its numeric value in UnicodeData.txt */
|
||||
continue;
|
||||
} else {
|
||||
fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Do not set a numeric value for code points that have other
|
||||
* values or exceptions because the code below is not prepared
|
||||
* to maintain such values and exceptions.
|
||||
*
|
||||
* Check store.c (e.g., file format description and makeProps())
|
||||
* for details of what code points get their value field interpreted.
|
||||
* For example, case mappings for Ll/Lt/Lu and mirror mappings for mirrored characters.
|
||||
*
|
||||
* For simplicity, and because we only expect to set numeric values for Han characters,
|
||||
* for now we only allow to set these values for Lo characters.
|
||||
*/
|
||||
if(GET_UNSIGNED_VALUE(oldProps32)!=0 || PROPS_VALUE_IS_EXCEPTION(oldProps32) || GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
|
||||
fprintf(stderr, "genprops error: new numeric value for a character with some other value in DerivedNumericValues.txt at %s\n", fields[0][0]);
|
||||
if(oldType==0 && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
|
||||
fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]);
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
if(isFraction) {
|
||||
fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
|
||||
exit(U_PARSE_ERROR);
|
||||
/* verify that we do not change an existing value (fractions were excluded above) */
|
||||
if(oldType!=0) {
|
||||
/* the code point already has a value stored */
|
||||
if((oldProps32&0xff00)!=(value&0xff00)) {
|
||||
fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start);
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
/* same value, continue */
|
||||
} else {
|
||||
/* the code point is getting a new numeric value */
|
||||
if(beVerbose) {
|
||||
printf("adding U+%04x numeric type %d value 0x%04x from %s\n", (int)start, U_NT_NUMERIC, (int)value, fields[0][0]);
|
||||
}
|
||||
|
||||
addProps(start, value|GET_CATEGORY(oldProps32));
|
||||
}
|
||||
|
||||
if(beVerbose) {
|
||||
printf("adding U+%04x numeric type %d value %u\n", (int)start, U_NT_NUMERIC, (int)value);
|
||||
}
|
||||
|
||||
/* reconstruct the properties and set the new numeric type and value */
|
||||
uprv_memset(&newProps, 0, sizeof(newProps));
|
||||
newProps.code=start;
|
||||
newProps.generalCategory=(uint8_t)GET_CATEGORY(oldProps32);
|
||||
newProps.bidi=(uint8_t)GET_BIDI_CLASS(oldProps32);
|
||||
newProps.isMirrored=(uint8_t)(oldProps32&(1UL<<UPROPS_MIRROR_SHIFT) ? TRUE : FALSE);
|
||||
newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
|
||||
newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
|
||||
addProps(start, makeProps(&newProps));
|
||||
}
|
||||
}
|
||||
|
||||
/* DerivedBidiClass.txt ----------------------------------------------------- */
|
||||
|
||||
static void U_CALLCONV
|
||||
bidiClassLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
char *s;
|
||||
uint32_t oldStart, start, limit, value, props32;
|
||||
UBool didSet;
|
||||
|
||||
/* get the code point range */
|
||||
u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genprops: syntax error in DerivedBidiClass.txt field 0 at %s\n", fields[0][0]);
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
++limit;
|
||||
|
||||
/* parse bidi class */
|
||||
s=trimTerminateField(fields[1][0], fields[1][1]);
|
||||
value=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, s);
|
||||
if((int32_t)value<0) {
|
||||
fprintf(stderr, "genprops error: unknown bidi class in DerivedBidiClass.txt field 1 at %s\n", s);
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
didSet=FALSE;
|
||||
oldStart=start;
|
||||
for(; start<limit; ++start) {
|
||||
props32=getProps(start);
|
||||
|
||||
/* ignore if this bidi class is already set */
|
||||
if(value==GET_BIDI_CLASS(props32)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* ignore old bidi class, set only for unassigned code points (Cn) */
|
||||
if(GET_CATEGORY(props32)!=0) {
|
||||
/* error if this one contradicts what we parsed from UnicodeData.txt */
|
||||
fprintf(stderr, "genprops error: different bidi class in DerivedBidiClass.txt field 1 at %s\n", s);
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* remove whatever bidi class was set before */
|
||||
props32&=~(0x1f<<UPROPS_BIDI_SHIFT);
|
||||
|
||||
/* set bidi class for Cn according to DerivedBidiClass.txt */
|
||||
props32|=value<<UPROPS_BIDI_SHIFT;
|
||||
|
||||
/* set the modified properties */
|
||||
addProps(start, props32);
|
||||
didSet=TRUE;
|
||||
}
|
||||
|
||||
if(didSet && beVerbose) {
|
||||
printf("setting U+%04x..U+%04x bidi class %d\n", (int)oldStart, (int)limit-1, (int)value);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -712,7 +639,7 @@ writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_C
|
|||
UErrorCode errorCode;
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=utrie_serialize(trie, p, capacity, getFoldedPropsValue, TRUE, &errorCode);
|
||||
length=utrie_serialize(trie, p, capacity, NULL, TRUE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
|
@ -737,8 +664,6 @@ writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_C
|
|||
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
|
||||
((int32_t)USCRIPT_CODE_LIMIT-1);
|
||||
indexes[UPROPS_MAX_VALUES_2_INDEX]=
|
||||
(((int32_t)U_JT_COUNT-1)<<UPROPS_JT_SHIFT)|
|
||||
(((int32_t)U_JG_COUNT-1)<<UPROPS_JG_SHIFT)|
|
||||
((int32_t)U_DT_COUNT-1);
|
||||
}
|
||||
|
||||
|
@ -751,9 +676,5 @@ writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_C
|
|||
}
|
||||
length+=pvCount*4;
|
||||
|
||||
if(p!=NULL) {
|
||||
utrie_close(trie);
|
||||
upvec_close(pv);
|
||||
}
|
||||
return length;
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2004, International Business Machines
|
||||
* Copyright (C) 1999-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -18,12 +18,10 @@
|
|||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "filestrm.h"
|
||||
#include "utrie.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unewdata.h"
|
||||
|
@ -42,7 +40,15 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
|
|||
precedes the actual data. It contains platform properties values and the
|
||||
file format version.
|
||||
|
||||
The following is a description of format version 3 .
|
||||
The following is a description of format version 4 .
|
||||
|
||||
The format changes between version 3 and 4 because the properties related to
|
||||
case mappings and bidi/shaping are pulled out into separate files
|
||||
for modularization.
|
||||
In order to reduce the need for code changes, some of the previous data
|
||||
structures are omitted, rather than rearranging everything.
|
||||
|
||||
For details see "Changes in format version 4" below.
|
||||
|
||||
Data contents:
|
||||
|
||||
|
@ -63,6 +69,10 @@ Formally, the file contains the following structures:
|
|||
|
||||
const int32_t indexes[16] with values i0..i15:
|
||||
|
||||
i0 indicates the length of the main trie.
|
||||
i0..i3 all have the same value in format version 4.0;
|
||||
the related props32[] and exceptions[] and uchars[] were used in format version 3
|
||||
|
||||
i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
|
||||
i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
|
||||
i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
|
||||
|
@ -74,12 +84,14 @@ Formally, the file contains the following structures:
|
|||
i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
|
||||
i7..i9 reservedIndexes; -- reserved values; 0 for now
|
||||
|
||||
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (format version 3.1+)
|
||||
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (format version 3.2)
|
||||
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
|
||||
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
|
||||
i12..i15 reservedIndexes; -- reserved values; 0 for now
|
||||
|
||||
PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
|
||||
|
||||
P, E, and U are not used (empty) in format version 4
|
||||
|
||||
P const uint32_t props32[i1-i0];
|
||||
E const uint32_t exceptions[i2-i1];
|
||||
U const UChar uchars[2*(i3-i2)];
|
||||
|
@ -99,14 +111,7 @@ the Unicode code assignment are exploited:
|
|||
|
||||
The lookup of properties for a given code point is done with a trie lookup,
|
||||
using the UTrie implementation.
|
||||
The trie lookup result is a 16-bit index in the props32[] table where the
|
||||
actual 32-bit properties word is stored. This is done to save space.
|
||||
|
||||
(There are thousands of 16-bit entries in the trie data table, but
|
||||
only a few hundred unique 32-bit properties words.
|
||||
If the trie data table contained 32-bit words directly, then that would be
|
||||
larger because the length of the table would be the same as now but the
|
||||
width would be 32 bits instead of 16. This saves more than 10kB.)
|
||||
The trie lookup result is a 16-bit properties word.
|
||||
|
||||
With a given Unicode code point
|
||||
|
||||
|
@ -114,141 +119,51 @@ With a given Unicode code point
|
|||
|
||||
and 0<=c<0x110000, the lookup is done like this:
|
||||
|
||||
uint16_t i;
|
||||
UTRIE_GET16(c, i);
|
||||
uint32_t props=p32[i];
|
||||
uint16_t props;
|
||||
UTRIE_GET16(trie, c, props);
|
||||
|
||||
For some characters, not all of the properties can be efficiently encoded
|
||||
using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
|
||||
array:
|
||||
|
||||
if(props&EXCEPTION_BIT)) {
|
||||
uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
|
||||
...
|
||||
}
|
||||
|
||||
The exception values are a variable number of uint32_t starting at
|
||||
|
||||
const uint32_t *pe=p32+exceptionsIndex+e;
|
||||
|
||||
The first uint32_t there contains flags about what values actually follow it.
|
||||
Some of the exception values are UChar32 code points for the case mappings,
|
||||
others are numeric values etc.
|
||||
|
||||
32-bit properties sets:
|
||||
|
||||
Each 32-bit properties word contains:
|
||||
Each 16-bit properties word contains:
|
||||
|
||||
0.. 4 general category
|
||||
5 has exception values
|
||||
6..10 BiDi category
|
||||
11 is mirrored
|
||||
12..14 numericType:
|
||||
0 no numeric value
|
||||
1 decimal digit value
|
||||
2 digit value
|
||||
3 numeric value
|
||||
### TODO: type 4 for Han digits & numbers?!
|
||||
15..19 reserved
|
||||
20..31 value according to bits 0..5:
|
||||
if(has exception) {
|
||||
exception index;
|
||||
} else switch(general category) {
|
||||
case Ll: delta to uppercase; -- same as titlecase
|
||||
case Lu: -delta to lowercase; -- titlecase is same as c
|
||||
case Lt: -delta to lowercase; -- uppercase is same as c
|
||||
default:
|
||||
if(is mirrored) {
|
||||
delta to mirror;
|
||||
} else if(numericType!=0) {
|
||||
numericValue;
|
||||
} else {
|
||||
0;
|
||||
};
|
||||
}
|
||||
5.. 7 numeric type
|
||||
non-digit numbers are stored with multiple types and pseudo-types
|
||||
in order to facilitate compact encoding:
|
||||
0 no numeric value (0)
|
||||
1 decimal digit value (0..9)
|
||||
2 digit value (0..9)
|
||||
3 (U_NT_NUMERIC) normal non-digit numeric value 0..0xff
|
||||
4 (internal type UPROPS_NT_FRACTION) fraction
|
||||
5 (internal type UPROPS_NT_LARGE) large number >0xff
|
||||
6..7 reserved
|
||||
|
||||
Exception values:
|
||||
when returning the numeric type from a public API,
|
||||
internal types must be turned into U_NT_NUMERIC
|
||||
|
||||
In the first uint32_t exception word for a code point,
|
||||
bits
|
||||
31..16 reserved
|
||||
15..0 flags that indicate which values follow:
|
||||
8..15 numeric value
|
||||
encoding of fractions and large numbers see below
|
||||
|
||||
bit
|
||||
0 has uppercase mapping
|
||||
1 has lowercase mapping
|
||||
2 has titlecase mapping
|
||||
3 unused
|
||||
4 has numeric value (numerator)
|
||||
if numericValue=0x7fffff00+x then numericValue=10^x
|
||||
5 has denominator value
|
||||
6 has a mirror-image Unicode code point
|
||||
7 has SpecialCasing.txt entries
|
||||
8 has CaseFolding.txt entries
|
||||
Fractions:
|
||||
// n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
|
||||
int32_t num, den;
|
||||
num=n>>3; // num=0..31
|
||||
den=(n&7)+2; // den=2..9
|
||||
if(num==0) {
|
||||
num=-1; // num=-1 or 1..31
|
||||
}
|
||||
double result=(double)num/(double)den;
|
||||
|
||||
According to the flags in this word, one or more uint32_t words follow it
|
||||
in the sequence of the bit flags in the flags word; if a flag is not set,
|
||||
then the value is missing or 0:
|
||||
|
||||
For the case mappings and the mirror-image Unicode code point,
|
||||
one uint32_t or UChar32 each is the code point.
|
||||
If the titlecase mapping is missing, then it is the same as the uppercase mapping.
|
||||
|
||||
For the digit values, bits 31..16 contain the decimal digit value, and
|
||||
bits 15..0 contain the digit value. A value of -1 indicates that
|
||||
this value is missing.
|
||||
|
||||
For the numeric/numerator value, an int32_t word contains the value directly,
|
||||
except for when there is no numerator but a denominator, then the numerator
|
||||
is implicitly 1. This means:
|
||||
numerator denominator result
|
||||
none none none
|
||||
x none x
|
||||
none y 1/y
|
||||
x y x/y
|
||||
|
||||
If the numerator value is 0x7fffff00+x then it is replaced with 10^x.
|
||||
|
||||
For the denominator value, a uint32_t word contains the value directly.
|
||||
|
||||
For special casing mappings, the 32-bit exception word contains:
|
||||
31 if set, this character has complex, conditional mappings
|
||||
that are not stored;
|
||||
otherwise, the mappings are stored according to the following bits
|
||||
30..24 number of UChars used for mappings
|
||||
23..16 reserved
|
||||
15.. 0 UChar offset from the beginning of the UChars array where the
|
||||
UChars for the special case mappings are stored in the following format:
|
||||
|
||||
Format of special casing UChars:
|
||||
One UChar value with lengths as follows:
|
||||
14..10 number of UChars for titlecase mapping
|
||||
9.. 5 number of UChars for uppercase mapping
|
||||
4.. 0 number of UChars for lowercase mapping
|
||||
|
||||
Followed by the UChars for lowercase, uppercase, titlecase mappings in this order.
|
||||
|
||||
For case folding mappings, the 32-bit exception word contains:
|
||||
31..24 number of UChars used for the full mapping
|
||||
23..16 reserved
|
||||
15.. 0 UChar offset from the beginning of the UChars array where the
|
||||
UChars for the case folding mappings are stored in the following format:
|
||||
|
||||
Format of case folding UChars:
|
||||
Two UChars contain the simple mapping as follows:
|
||||
0, 0 no simple mapping
|
||||
BMP,0 a simple mapping to a BMP code point
|
||||
s1, s2 a simple mapping to a supplementary code point stored as two surrogates
|
||||
This is followed by the UChars for the full case folding mappings.
|
||||
|
||||
Example:
|
||||
U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
|
||||
mapping and a numeric value.
|
||||
Its exception values would be stored as 3 uint32_t words:
|
||||
|
||||
- flags=0x12 (bit 1: has lowercase mapping, bit 4: has numeric value; see above) with combining class 0
|
||||
- lowercase mapping 0x2170
|
||||
- numeric value=1
|
||||
Large numbers:
|
||||
// n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
|
||||
int32_t m, e;
|
||||
m=n>>4; // m=0..15
|
||||
e=(n&0xf);
|
||||
if(m==0) {
|
||||
m=1; // for large powers of 10
|
||||
e+=18; // e=18..33
|
||||
} else {
|
||||
e+=2; // e=2..17
|
||||
} // m==10..15 are reserved
|
||||
double result=(double)m*10^e;
|
||||
|
||||
--- Additional properties (new in format version 2.1) ---
|
||||
|
||||
|
@ -277,6 +192,32 @@ See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
|
|||
- i10 also contains U_LB_COUNT and U_EA_COUNT.
|
||||
- i11 contains maxValues2 for vector word 2.
|
||||
|
||||
--- Changes in format version 4 ---
|
||||
|
||||
The format changes between version 3 and 4 because the properties related to
|
||||
case mappings and bidi/shaping are pulled out into separate files
|
||||
for modularization.
|
||||
In order to reduce the need for code changes, some of the previous data
|
||||
structures are omitted, rather than rearranging everything.
|
||||
|
||||
(The change to format version 4 is for ICU 3.4. The last CVS revision of
|
||||
genprops/store.c for format version 3.2 is 1.48.)
|
||||
|
||||
The main trie's data is significantly simplified:
|
||||
- The trie's 16-bit data word is used directly instead of as an index
|
||||
into props32[].
|
||||
- The trie uses the default trie folding functions instead of custom ones.
|
||||
- Numeric values are stored directly in the trie data word, with special
|
||||
encodings.
|
||||
- No more exception data (the data that needed it was pulled out, or, in the
|
||||
case of numeric values, encoded differently).
|
||||
- No more string data (pulled out - was for case mappings).
|
||||
|
||||
Also, some of the previously used properties vector bits are reserved again.
|
||||
|
||||
The indexes[] values for the omitted structures are still filled in
|
||||
(indicating zero-length arrays) so that the swapper code remains unchanged.
|
||||
|
||||
----------------------------------------------------------------------------- */
|
||||
|
||||
/* UDataInfo cf. udata.h */
|
||||
|
@ -290,46 +231,12 @@ static UDataInfo dataInfo={
|
|||
0,
|
||||
|
||||
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
|
||||
{ 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
|
||||
{ 4, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
|
||||
{ 4, 0, 1, 0 } /* dataVersion */
|
||||
};
|
||||
|
||||
/* definitions of expected data size limits */
|
||||
enum {
|
||||
MAX_PROPS_COUNT=25000,
|
||||
MAX_UCHAR_COUNT=10000
|
||||
};
|
||||
|
||||
static UNewTrie *pTrie=NULL;
|
||||
|
||||
/* props32[] contains unique properties words after compacting the array of properties */
|
||||
static uint32_t props32[MAX_PROPS_COUNT];
|
||||
|
||||
/* context pointer for compareProps() - temporarily holds a pointer to the trie data */
|
||||
static uint32_t *props;
|
||||
|
||||
/* length of props32[] after compaction */
|
||||
static int32_t propsTop;
|
||||
|
||||
/* exceptions values */
|
||||
static uint32_t exceptions[UPROPS_MAX_EXCEPTIONS_COUNT+20];
|
||||
static uint16_t exceptionsTop=0;
|
||||
|
||||
/* Unicode characters, e.g. for special casing or decomposition */
|
||||
static UChar uchars[MAX_UCHAR_COUNT+20];
|
||||
static uint32_t ucharsTop=0;
|
||||
|
||||
/* statistics */
|
||||
static uint16_t exceptionsCount=0;
|
||||
|
||||
/* prototypes --------------------------------------------------------------- */
|
||||
|
||||
static int
|
||||
compareProps(const void *l, const void *r);
|
||||
|
||||
static uint32_t
|
||||
addUChars(const UChar *s, uint32_t length);
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
extern void
|
||||
|
@ -341,266 +248,106 @@ setUnicodeVersion(const char *v) {
|
|||
|
||||
extern void
|
||||
initStore() {
|
||||
pTrie=utrie_open(NULL, NULL, MAX_PROPS_COUNT, 0, 0, TRUE);
|
||||
pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
|
||||
if(pTrie==NULL) {
|
||||
fprintf(stderr, "error: unable to create a UNewTrie\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
|
||||
uprv_memset(props32, 0, sizeof(props32));
|
||||
initAdditionalProperties();
|
||||
}
|
||||
|
||||
extern void
|
||||
exitStore() {
|
||||
utrie_close(pTrie);
|
||||
exitAdditionalProperties();
|
||||
}
|
||||
|
||||
/* store a character's properties ------------------------------------------- */
|
||||
|
||||
extern uint32_t
|
||||
makeProps(Props *p) {
|
||||
uint32_t x;
|
||||
int32_t value;
|
||||
uint16_t count;
|
||||
UBool isNumber;
|
||||
uint32_t den;
|
||||
int32_t type, value, exp;
|
||||
|
||||
/*
|
||||
* Simple ideas for reducing the number of bits for one character's
|
||||
* properties:
|
||||
*
|
||||
* Some fields are only used for characters of certain
|
||||
* general categories:
|
||||
* - casing fields for letters and others, not for
|
||||
* numbers & Mn
|
||||
* + uppercase not for uppercase letters
|
||||
* + lowercase not for lowercase letters
|
||||
* + titlecase not for titlecase letters
|
||||
*
|
||||
* * most of the time, uppercase=titlecase
|
||||
* - numeric fields for various digit & other types
|
||||
* - canonical combining classes for non-spacing marks (Mn)
|
||||
* * the above is not always true, for all three cases
|
||||
*
|
||||
* Using the same bits for alternate fields saves some space.
|
||||
*
|
||||
* For the canonical categories, there are only few actually used
|
||||
* most of the time.
|
||||
* They can be stored using 5 bits.
|
||||
*
|
||||
* In the BiDi categories, the 5 explicit codes are only ever
|
||||
* assigned 1:1 to 5 well-known code points. Storing only one
|
||||
* value for all "explicit codes" gets this down to 4 bits.
|
||||
* Client code then needs to check for this special value
|
||||
* and replace it by the real one using a 5-element table.
|
||||
*
|
||||
* The general categories Mn & Me, non-spacing & enclosing marks,
|
||||
* are always NSM, and NSM are always of those categories.
|
||||
*
|
||||
* Digit values can often be derived from the code point value
|
||||
* itself in a simple way.
|
||||
*
|
||||
*/
|
||||
|
||||
/* count the case mappings and other values competing for the value bit field */
|
||||
x=0;
|
||||
value=0;
|
||||
count=0;
|
||||
isNumber= (UBool)(genCategoryNames[p->generalCategory][0]=='N');
|
||||
|
||||
if(p->upperCase!=0) {
|
||||
/* verify that no numbers and no Mn have case mappings */
|
||||
if(p->generalCategory==U_LOWERCASE_LETTER) {
|
||||
value=(int32_t)p->code-(int32_t)p->upperCase;
|
||||
} else {
|
||||
x=UPROPS_EXCEPTION_BIT;
|
||||
}
|
||||
++count;
|
||||
}
|
||||
if(p->lowerCase!=0) {
|
||||
/* verify that no numbers and no Mn have case mappings */
|
||||
if(p->generalCategory==U_UPPERCASE_LETTER || p->generalCategory==U_TITLECASE_LETTER) {
|
||||
value=(int32_t)p->lowerCase-(int32_t)p->code;
|
||||
} else {
|
||||
x=UPROPS_EXCEPTION_BIT;
|
||||
}
|
||||
++count;
|
||||
}
|
||||
if(p->upperCase!=p->titleCase) {
|
||||
x=UPROPS_EXCEPTION_BIT;
|
||||
++count;
|
||||
}
|
||||
if(p->numericType!=0) {
|
||||
do { /* pseudo-loop to allow break instead of goto */
|
||||
/* encode numeric type & value */
|
||||
type=p->numericType;
|
||||
value=p->numericValue;
|
||||
++count;
|
||||
}
|
||||
if(p->denominator!=0) {
|
||||
x=UPROPS_EXCEPTION_BIT;
|
||||
++count;
|
||||
}
|
||||
if(p->isMirrored) {
|
||||
if(p->mirrorMapping!=0) {
|
||||
value=(int32_t)p->mirrorMapping-(int32_t)p->code;
|
||||
}
|
||||
++count;
|
||||
}
|
||||
if(p->specialCasing!=NULL) {
|
||||
x=UPROPS_EXCEPTION_BIT;
|
||||
++count;
|
||||
}
|
||||
if(p->caseFolding!=NULL) {
|
||||
x=UPROPS_EXCEPTION_BIT;
|
||||
++count;
|
||||
}
|
||||
den=p->denominator;
|
||||
exp=p->exponent;
|
||||
|
||||
/* handle exceptions */
|
||||
if(count>1 || x!=0 || value<UPROPS_MIN_VALUE || UPROPS_MAX_VALUE<value) {
|
||||
/* this code point needs exception values */
|
||||
if(beVerbose) {
|
||||
if(x!=0) {
|
||||
/* do not print - many code points because of SpecialCasing & CaseFolding
|
||||
printf("*** code 0x%06x needs an exception because it is irregular\n", p->code);
|
||||
*/
|
||||
} else if(value<UPROPS_MIN_VALUE || UPROPS_MAX_VALUE<value) {
|
||||
printf("*** U+%04x needs an exception because its value is out-of-bounds at %ld (not [%ld..%ld]\n",
|
||||
(int)p->code, (long)value, (long)UPROPS_MIN_VALUE, (long)UPROPS_MAX_VALUE);
|
||||
if(den!=0) {
|
||||
/* fraction */
|
||||
if( type!=U_NT_NUMERIC ||
|
||||
value<-1 || value==0 || value>UPROPS_FRACTION_MAX_NUM ||
|
||||
den<UPROPS_FRACTION_MIN_DEN || UPROPS_FRACTION_MAX_DEN<den ||
|
||||
exp!=0
|
||||
) {
|
||||
break;
|
||||
}
|
||||
type=UPROPS_NT_FRACTION;
|
||||
|
||||
if(value==-1) {
|
||||
value=0;
|
||||
}
|
||||
den-=UPROPS_FRACTION_DEN_OFFSET;
|
||||
value=(value<<UPROPS_FRACTION_NUM_SHIFT)|den;
|
||||
} else if(exp!=0) {
|
||||
/* very large value */
|
||||
if( type!=U_NT_NUMERIC ||
|
||||
value<1 || 9<value ||
|
||||
exp<UPROPS_LARGE_MIN_EXP || UPROPS_LARGE_MAX_EXP_EXTRA<exp
|
||||
) {
|
||||
break;
|
||||
}
|
||||
type=UPROPS_NT_LARGE;
|
||||
|
||||
if(exp<=UPROPS_LARGE_MAX_EXP) {
|
||||
/* 1..9 * 10^(2..17) */
|
||||
exp-=UPROPS_LARGE_EXP_OFFSET;
|
||||
} else {
|
||||
printf("*** U+%04x needs an exception because it has %u values\n",
|
||||
(int)p->code, count);
|
||||
/* 1 * 10^(18..33) */
|
||||
if(value!=1) {
|
||||
break;
|
||||
}
|
||||
value=0;
|
||||
exp-=UPROPS_LARGE_EXP_OFFSET_EXTRA;
|
||||
}
|
||||
value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
|
||||
} else if(value>UPROPS_MAX_SMALL_NUMBER) {
|
||||
/* large value */
|
||||
if(type!=U_NT_NUMERIC) {
|
||||
break;
|
||||
}
|
||||
type=UPROPS_NT_LARGE;
|
||||
|
||||
/* split the value into mantissa and exponent, base 10 */
|
||||
while((value%10)==0) {
|
||||
value/=10;
|
||||
++exp;
|
||||
}
|
||||
if(value>9) {
|
||||
break;
|
||||
}
|
||||
|
||||
exp-=UPROPS_LARGE_EXP_OFFSET;
|
||||
value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
|
||||
|
||||
/* } else normal value=0..0xff { */
|
||||
}
|
||||
|
||||
++exceptionsCount;
|
||||
x=UPROPS_EXCEPTION_BIT;
|
||||
/* encode the properties */
|
||||
return
|
||||
(uint32_t)p->generalCategory |
|
||||
((uint32_t)type<<UPROPS_NUMERIC_TYPE_SHIFT) |
|
||||
((uint32_t)value<<UPROPS_NUMERIC_VALUE_SHIFT);
|
||||
} while(0);
|
||||
|
||||
/* allocate and create exception values */
|
||||
value=exceptionsTop;
|
||||
if(value>=UPROPS_MAX_EXCEPTIONS_COUNT) {
|
||||
fprintf(stderr, "genprops: out of exceptions memory at U+%06x. (%d exceeds allocated space)\n",
|
||||
(int)p->code, (int)value);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
} else {
|
||||
uint32_t first=0;
|
||||
uint16_t length=1;
|
||||
|
||||
if(p->upperCase!=0) {
|
||||
first|=1;
|
||||
exceptions[value+length++]=p->upperCase;
|
||||
}
|
||||
if(p->lowerCase!=0) {
|
||||
first|=2;
|
||||
exceptions[value+length++]=p->lowerCase;
|
||||
}
|
||||
if(p->upperCase!=p->titleCase) {
|
||||
first|=4;
|
||||
if(p->titleCase!=0) {
|
||||
exceptions[value+length++]=p->titleCase;
|
||||
} else {
|
||||
exceptions[value+length++]=p->code;
|
||||
}
|
||||
}
|
||||
if(p->numericType!=0) {
|
||||
if(p->denominator==0) {
|
||||
first|=0x10;
|
||||
exceptions[value+length++]=(uint32_t)p->numericValue;
|
||||
} else {
|
||||
if(p->numericValue!=1) {
|
||||
first|=0x10;
|
||||
exceptions[value+length++]=(uint32_t)p->numericValue;
|
||||
}
|
||||
first|=0x20;
|
||||
exceptions[value+length++]=p->denominator;
|
||||
}
|
||||
}
|
||||
if(p->isMirrored) {
|
||||
first|=0x40;
|
||||
exceptions[value+length++]=p->mirrorMapping;
|
||||
}
|
||||
if(p->specialCasing!=NULL) {
|
||||
first|=0x80;
|
||||
if(p->specialCasing->isComplex) {
|
||||
/* complex special casing */
|
||||
exceptions[value+length++]=0x80000000;
|
||||
} else {
|
||||
/* unconditional special casing */
|
||||
UChar u[128];
|
||||
uint32_t i;
|
||||
uint16_t j, entry;
|
||||
|
||||
i=1;
|
||||
entry=0;
|
||||
j=p->specialCasing->lowerCase[0];
|
||||
if(j>0) {
|
||||
uprv_memcpy(u+1, p->specialCasing->lowerCase+1, 2*j);
|
||||
i+=j;
|
||||
entry=j;
|
||||
}
|
||||
j=p->specialCasing->upperCase[0];
|
||||
if(j>0) {
|
||||
uprv_memcpy(u+i, p->specialCasing->upperCase+1, 2*j);
|
||||
i+=j;
|
||||
entry|=j<<5;
|
||||
}
|
||||
j=p->specialCasing->titleCase[0];
|
||||
if(j>0) {
|
||||
uprv_memcpy(u+i, p->specialCasing->titleCase+1, 2*j);
|
||||
i+=j;
|
||||
entry|=j<<10;
|
||||
}
|
||||
u[0]=entry;
|
||||
|
||||
exceptions[value+length++]=(i<<24)|addUChars(u, i);
|
||||
}
|
||||
}
|
||||
if(p->caseFolding!=NULL) {
|
||||
first|=0x100;
|
||||
if(p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
|
||||
/* special case folding, store only a marker */
|
||||
exceptions[value+length++]=0;
|
||||
} else {
|
||||
/* normal case folding with a simple and a full mapping */
|
||||
UChar u[128];
|
||||
uint16_t i;
|
||||
|
||||
/* store the simple mapping into the first two UChars */
|
||||
i=0;
|
||||
u[1]=0;
|
||||
UTF_APPEND_CHAR_UNSAFE(u, i, p->caseFolding->simple);
|
||||
|
||||
/* store the full mapping after that */
|
||||
i=p->caseFolding->full[0];
|
||||
if(i>0) {
|
||||
uprv_memcpy(u+2, p->caseFolding->full+1, 2*i);
|
||||
}
|
||||
|
||||
exceptions[value+length++]=(i<<24)|addUChars(u, 2+i);
|
||||
}
|
||||
}
|
||||
exceptions[value]=first;
|
||||
exceptionsTop+=length;
|
||||
}
|
||||
}
|
||||
|
||||
/* put together the 32-bit word of encoded properties */
|
||||
x|=
|
||||
(uint32_t)p->generalCategory |
|
||||
(uint32_t)p->bidi<<UPROPS_BIDI_SHIFT |
|
||||
(uint32_t)p->isMirrored<<UPROPS_MIRROR_SHIFT |
|
||||
(uint32_t)p->numericType<<UPROPS_NUMERIC_TYPE_SHIFT |
|
||||
(uint32_t)value<<UPROPS_VALUE_SHIFT;
|
||||
|
||||
return x;
|
||||
|
||||
/*
|
||||
* "Higher-hanging fruit" (not implemented):
|
||||
*
|
||||
* For some sets of fields, there are fewer sets of values
|
||||
* than the product of the numbers of values per field.
|
||||
* This means that storing one single value for more than
|
||||
* one field and later looking up both field values in a table
|
||||
* saves space.
|
||||
* Examples:
|
||||
* - general category & BiDi
|
||||
*
|
||||
* There are only few common displacements between a code point
|
||||
* and its case mappings. Store deltas. Store codes for few
|
||||
* occurring deltas.
|
||||
*/
|
||||
fprintf(stderr, "genprops error: unable to encode numeric type & value %d %ld/%lu E%d\n",
|
||||
(int)p->numericType, (long)p->numericValue, (unsigned long)p->denominator, p->exponent);
|
||||
exit(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern void
|
||||
|
@ -611,21 +358,6 @@ addProps(uint32_t c, uint32_t x) {
|
|||
}
|
||||
}
|
||||
|
||||
extern void
|
||||
addCaseSensitive(UChar32 first, UChar32 last) {
|
||||
uint32_t x, cs;
|
||||
|
||||
cs=U_MASK(UPROPS_CASE_SENSITIVE_SHIFT);
|
||||
while(first<=last) {
|
||||
x=utrie_get32(pTrie, first, NULL);
|
||||
if(!utrie_set32(pTrie, first, x|cs)) {
|
||||
fprintf(stderr, "error: too many entries for the properties trie\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
++first;
|
||||
}
|
||||
}
|
||||
|
||||
extern uint32_t
|
||||
getProps(uint32_t c) {
|
||||
return utrie_get32(pTrie, (UChar32)c, NULL);
|
||||
|
@ -641,125 +373,8 @@ repeatProps(uint32_t first, uint32_t last, uint32_t x) {
|
|||
}
|
||||
}
|
||||
|
||||
/* compacting --------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
compactProps(void) {
|
||||
/*
|
||||
* At this point, all the propsTop properties are in props[], but they
|
||||
* are not all unique.
|
||||
* Now we sort them, reduce them to unique ones in props32[], and
|
||||
* build an index in stage3[] from the old to the new indexes.
|
||||
* (The quick sort averages at N*log(N) with N=propsTop. The inverting
|
||||
* yields linear performance.)
|
||||
*/
|
||||
|
||||
/*
|
||||
* We are going to sort only an index table in map[] because we need this
|
||||
* index table anyway and qsort() does not allow to sort two tables together
|
||||
* directly. This will thus also reduce the amount of data moved around.
|
||||
*/
|
||||
uint32_t x;
|
||||
int32_t i, oldIndex, newIndex;
|
||||
|
||||
static uint16_t map[MAX_PROPS_COUNT];
|
||||
|
||||
#if DO_DEBUG_OUT
|
||||
{
|
||||
/* debug output */
|
||||
uint16_t i1, i2, i3;
|
||||
uint32_t c;
|
||||
for(c=0; c<0xffff; c+=307) {
|
||||
printf("properties(0x%06x)=0x%06x\n", c, getProps(c, &i1, &i2, &i3));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
props=utrie_getData(pTrie, &propsTop);
|
||||
|
||||
/* build the index table */
|
||||
for(i=propsTop; i>0;) {
|
||||
--i;
|
||||
map[i]=(uint16_t)i;
|
||||
}
|
||||
|
||||
/* reorder */
|
||||
qsort(map, propsTop, 2, compareProps);
|
||||
|
||||
/*
|
||||
* Now invert the reordered table and compact it in the same step.
|
||||
* The result will be props32[] having only unique properties words
|
||||
* and stage3[] having indexes to them.
|
||||
*/
|
||||
newIndex=0;
|
||||
for(i=0; i<propsTop;) {
|
||||
/* set the first of a possible series of the same properties */
|
||||
oldIndex=map[i];
|
||||
props32[newIndex]=x=props[oldIndex];
|
||||
props[oldIndex]=newIndex;
|
||||
|
||||
/* set the following same properties only in stage3 */
|
||||
while(++i<propsTop && x==props[map[i]]) {
|
||||
props[map[i]]=newIndex;
|
||||
}
|
||||
|
||||
++newIndex;
|
||||
}
|
||||
|
||||
/* we saved some space */
|
||||
if(beVerbose) {
|
||||
printf("compactProps() reduced propsTop from %u to %u\n",
|
||||
(int)propsTop, (int)newIndex);
|
||||
}
|
||||
propsTop=newIndex;
|
||||
|
||||
#if DO_DEBUG_OUT
|
||||
{
|
||||
/* debug output */
|
||||
uint16_t i1, i2, i3, i4;
|
||||
uint32_t c;
|
||||
for(c=0; c<0xffff; c+=307) {
|
||||
printf("properties(0x%06x)=0x%06x\n", c, getProps2(c, &i1, &i2, &i3, &i4));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static int
|
||||
compareProps(const void *l, const void *r) {
|
||||
uint32_t left=props[*(const uint16_t *)l], right=props[*(const uint16_t *)r];
|
||||
|
||||
/* compare general categories first */
|
||||
int rc=(int)(left&0x1f)-(int)(right&0x1f);
|
||||
if(rc==0 && left!=right) {
|
||||
rc= left<right ? -1 : 1;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* generate output data ----------------------------------------------------- */
|
||||
|
||||
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
|
||||
U_CFUNC uint32_t U_EXPORT2
|
||||
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
||||
uint32_t value;
|
||||
UChar32 limit;
|
||||
UBool inBlockZero;
|
||||
|
||||
limit=start+0x400;
|
||||
while(start<limit) {
|
||||
value=utrie_get32(trie, start, &inBlockZero);
|
||||
if(inBlockZero) {
|
||||
start+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else if(value!=0) {
|
||||
return (uint32_t)(offset|0x8000);
|
||||
} else {
|
||||
++start;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern void
|
||||
generateData(const char *dataDir) {
|
||||
static int32_t indexes[UPROPS_INDEX_COUNT]={
|
||||
|
@ -777,9 +392,7 @@ generateData(const char *dataDir) {
|
|||
int32_t trieSize, additionalPropsSize, offset;
|
||||
long dataLength;
|
||||
|
||||
compactProps();
|
||||
|
||||
trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), getFoldedPropsValue, TRUE, &errorCode);
|
||||
trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
|
||||
exit(errorCode);
|
||||
|
@ -787,28 +400,16 @@ generateData(const char *dataDir) {
|
|||
|
||||
offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
|
||||
|
||||
/* round up trie size to 4-alignement */
|
||||
/* round up trie size to 4-alignment */
|
||||
trieSize=(trieSize+3)&~3;
|
||||
offset+=trieSize>>2;
|
||||
indexes[UPROPS_PROPS32_INDEX]=offset; /* uint32_t offset to props[] */
|
||||
|
||||
offset+=propsTop;
|
||||
indexes[UPROPS_EXCEPTIONS_INDEX]=offset;/* uint32_t offset to exceptions[] */
|
||||
|
||||
offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */
|
||||
indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=offset;
|
||||
|
||||
/* round up UChar count to 4-alignment */
|
||||
ucharsTop=(ucharsTop+1)&~1;
|
||||
offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */
|
||||
indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */
|
||||
indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */
|
||||
indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */
|
||||
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
|
||||
|
||||
if(beVerbose) {
|
||||
printf("trie size in bytes: %5u\n", (int)trieSize);
|
||||
printf("number of unique properties values: %5u\n", (int)propsTop);
|
||||
printf("number of code points with exceptions: %5u\n", exceptionsCount);
|
||||
printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop);
|
||||
printf("number of UChars for special mappings: %5u\n", (int)ucharsTop);
|
||||
}
|
||||
|
||||
additionalPropsSize=writeAdditionalData(additionalProps, sizeof(additionalProps), indexes);
|
||||
|
@ -828,9 +429,6 @@ generateData(const char *dataDir) {
|
|||
|
||||
udata_writeBlock(pData, indexes, sizeof(indexes));
|
||||
udata_writeBlock(pData, trieBlock, trieSize);
|
||||
udata_writeBlock(pData, props32, 4*propsTop);
|
||||
udata_writeBlock(pData, exceptions, 4*exceptionsTop);
|
||||
udata_writeBlock(pData, uchars, 2*ucharsTop);
|
||||
udata_writeBlock(pData, additionalProps, additionalPropsSize);
|
||||
|
||||
/* finish up */
|
||||
|
@ -845,25 +443,6 @@ generateData(const char *dataDir) {
|
|||
dataLength, (unsigned long)size);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
|
||||
utrie_close(pTrie);
|
||||
}
|
||||
|
||||
/* helpers ------------------------------------------------------------------ */
|
||||
|
||||
static uint32_t
|
||||
addUChars(const UChar *s, uint32_t length) {
|
||||
uint32_t top=(uint16_t)(ucharsTop+length);
|
||||
UChar *p;
|
||||
|
||||
if(top>=MAX_UCHAR_COUNT) {
|
||||
fprintf(stderr, "genprops: out of UChars memory\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
p=uchars+ucharsTop;
|
||||
uprv_memcpy(p, s, 2*length);
|
||||
ucharsTop=top;
|
||||
return (uint32_t)(p-uchars);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
Loading…
Add table
Reference in a new issue