ICU-5138 Separate the casing data from normalization data and data loading.

X-SVN-Rev: 19499
This commit is contained in:
George Rhoten 2006-03-31 05:29:06 +00:00
parent f47dea2b53
commit 7d382500f6
8 changed files with 271 additions and 247 deletions

View file

@ -74,7 +74,7 @@ utf_impl.o ustring.o ustrcase.o ucasemap.o cstring.o ustrfmt.o ustrtrns.o ustr_w
normlzr.o unorm.o unormcmp.o unorm_it.o chariter.o schriter.o uchriter.o uiter.o \
uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \
uscript.o usc_impl.o unames.o \
utrie.o uset_props.o uniset_props.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
utrie.o utrie_swap.o uset_props.o uniset_props.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
uarrsort.o brkiter.o ubrk.o brkeng.o dictbe.o triedict.o \
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o \

View file

@ -460,6 +460,9 @@
<File
RelativePath=".\utrie.h">
</File>
<File
RelativePath=".\utrie_swap.c">
</File>
<File
RelativePath=".\uvector.cpp">
</File>

View file

@ -196,7 +196,9 @@ ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
/*
 * Release a UCaseProps object. A NULL argument is ignored.
 * Unless the case data is hardcoded (UCASE_HARDCODE_DATA), the data memory
 * referenced by csp->mem is closed before the object itself is freed.
 */
U_CAPI void U_EXPORT2
ucase_close(UCaseProps *csp) {
    if(csp==NULL) {
        return;
    }
#if !UCASE_HARDCODE_DATA
    udata_close(csp->mem);
#endif
    uprv_free(csp);
}
@ -1482,3 +1484,116 @@ ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
return (result==c) ? ~result : result;
}
/* case mapping properties API ---------------------------------------------- */
/*
 * Fetch the UCaseProps singleton; if that fails, fall back to the dummy object.
 * Returns NULL only when the dummy cannot be obtained either.
 */
static const UCaseProps *
getCaseProps() {
    /*
     * This lazy initialization with double-checked locking (without mutex protection for
     * the initial check) is transiently unsafe under certain circumstances.
     * Check the readme and use u_init() if necessary.
     */

    /* the initial check is performed by the GET_CASE_PROPS() macro */
    UErrorCode status=U_ZERO_ERROR;
    const UCaseProps *props=ucase_getSingleton(&status);

    if(!U_FAILURE(status)) {
        return props;
    }

    /* the real data could not be obtained: start over with the dummy */
    status=U_ZERO_ERROR;
    props=ucase_getDummy(&status);
    return U_FAILURE(status) ? NULL : props;
}
/*
* In ICU 3.0, most Unicode properties were loaded from uprops.icu.
* ICU 3.2 adds ucase.icu for case mapping properties.
* ICU 3.4 adds ubidi.icu for bidi/shaping properties and
* removes case/bidi/shaping properties from uprops.icu.
*
* Loading of uprops.icu was never mutex-protected and required u_init()
* for thread safety.
* In order to maintain performance for all such properties,
* ucase.icu and ubidi.icu are loaded lazily, without mutexing.
* u_init() will try to load them for thread safety,
* but u_init() will not fail if they are missing.
*
* uchar.c maintains a tri-state flag for (not loaded/loaded/failed to load)
* and an error code for load failure.
* Instead, here we try to load at most once.
* If it works, we use the resulting singleton object.
* If it fails, then we get a dummy object, which always works unless
* we are seriously out of memory.
* After the first try, we have a never-changing pointer to either the
* real singleton or the dummy.
*
* This method is used in Unicode properties APIs (uchar.h) that
* do not have a service object and also do not have an error code parameter.
* Other API implementations get the singleton themselves
* (with mutexing), store it in the service object, and report errors.
*/
/* Use the cached gCsp pointer when it is non-NULL; otherwise call getCaseProps(). */
#define GET_CASE_PROPS() (gCsp!=NULL ? gCsp : getCaseProps())
/* public API (see uchar.h) */
/* TRUE if the case type stored for c is UCASE_LOWER. */
U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c) {
    return (UBool)(ucase_getType(GET_CASE_PROPS(), c)==UCASE_LOWER);
}
/* TRUE if the case type stored for c is UCASE_UPPER. */
U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c) {
    return (UBool)(ucase_getType(GET_CASE_PROPS(), c)==UCASE_UPPER);
}
/* Map c to its lower case equivalent via ucase_tolower(). */
U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c) {
    UChar32 lower=ucase_tolower(GET_CASE_PROPS(), c);
    return lower;
}
/* Map c to its upper case equivalent via ucase_toupper(). */
U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c) {
    UChar32 upper=ucase_toupper(GET_CASE_PROPS(), c);
    return upper;
}
/* Map c to its title case equivalent via ucase_totitle(). */
U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c) {
    UChar32 title=ucase_totitle(GET_CASE_PROPS(), c);
    return title;
}
/* Simple case folding of c under the given options, via ucase_fold(). */
U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c, uint32_t options) {
    UChar32 folded=ucase_fold(GET_CASE_PROPS(), c, options);
    return folded;
}
/*
 * Binary-property lookup for the properties answered from the case mapping data:
 * UCHAR_LOWERCASE, UCHAR_UPPERCASE, UCHAR_SOFT_DOTTED and UCHAR_CASE_SENSITIVE.
 * Returns FALSE when no case properties object is available or when `which`
 * is any other property.
 */
U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c, UProperty which) {
    /* case mapping properties */
    const UCaseProps *csp=GET_CASE_PROPS();

    if(csp!=NULL) {
        if(which==UCHAR_LOWERCASE) {
            return (UBool)(ucase_getType(csp, c)==UCASE_LOWER);
        }
        if(which==UCHAR_UPPERCASE) {
            return (UBool)(ucase_getType(csp, c)==UCASE_UPPER);
        }
        if(which==UCHAR_SOFT_DOTTED) {
            return ucase_isSoftDotted(csp, c);
        }
        if(which==UCHAR_CASE_SENSITIVE) {
            return ucase_isCaseSensitive(csp, c);
        }
    }
    return FALSE;
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2004-2005, International Business Machines
* Copyright (C) 2004-2006, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -238,6 +238,9 @@ ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
const UChar **pString,
uint32_t options);
/**
 * Test a binary property that is answered from the case mapping data.
 * The implementation handles UCHAR_LOWERCASE, UCHAR_UPPERCASE,
 * UCHAR_SOFT_DOTTED and UCHAR_CASE_SENSITIVE; for any other property,
 * or when no case properties object is available, it returns FALSE.
 *
 * @param c     code point to test
 * @param which property selector
 * @return non-zero if c has the property, otherwise FALSE
 */
U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c, UProperty which);
/* file definitions --------------------------------------------------------- */
#define UCASE_DATA_NAME "ucase"

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2005, International Business Machines
* Copyright (C) 2002-2006, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -36,115 +36,13 @@
/* cleanup ------------------------------------------------------------------ */
/* Cached property objects; both start out NULL and are reset to NULL by uprops_cleanup(). */
static const UCaseProps *gCsp=NULL;
static const UBiDiProps *gBdp=NULL;
/*
 * Cleanup callback: forgets the cached pointers (nothing is freed here)
 * and always returns TRUE.
 */
static UBool U_CALLCONV uprops_cleanup(void) {
gCsp=NULL;
gBdp=NULL;
return TRUE;
}
/* case mapping properties API ---------------------------------------------- */
/*
 * Get the UCaseProps singleton, or else its dummy, once and for all.
 * Stores the result in gCsp (if gCsp is still NULL) and returns gCsp.
 * Returns NULL only if neither the singleton nor the dummy can be obtained.
 */
static const UCaseProps *
getCaseProps() {
/*
* This lazy initialization with double-checked locking (without mutex protection for
* the initial check) is transiently unsafe under certain circumstances.
* Check the readme and use u_init() if necessary.
*/
/* the initial check is performed by the GET_CASE_PROPS() macro */
const UCaseProps *csp;
UErrorCode errorCode=U_ZERO_ERROR;
csp=ucase_getSingleton(&errorCode);
if(U_FAILURE(errorCode)) {
/* the real data is unavailable: reset the error code and fall back to the dummy */
errorCode=U_ZERO_ERROR;
csp=ucase_getDummy(&errorCode);
if(U_FAILURE(errorCode)) {
return NULL;
}
}
/* NOTE(review): umtx_lock(NULL) presumably takes ICU's global mutex -- confirm */
umtx_lock(NULL);
if(gCsp==NULL) {
/* gCsp is still unset: publish our pointer and register the cleanup function */
gCsp=csp;
csp=NULL;
ucln_common_registerCleanup(UCLN_COMMON_UPROPS, uprops_cleanup);
}
umtx_unlock(NULL);
/* return whichever pointer ended up in gCsp */
return gCsp;
}
/*
* In ICU 3.0, most Unicode properties were loaded from uprops.icu.
* ICU 3.2 adds ucase.icu for case mapping properties.
* ICU 3.4 adds ubidi.icu for bidi/shaping properties and
* removes case/bidi/shaping properties from uprops.icu.
*
* Loading of uprops.icu was never mutex-protected and required u_init()
* for thread safety.
* In order to maintain performance for all such properties,
* ucase.icu and ubidi.icu are loaded lazily, without mutexing.
* u_init() will try to load them for thread safety,
* but u_init() will not fail if they are missing.
*
* uchar.c maintains a tri-state flag for (not loaded/loaded/failed to load)
* and an error code for load failure.
* Instead, here we try to load at most once.
* If it works, we use the resulting singleton object.
* If it fails, then we get a dummy object, which always works unless
* we are seriously out of memory.
* After the first try, we have a never-changing pointer to either the
* real singleton or the dummy.
*
* This method is used in Unicode properties APIs (uchar.h) that
* do not have a service object and also do not have an error code parameter.
* Other API implementations get the singleton themselves
* (with mutexing), store it in the service object, and report errors.
*/
/* Use the cached gCsp pointer when it is non-NULL; otherwise call getCaseProps(). */
#define GET_CASE_PROPS() (gCsp!=NULL ? gCsp : getCaseProps())
/* public API (see uchar.h) */
/* TRUE if the case type stored for c is UCASE_LOWER. */
U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c) {
    return (UBool)(ucase_getType(GET_CASE_PROPS(), c)==UCASE_LOWER);
}
/* TRUE if the case type stored for c is UCASE_UPPER. */
U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c) {
    return (UBool)(ucase_getType(GET_CASE_PROPS(), c)==UCASE_UPPER);
}
/* Map c to its lower case equivalent via ucase_tolower(). */
U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c) {
    UChar32 lower=ucase_tolower(GET_CASE_PROPS(), c);
    return lower;
}
/* Map c to its upper case equivalent via ucase_toupper(). */
U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c) {
    UChar32 upper=ucase_toupper(GET_CASE_PROPS(), c);
    return upper;
}
/* Map c to its title case equivalent via ucase_totitle(). */
U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c) {
    UChar32 title=ucase_totitle(GET_CASE_PROPS(), c);
    return title;
}
/* Simple case folding of c under the given options, via ucase_fold(). */
U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c, uint32_t options) {
    UChar32 folded=ucase_fold(GET_CASE_PROPS(), c, options);
    return folded;
}
/* bidi/shaping properties API ---------------------------------------------- */
/* get the UBiDiProps singleton, or else its dummy, once and for all */
@ -261,23 +159,7 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
return (u_getUnicodeProperties(c, column)&mask)!=0;
} else {
if(column==UPROPS_SRC_CASE) {
/* case mapping properties */
const UCaseProps *csp=GET_CASE_PROPS();
if(csp==NULL) {
return FALSE;
}
switch(which) {
case UCHAR_LOWERCASE:
return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
case UCHAR_UPPERCASE:
return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
case UCHAR_SOFT_DOTTED:
return ucase_isSoftDotted(csp, c);
case UCHAR_CASE_SENSITIVE:
return ucase_isCaseSensitive(csp, c);
default:
break;
}
return ucase_hasBinaryProperty(c, which);
} else if(column==UPROPS_SRC_NORM) {
#if !UCONFIG_NO_NORMALIZATION
/* normalization properties from unorm.icu */

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2001-2005, International Business Machines
* Copyright (C) 2001-2006, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -23,7 +23,6 @@
#endif
#include "unicode/utypes.h"
#include "udataswp.h"
#include "cmemory.h"
#include "utrie.h"
@ -686,55 +685,6 @@ utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) {
/* serialization ------------------------------------------------------------ */
/**
 * Header of a serialized trie. The serialized form is laid out as:
 *
 *     UTrieHeader header;
 *     uint16_t index[header.indexLength];
 *     uint16_t data[header.dataLength];
 */
typedef struct UTrieHeader {
    /** "Trie" in big-endian US-ASCII (0x54726965) */
    uint32_t signature;

    /**
     * options bit field:
     *     9    1=Latin-1 data is stored linearly at data+UTRIE_DATA_BLOCK_LENGTH
     *     8    0=16-bit data, 1=32-bit data
     *  7..4    UTRIE_INDEX_SHIFT   // 0..UTRIE_SHIFT
     *  3..0    UTRIE_SHIFT         // 1..9
     */
    uint32_t options;

    /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */
    int32_t indexLength;

    /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */
    int32_t dataLength;
} UTrieHeader;
/**
 * Masks, shift counts and flags for decoding UTrieHeader.options.
 */
enum {
    /** Low four bits: mask to get the UTRIE_SHIFT value from options. */
    UTRIE_OPTIONS_SHIFT_MASK=0xf,

    /** Shift options right this much to get the UTRIE_INDEX_SHIFT value. */
    UTRIE_OPTIONS_INDEX_SHIFT=4,

    /** Bit 8: if set, then the data (stage 2) array is 32 bits wide. */
    UTRIE_OPTIONS_DATA_IS_32_BIT=1<<8,

    /**
     * Bit 9: if set, then Latin-1 data (for U+0000..U+00ff) is stored in the
     * data (stage 2) array as a simple, linear array at data+UTRIE_DATA_BLOCK_LENGTH.
     */
    UTRIE_OPTIONS_LATIN1_IS_LINEAR=1<<9
};
/*
* Default function for the folding value:
* Just store the offset (16 bits) if there is any non-initial-value entry.
@ -1077,79 +1027,6 @@ utrie_unserializeDummy(UTrie *trie,
return actualLength;
}
/* swapping ----------------------------------------------------------------- */
/**
 * Swap the byte order of a serialized trie: a UTrieHeader followed by the
 * 16-bit index array and the 16- or 32-bit data array.
 *
 * @param ds         swapper that supplies readUInt32/swapArray16/swapArray32
 * @param inData     serialized trie in the input byte order
 * @param length     number of bytes available at inData; if negative, only the
 *                   size is computed and nothing is written (preflighting)
 * @param outData    destination for the swapped trie; required when length>=0
 * @param pErrorCode in/out ICU error code; nothing is done if it already
 *                   indicates a failure
 * @return the size in bytes of the serialized trie, or 0 on error
 */
U_CAPI int32_t U_EXPORT2
utrie_swap(const UDataSwapper *ds,
           const void *inData, int32_t length, void *outData,
           UErrorCode *pErrorCode) {
    const UTrieHeader *inTrie;
    UTrieHeader trie;
    int64_t size64;
    int32_t size;
    UBool dataIs32;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || (length>=0 && outData==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* setup and swapping */
    /* length is known to be non-negative here, so the unsigned comparison is exact */
    if(length>=0 && (uint32_t)length<sizeof(UTrieHeader)) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    /* read the header fields in the input byte order */
    inTrie=(const UTrieHeader *)inData;
    trie.signature=ds->readUInt32(inTrie->signature);
    trie.options=ds->readUInt32(inTrie->options);
    trie.indexLength=udata_readInt32(ds, inTrie->indexLength);
    trie.dataLength=udata_readInt32(ds, inTrie->dataLength);

    if( trie.signature!=0x54726965 ||
        (trie.options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
        ((trie.options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT ||
        trie.indexLength<UTRIE_BMP_INDEX_LENGTH ||
        (trie.indexLength&(UTRIE_SURROGATE_BLOCK_COUNT-1))!=0 ||
        trie.dataLength<UTRIE_DATA_BLOCK_LENGTH ||
        (trie.dataLength&(UTRIE_DATA_GRANULARITY-1))!=0 ||
        ((trie.options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0 && trie.dataLength<(UTRIE_DATA_BLOCK_LENGTH+0x100))
    ) {
        *pErrorCode=U_INVALID_FORMAT_ERROR; /* not a UTrie */
        return 0;
    }

    dataIs32=(UBool)((trie.options&UTRIE_OPTIONS_DATA_IS_32_BIT)!=0);

    /*
     * The lengths come from the data and have no upper bound; compute the
     * total in 64 bits so that a corrupt header cannot overflow int32_t.
     */
    size64=(int64_t)sizeof(UTrieHeader)+
           (int64_t)trie.indexLength*2+
           (int64_t)trie.dataLength*(dataIs32?4:2);
    if(size64>0x7fffffff) {
        *pErrorCode=U_INVALID_FORMAT_ERROR; /* lengths too large for a UTrie */
        return 0;
    }
    size=(int32_t)size64;

    if(length>=0) {
        UTrieHeader *outTrie;

        if(length<size) {
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        outTrie=(UTrieHeader *)outData;

        /* swap the header */
        ds->swapArray32(ds, inTrie, sizeof(UTrieHeader), outTrie, pErrorCode);

        /* swap the index and the data */
        if(dataIs32) {
            ds->swapArray16(ds, inTrie+1, trie.indexLength*2, outTrie+1, pErrorCode);
            ds->swapArray32(ds, (const uint16_t *)(inTrie+1)+trie.indexLength, trie.dataLength*4,
                                     (uint16_t *)(outTrie+1)+trie.indexLength, pErrorCode);
        } else {
            /* index and data are both 16-bit and contiguous: swap them in one pass */
            ds->swapArray16(ds, inTrie+1, (trie.indexLength+trie.dataLength)*2, outTrie+1, pErrorCode);
        }
    }

    return size;
}
/* enumeration -------------------------------------------------------------- */
/* default UTrieEnumValue() returns the input value itself */

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2001-2005, International Business Machines
* Copyright (C) 2001-2006, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -736,6 +736,57 @@ utrie_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/* serialization ------------------------------------------------------------ */
/**
 * Header of a serialized trie. The serialized form is laid out as:
 *
 *     UTrieHeader header;
 *     uint16_t index[header.indexLength];
 *     uint16_t data[header.dataLength];
 *
 * @internal
 */
struct UTrieHeader {
    /** "Trie" in big-endian US-ASCII (0x54726965) */
    uint32_t signature;

    /**
     * options bit field:
     *     9    1=Latin-1 data is stored linearly at data+UTRIE_DATA_BLOCK_LENGTH
     *     8    0=16-bit data, 1=32-bit data
     *  7..4    UTRIE_INDEX_SHIFT   // 0..UTRIE_SHIFT
     *  3..0    UTRIE_SHIFT         // 1..9
     */
    uint32_t options;

    /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */
    int32_t indexLength;

    /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */
    int32_t dataLength;
};

typedef struct UTrieHeader UTrieHeader;
/**
 * Masks, shift counts and flags for decoding UTrieHeader.options.
 * @internal
 */
enum {
    /** Low four bits: mask to get the UTRIE_SHIFT value from options. */
    UTRIE_OPTIONS_SHIFT_MASK=0xf,

    /** Shift options right this much to get the UTRIE_INDEX_SHIFT value. */
    UTRIE_OPTIONS_INDEX_SHIFT=4,

    /** Bit 8: if set, then the data (stage 2) array is 32 bits wide. */
    UTRIE_OPTIONS_DATA_IS_32_BIT=1<<8,

    /**
     * Bit 9: if set, then Latin-1 data (for U+0000..U+00ff) is stored in the
     * data (stage 2) array as a simple, linear array at data+UTRIE_DATA_BLOCK_LENGTH.
     */
    UTRIE_OPTIONS_LATIN1_IS_LINEAR=1<<9
};
U_CDECL_END
#endif

View file

@ -0,0 +1,93 @@
/*
******************************************************************************
*
* Copyright (C) 2001-2006, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: utrie_swap.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created by: Markus W. Scherer
*
* This performs data swapping for a folded trie (see utrie.c for details).
*/
#include "udataswp.h"
#include "utrie.h"
/* swapping ----------------------------------------------------------------- */
/**
 * Swap the byte order of a serialized trie: a UTrieHeader followed by the
 * 16-bit index array and the 16- or 32-bit data array.
 *
 * @param ds         swapper that supplies readUInt32/swapArray16/swapArray32
 * @param inData     serialized trie in the input byte order
 * @param length     number of bytes available at inData; if negative, only the
 *                   size is computed and nothing is written (preflighting)
 * @param outData    destination for the swapped trie; required when length>=0
 * @param pErrorCode in/out ICU error code; nothing is done if it already
 *                   indicates a failure
 * @return the size in bytes of the serialized trie, or 0 on error
 */
U_CAPI int32_t U_EXPORT2
utrie_swap(const UDataSwapper *ds,
           const void *inData, int32_t length, void *outData,
           UErrorCode *pErrorCode) {
    const UTrieHeader *inTrie;
    UTrieHeader trie;
    int64_t size64;
    int32_t size;
    UBool dataIs32;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || (length>=0 && outData==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* setup and swapping */
    /* length is known to be non-negative here, so the unsigned comparison is exact */
    if(length>=0 && (uint32_t)length<sizeof(UTrieHeader)) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    /* read the header fields in the input byte order */
    inTrie=(const UTrieHeader *)inData;
    trie.signature=ds->readUInt32(inTrie->signature);
    trie.options=ds->readUInt32(inTrie->options);
    trie.indexLength=udata_readInt32(ds, inTrie->indexLength);
    trie.dataLength=udata_readInt32(ds, inTrie->dataLength);

    if( trie.signature!=0x54726965 ||
        (trie.options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
        ((trie.options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT ||
        trie.indexLength<UTRIE_BMP_INDEX_LENGTH ||
        (trie.indexLength&(UTRIE_SURROGATE_BLOCK_COUNT-1))!=0 ||
        trie.dataLength<UTRIE_DATA_BLOCK_LENGTH ||
        (trie.dataLength&(UTRIE_DATA_GRANULARITY-1))!=0 ||
        ((trie.options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0 && trie.dataLength<(UTRIE_DATA_BLOCK_LENGTH+0x100))
    ) {
        *pErrorCode=U_INVALID_FORMAT_ERROR; /* not a UTrie */
        return 0;
    }

    dataIs32=(UBool)((trie.options&UTRIE_OPTIONS_DATA_IS_32_BIT)!=0);

    /*
     * The lengths come from the data and have no upper bound; compute the
     * total in 64 bits so that a corrupt header cannot overflow int32_t.
     */
    size64=(int64_t)sizeof(UTrieHeader)+
           (int64_t)trie.indexLength*2+
           (int64_t)trie.dataLength*(dataIs32?4:2);
    if(size64>0x7fffffff) {
        *pErrorCode=U_INVALID_FORMAT_ERROR; /* lengths too large for a UTrie */
        return 0;
    }
    size=(int32_t)size64;

    if(length>=0) {
        UTrieHeader *outTrie;

        if(length<size) {
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        outTrie=(UTrieHeader *)outData;

        /* swap the header */
        ds->swapArray32(ds, inTrie, sizeof(UTrieHeader), outTrie, pErrorCode);

        /* swap the index and the data */
        if(dataIs32) {
            ds->swapArray16(ds, inTrie+1, trie.indexLength*2, outTrie+1, pErrorCode);
            ds->swapArray32(ds, (const uint16_t *)(inTrie+1)+trie.indexLength, trie.dataLength*4,
                                     (uint16_t *)(outTrie+1)+trie.indexLength, pErrorCode);
        } else {
            /* index and data are both 16-bit and contiguous: swap them in one pass */
            ds->swapArray16(ds, inTrie+1, (trie.indexLength+trie.dataLength)*2, outTrie+1, pErrorCode);
        }
    }

    return size;
}