diff --git a/icu4c/source/common/unorm.cpp b/icu4c/source/common/unorm.cpp
index 44e2845ee4a..85eba54115d 100644
--- a/icu4c/source/common/unorm.cpp
+++ b/icu4c/source/common/unorm.cpp
@@ -74,7 +74,7 @@
  * except that this is not implemented for Jamo
  * - c is treated as having a combining class of 0
  */
-#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
+#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0]))) /* element count of a true array as int32_t; fully parenthesized */
 
 /*
  * This new implementation of the normalization code loads its data from
@@ -470,23 +470,47 @@ internalGetNXHangul(UErrorCode &errorCode) {
     return nxCache[UNORM_NX_HANGUL];
 }
 
-/* get and set an exclusion set from a UnicodeSet pattern */
+/* unorm.cpp 1.116 had and used
 static const UnicodeSet *
 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
+    ...
+}
+*/
+
+/* get and set an exclusion set from a serialized UnicodeSet whose start and limit offsets into canonStartSets[] are stored in index slots nxIndex and nxIndex+1; the set is only built when both offsets describe a non-empty range */
+static const UnicodeSet *
+internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
     /* internal function, does not check for incoming U_FAILURE */
     UBool isCached;
 
     UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
 
-    if(!isCached) {
-        UnicodeSet *set=new UnicodeSet(UnicodeString(pattern, -1, US_INV), errorCode);
+    if( !isCached &&
+        canonStartSets!=NULL &&
+        canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
+    ) {
+        USerializedSet sset;
+        UnicodeSet *set;
+        UChar32 start, end;
+        int32_t i;
+
+        if( !uset_getSerializedSet(
+                    &sset,
+                    canonStartSets+canonStartSets[nxIndex],
+                    canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
+        ) {
+            errorCode=U_INVALID_FORMAT_ERROR;
+            return NULL;
+        }
+
+        /* turn the serialized set into a UnicodeSet by adding each of its ranges */
+        set=new UnicodeSet();
         if(set==NULL) {
             errorCode=U_MEMORY_ALLOCATION_ERROR;
             return NULL;
         }
-        if(U_FAILURE(errorCode)) {
-            delete set;
-            return NULL;
+        for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
+            set->add(start, end);
         }
 
         umtx_lock(NULL);
@@ -504,24 +528,25 @@ internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &error
 
 static const UnicodeSet *
 internalGetNXCJKCompat(UErrorCode &errorCode) {
-    /* build a set from [CJK Ideographs]&[has canonical decomposition] */
-    return internalGetNXFromPattern(
+    /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
+    return internalGetSerializedNX(
                 UNORM_NX_CJK_COMPAT,
-                "[:Ideographic:]&[:NFD_QC=No:]",
+                _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
                 errorCode);
 }
 
 static const UnicodeSet *
 internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
     /* internal function, does not check for incoming U_FAILURE */
-    const char *pattern;
+    int32_t nxIndex;
 
     options&=_NORM_OPTIONS_UNICODE_MASK;
     switch(options) {
     case 0:
         return NULL;
     case UNORM_UNICODE_3_2:
-        pattern="[:^Age=3.2:]";
+        /* [:^Age=3.2:] */
+        nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
         break;
     default:
         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
@@ -529,7 +554,7 @@ internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
     }
 
     /* build a set with all code points that were not designated by the specified Unicode version */
-    return internalGetNXFromPattern(options, pattern, errorCode);
+    return internalGetSerializedNX(options, nxIndex, errorCode);
 }
 
 /* Get a decomposition exclusion set. The data must be loaded. */
diff --git a/icu4c/source/common/unormimp.h b/icu4c/source/common/unormimp.h
index acc58d8618b..f26aa999fc7 100644
--- a/icu4c/source/common/unormimp.h
+++ b/icu4c/source/common/unormimp.h
@@ -92,11 +92,19 @@ enum {
 
 /* canonStartSets[0..31] contains indexes for what is in the array */
 enum {
-    _NORM_SET_INDEX_CANON_SETS_LENGTH,     /* number of uint16_t in canonical starter sets */
+    _NORM_SET_INDEX_CANON_SETS_LENGTH,      /* number of uint16_t in canonical starter sets */
     _NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH, /* number of uint16_t in the BMP search table (contains pairs) */
     _NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH,/* number of uint16_t in the supplementary search table (contains triplets) */
 
-    _NORM_SET_INDEX_TOP=32                 /* changing this requires a new formatVersion */
+    /* from formatVersion 2.3: */
+    _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,   /* uint16_t offset from canonStartSets[0] to the
+                                               exclusion set for CJK compatibility characters */
+    _NORM_SET_INDEX_NX_UNICODE32_OFFSET,    /* uint16_t offset from canonStartSets[0] to the
+                                               exclusion set for Unicode 3.2 characters */
+    _NORM_SET_INDEX_NX_RESERVED_OFFSET,     /* uint16_t offset from canonStartSets[0] to the
+                                               end of the previous exclusion set */
+
+    _NORM_SET_INDEX_TOP=32                  /* changing this requires a new formatVersion */
 };
 
 /* more constants for canonical starter sets */
@@ -401,12 +409,14 @@ U_CAPI UNormalizationCheckResult U_EXPORT2
 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
 
 /**
- * Description of the format of unorm.dat version 2.2.
+ * Description of the format of unorm.icu version 2.3.
  *
  * Main change from version 1 to version 2:
  * Use of new, common UTrie instead of normalization-specific tries.
  * Change to version 2.1: add third/auxiliary trie with associated data.
  * Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK).
+ * Change to version 2.3: add serialized sets for normalization exclusions
+ *                        stored inside canonStartSets[]
  *
  * For more details of how to use the data structures see the code
  * in unorm.cpp (runtime normalization code) and
@@ -690,6 +700,31 @@ unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
  *     if the high word has bit 15 set, then build a set with a single code point
  *         which is (((high16(cp)&0x1f00)<<8)|result;
  *     else there is a USerializedSet at canonStartSets+result
+ *
+ * FormatVersion 2.3 adds 2 serialized sets for normalization exclusions.
+ * They are stored in the data file so that the runtime normalization code need
+ * not depend on other properties and their data and implementation files.
+ * The _NORM_SET_INDEX_NX_..._OFFSET offsets in the canonStartSets index table
+ * give the location for each set.
+ * There is no set stored for UNORM_NX_HANGUL because it's trivial to create
+ * without using properties.
+ *
+ * Set contents:
+ *
+ * _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET (for UNORM_NX_CJK_COMPAT)
+ *     [[:Ideographic:]&[:NFD_QC=No:]]
+ *     =[CJK Ideographs]&[has canonical decomposition]
+ *
+ * _NORM_SET_INDEX_NX_UNICODE32_OFFSET (for UNORM_UNICODE_3_2)
+ *     [:^Age=3.2:]
+ *     =set with all code points that were not designated by the specified Unicode version
+ *
+ * _NORM_SET_INDEX_NX_RESERVED_OFFSET
+ *     This is an offset that points to where the next, future set would start.
+ *     Currently it indicates where the previous set ends, and thus its length.
+ *     The name for this enum constant may in the future be applied to different
+ *     index slots. In order to get the limit of a set, use its index slot and
+ *     the immediately following one regardless of that one's enum name.
*/
 
 #endif /* #if !UCONFIG_NO_NORMALIZATION */
diff --git a/icu4c/source/tools/gennorm/store.c b/icu4c/source/tools/gennorm/store.c
index 489ef98050a..38b0d98b4e7 100644
--- a/icu4c/source/tools/gennorm/store.c
+++ b/icu4c/source/tools/gennorm/store.c
@@ -20,6 +20,7 @@
 #include
 #include "unicode/utypes.h"
 #include "unicode/uchar.h"
+#include "unicode/ustring.h"
 #include "cmemory.h"
 #include "cstring.h"
 #include "filestrm.h"
@@ -36,6 +37,8 @@
 
 #define DO_DEBUG_OUT 0
 
+#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0]))) /* element count of a true array as int32_t; fully parenthesized */
+
 /*
  * The new implementation of the normalization code loads its data from
  * unorm.icu, which is generated with this gennorm tool.
@@ -74,7 +77,7 @@ static UDataInfo dataInfo={
     0,
 
     { 0x4e, 0x6f, 0x72, 0x6d },                 /* dataFormat="Norm" */
-    { 2, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
+    { 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT },   /* formatVersion */
     { 3, 2, 0, 0 }                              /* dataVersion (Unicode version) */
 };
 
@@ -140,7 +143,8 @@ static uint16_t combiningTable[0x8000];
 static uint16_t combiningTableTop=0;
 
 #define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
-static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH];
+static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
+                               +10000]; /* +10000 for exclusion sets */
 static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
 static int32_t canonSetsCount=0;
 
@@ -1722,6 +1726,9 @@ generateData(const char *dataDir) {
 
 #else
 
+    U_STRING_DECL(nxCJKCompatPattern, "[[:Ideographic:]&[:NFD_QC=No:]]", 31);
+    U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
+    USet *set;
     int32_t normTrieSize, fcdTrieSize, auxTrieSize;
 
     normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
@@ -1757,6 +1764,38 @@ generateData(const char *dataDir) {
     }
     canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
 
+    /* create the normalization exclusion sets: serialize each one behind the data already in canonStartSets[] and record its start offset in its index slot */
+    U_STRING_INIT(nxCJKCompatPattern, "[[:Ideographic:]&[:NFD_QC=No:]]", 31);
+    U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
+
+    canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop;
+    set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "error: uset_openPattern([[:Ideographic:]&[:NFD_QC=No:]]) failed, %s\n", u_errorName(errorCode));
+        exit(errorCode);
+    }
+    canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "error: uset_serialize([[:Ideographic:]&[:NFD_QC=No:]]) failed, %s\n", u_errorName(errorCode));
+        exit(errorCode);
+    }
+    uset_close(set);
+
+    canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop;
+    set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
+        exit(errorCode);
+    }
+    canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
+        exit(errorCode);
+    }
+    uset_close(set);
+
+    canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop; /* limit of the last exclusion set */
+
     /* make sure that the FCD trie is 4-aligned */
     if((utm_countItems(extraMem)+combiningTableTop)&1) {
         combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
@@ -1789,6 +1828,7 @@ generateData(const char *dataDir) {
         printf(" number of sets %5d\n", (int)canonSetsCount);
         printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
         printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
+        printf(" length of exclusion sets %5u uint16_t\n", (unsigned int)(canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]));
         printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
     }