ICU-4078 normalization exclusion sets: move their building (with set patterns) from unorm.cpp to gennorm so that runtime normalization code does not depend on all properties and uniset_props.cpp

X-SVN-Rev: 16304
This commit is contained in:
Markus Scherer 2004-09-12 16:59:20 +00:00
parent bd1f26f937
commit c7b731f94b
3 changed files with 118 additions and 18 deletions

View file

@ -74,7 +74,7 @@
* except that this is not implemented for Jamo
* - c is treated as having a combining class of 0
*/
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
/*
* This new implementation of the normalization code loads its data from
@ -470,23 +470,47 @@ internalGetNXHangul(UErrorCode &errorCode) {
return nxCache[UNORM_NX_HANGUL];
}
/* get and set an exclusion set from a UnicodeSet pattern */
/* unorm.cpp 1.116 had and used
static const UnicodeSet *
internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
...
}
*/
/* get and set an exclusion set from a serialized UnicodeSet */
static const UnicodeSet *
internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
/* internal function, does not check for incoming U_FAILURE */
UBool isCached;
UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
if(!isCached) {
UnicodeSet *set=new UnicodeSet(UnicodeString(pattern, -1, US_INV), errorCode);
if( !isCached &&
canonStartSets!=NULL &&
canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
) {
USerializedSet sset;
UnicodeSet *set;
UChar32 start, end;
int32_t i;
if( !uset_getSerializedSet(
&sset,
canonStartSets+canonStartSets[nxIndex],
canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
) {
errorCode=U_INVALID_FORMAT_ERROR;
return NULL;
}
/* turn the serialized set into a UnicodeSet */
set=new UnicodeSet();
if(set==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
if(U_FAILURE(errorCode)) {
delete set;
return NULL;
for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
set->add(start, end);
}
umtx_lock(NULL);
@ -504,24 +528,25 @@ internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &error
static const UnicodeSet *
internalGetNXCJKCompat(UErrorCode &errorCode) {
/* build a set from [CJK Ideographs]&[has canonical decomposition] */
return internalGetNXFromPattern(
/* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
return internalGetSerializedNX(
UNORM_NX_CJK_COMPAT,
"[:Ideographic:]&[:NFD_QC=No:]",
_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
errorCode);
}
static const UnicodeSet *
internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
/* internal function, does not check for incoming U_FAILURE */
const char *pattern;
int32_t nxIndex;
options&=_NORM_OPTIONS_UNICODE_MASK;
switch(options) {
case 0:
return NULL;
case UNORM_UNICODE_3_2:
pattern="[:^Age=3.2:]";
/* [:^Age=3.2:] */
nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
break;
default:
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
@ -529,7 +554,7 @@ internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
}
/* build a set with all code points that were not designated by the specified Unicode version */
return internalGetNXFromPattern(options, pattern, errorCode);
return internalGetSerializedNX(options, nxIndex, errorCode);
}
/* Get a decomposition exclusion set. The data must be loaded. */

View file

@ -92,11 +92,19 @@ enum {
/* canonStartSets[0..31] contains indexes for what is in the array */
enum {
_NORM_SET_INDEX_CANON_SETS_LENGTH, /* number of uint16_t in canonical starter sets */
_NORM_SET_INDEX_CANON_SETS_LENGTH, /* number of uint16_t in canonical starter sets */
_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH, /* number of uint16_t in the BMP search table (contains pairs) */
_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH,/* number of uint16_t in the supplementary search table (contains triplets) */
_NORM_SET_INDEX_TOP=32 /* changing this requires a new formatVersion */
/* from formatVersion 2.3: */
_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET, /* uint16_t offset from canonStartSets[0] to the
exclusion set for CJK compatibility characters */
_NORM_SET_INDEX_NX_UNICODE32_OFFSET, /* uint16_t offset from canonStartSets[0] to the
exclusion set for Unicode 3.2 characters */
_NORM_SET_INDEX_NX_RESERVED_OFFSET, /* uint16_t offset from canonStartSets[0] to the
end of the previous exclusion set */
_NORM_SET_INDEX_TOP=32 /* changing this requires a new formatVersion */
};
/* more constants for canonical starter sets */
@ -401,12 +409,14 @@ U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
/**
* Description of the format of unorm.dat version 2.2.
* Description of the format of unorm.icu version 2.3.
*
* Main change from version 1 to version 2:
* Use of new, common UTrie instead of normalization-specific tries.
* Change to version 2.1: add third/auxiliary trie with associated data.
* Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK).
* Change to version 2.3: add serialized sets for normalization exclusions
* stored inside canonStartSets[]
*
* For more details of how to use the data structures see the code
* in unorm.cpp (runtime normalization code) and
@ -690,6 +700,31 @@ unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
* if the high word has bit 15 set, then build a set with a single code point
* which is (((high16(cp)&0x1f00)<<8)|result);
* else there is a USerializedSet at canonStartSets+result
*
* FormatVersion 2.3 adds 2 serialized sets for normalization exclusions.
* They are stored in the data file so that the runtime normalization code need
* not depend on other properties and their data and implementation files.
* The _NORM_SET_INDEX_NX_..._OFFSET offsets in the canonStartSets index table
* give the location for each set.
* There is no set stored for UNORM_NX_HANGUL because it's trivial to create
* without using properties.
*
* Set contents:
*
* _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET (for UNORM_NX_CJK_COMPAT)
* [[:Ideographic:]&[:NFD_QC=No:]]
* =[CJK Ideographs]&[has canonical decomposition]
*
* _NORM_SET_INDEX_NX_UNICODE32_OFFSET (for UNORM_UNICODE_3_2)
* [:^Age=3.2:]
* =set with all code points that were not designated by the specified Unicode version
*
* _NORM_SET_INDEX_NX_RESERVED_OFFSET
* This is an offset that points to where the next, future set would start.
* Currently it indicates where the previous set ends, and thus that set's length.
* The name for this enum constant may in the future be applied to a different
* index slot. In order to get the limit of a set, read the index slot that
* immediately follows the set's own slot, regardless of that slot's enum name.
*/
#endif /* #if !UCONFIG_NO_NORMALIZATION */

View file

@ -20,6 +20,7 @@
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "cstring.h"
#include "filestrm.h"
@ -36,6 +37,8 @@
#define DO_DEBUG_OUT 0
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
/*
* The new implementation of the normalization code loads its data from
* unorm.icu, which is generated with this gennorm tool.
@ -74,7 +77,7 @@ static UDataInfo dataInfo={
0,
{ 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
{ 2, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
{ 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
{ 3, 2, 0, 0 } /* dataVersion (Unicode version) */
};
@ -140,7 +143,8 @@ static uint16_t combiningTable[0x8000];
static uint16_t combiningTableTop=0;
#define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH];
static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
+10000]; /* +10000 for exclusion sets */
static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
static int32_t canonSetsCount=0;
@ -1722,6 +1726,9 @@ generateData(const char *dataDir) {
#else
U_STRING_DECL(nxCJKCompatPattern, "[[:Ideographic:]&[:NFD_QC=No:]]", 31);
U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
USet *set;
int32_t normTrieSize, fcdTrieSize, auxTrieSize;
normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
@ -1757,6 +1764,38 @@ generateData(const char *dataDir) {
}
canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
/* create the normalization exclusion sets */
U_STRING_INIT(nxCJKCompatPattern, "[[:Ideographic:]&[:NFD_QC=No:]]", 31);
U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop;
set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
exit(errorCode);
}
canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
exit(errorCode);
}
uset_close(set);
canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop;
set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
exit(errorCode);
}
canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
exit(errorCode);
}
uset_close(set);
canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop;
/* make sure that the FCD trie is 4-aligned */
if((utm_countItems(extraMem)+combiningTableTop)&1) {
combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
@ -1789,6 +1828,7 @@ generateData(const char *dataDir) {
printf(" number of sets %5d\n", (int)canonSetsCount);
printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
printf(" length of exclusion sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]);
printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
}