mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-4078 normalization exclusion sets: move their building (with set patterns) from unorm.cpp to gennorm so that runtime normalization code does not depend on all properties and uniset_props.cpp
X-SVN-Rev: 16304
This commit is contained in:
parent
bd1f26f937
commit
c7b731f94b
3 changed files with 118 additions and 18 deletions
|
@ -74,7 +74,7 @@
|
|||
* except that this is not implemented for Jamo
|
||||
* - c is treated as having a combining class of 0
|
||||
*/
|
||||
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
/*
|
||||
* This new implementation of the normalization code loads its data from
|
||||
|
@ -470,23 +470,47 @@ internalGetNXHangul(UErrorCode &errorCode) {
|
|||
return nxCache[UNORM_NX_HANGUL];
|
||||
}
|
||||
|
||||
/* get and set an exclusion set from a UnicodeSet pattern */
|
||||
/* unorm.cpp 1.116 had and used
|
||||
static const UnicodeSet *
|
||||
internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
|
||||
...
|
||||
}
|
||||
*/
|
||||
|
||||
/* get and set an exclusion set from a serialized UnicodeSet */
|
||||
static const UnicodeSet *
|
||||
internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
|
||||
/* internal function, does not check for incoming U_FAILURE */
|
||||
UBool isCached;
|
||||
|
||||
UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
|
||||
|
||||
if(!isCached) {
|
||||
UnicodeSet *set=new UnicodeSet(UnicodeString(pattern, -1, US_INV), errorCode);
|
||||
if( !isCached &&
|
||||
canonStartSets!=NULL &&
|
||||
canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
|
||||
) {
|
||||
USerializedSet sset;
|
||||
UnicodeSet *set;
|
||||
UChar32 start, end;
|
||||
int32_t i;
|
||||
|
||||
if( !uset_getSerializedSet(
|
||||
&sset,
|
||||
canonStartSets+canonStartSets[nxIndex],
|
||||
canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
|
||||
) {
|
||||
errorCode=U_INVALID_FORMAT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* turn the serialized set into a UnicodeSet */
|
||||
set=new UnicodeSet();
|
||||
if(set==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
delete set;
|
||||
return NULL;
|
||||
for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
|
||||
set->add(start, end);
|
||||
}
|
||||
|
||||
umtx_lock(NULL);
|
||||
|
@ -504,24 +528,25 @@ internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &error
|
|||
|
||||
static const UnicodeSet *
|
||||
internalGetNXCJKCompat(UErrorCode &errorCode) {
|
||||
/* build a set from [CJK Ideographs]&[has canonical decomposition] */
|
||||
return internalGetNXFromPattern(
|
||||
/* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
|
||||
return internalGetSerializedNX(
|
||||
UNORM_NX_CJK_COMPAT,
|
||||
"[:Ideographic:]&[:NFD_QC=No:]",
|
||||
_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
|
||||
errorCode);
|
||||
}
|
||||
|
||||
static const UnicodeSet *
|
||||
internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
|
||||
/* internal function, does not check for incoming U_FAILURE */
|
||||
const char *pattern;
|
||||
int32_t nxIndex;
|
||||
|
||||
options&=_NORM_OPTIONS_UNICODE_MASK;
|
||||
switch(options) {
|
||||
case 0:
|
||||
return NULL;
|
||||
case UNORM_UNICODE_3_2:
|
||||
pattern="[:^Age=3.2:]";
|
||||
/* [:^Age=3.2:] */
|
||||
nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
|
||||
break;
|
||||
default:
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
@ -529,7 +554,7 @@ internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
|
|||
}
|
||||
|
||||
/* build a set with all code points that were not designated by the specified Unicode version */
|
||||
return internalGetNXFromPattern(options, pattern, errorCode);
|
||||
return internalGetSerializedNX(options, nxIndex, errorCode);
|
||||
}
|
||||
|
||||
/* Get a decomposition exclusion set. The data must be loaded. */
|
||||
|
|
|
@ -92,11 +92,19 @@ enum {
|
|||
|
||||
/* canonStartSets[0..31] contains indexes for what is in the array */
|
||||
enum {
|
||||
_NORM_SET_INDEX_CANON_SETS_LENGTH, /* number of uint16_t in canonical starter sets */
|
||||
_NORM_SET_INDEX_CANON_SETS_LENGTH, /* number of uint16_t in canonical starter sets */
|
||||
_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH, /* number of uint16_t in the BMP search table (contains pairs) */
|
||||
_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH,/* number of uint16_t in the supplementary search table (contains triplets) */
|
||||
|
||||
_NORM_SET_INDEX_TOP=32 /* changing this requires a new formatVersion */
|
||||
/* from formatVersion 2.3: */
|
||||
_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET, /* uint16_t offset from canonStartSets[0] to the
|
||||
exclusion set for CJK compatibility characters */
|
||||
_NORM_SET_INDEX_NX_UNICODE32_OFFSET, /* uint16_t offset from canonStartSets[0] to the
|
||||
exclusion set for Unicode 3.2 characters */
|
||||
_NORM_SET_INDEX_NX_RESERVED_OFFSET, /* uint16_t offset from canonStartSets[0] to the
|
||||
end of the previous exclusion set */
|
||||
|
||||
_NORM_SET_INDEX_TOP=32 /* changing this requires a new formatVersion */
|
||||
};
|
||||
|
||||
/* more constants for canonical starter sets */
|
||||
|
@ -401,12 +409,14 @@ U_CAPI UNormalizationCheckResult U_EXPORT2
|
|||
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
|
||||
|
||||
/**
|
||||
* Description of the format of unorm.dat version 2.2.
|
||||
* Description of the format of unorm.icu version 2.3.
|
||||
*
|
||||
* Main change from version 1 to version 2:
|
||||
* Use of new, common UTrie instead of normalization-specific tries.
|
||||
* Change to version 2.1: add third/auxiliary trie with associated data.
|
||||
* Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK).
|
||||
* Change to version 2.3: add serialized sets for normalization exclusions
|
||||
* stored inside canonStartSets[]
|
||||
*
|
||||
* For more details of how to use the data structures see the code
|
||||
* in unorm.cpp (runtime normalization code) and
|
||||
|
@ -690,6 +700,31 @@ unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
|
|||
* if the high word has bit 15 set, then build a set with a single code point
|
||||
* which is (((high16(cp)&0x1f00)<<8)|result);
|
||||
* else there is a USerializedSet at canonStartSets+result
|
||||
*
|
||||
* FormatVersion 2.3 adds 2 serialized sets for normalization exclusions.
|
||||
* They are stored in the data file so that the runtime normalization code need
|
||||
* not depend on other properties and their data and implementation files.
|
||||
* The _NORM_SET_INDEX_NX_..._OFFSET offsets in the canonStartSets index table
|
||||
* give the location for each set.
|
||||
* There is no set stored for UNORM_NX_HANGUL because it's trivial to create
|
||||
* without using properties.
|
||||
*
|
||||
* Set contents:
|
||||
*
|
||||
* _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET (for UNORM_NX_CJK_COMPAT)
|
||||
* [[:Ideographic:]&[:NFD_QC=No:]]
|
||||
* =[CJK Ideographs]&[has canonical decomposition]
|
||||
*
|
||||
* _NORM_SET_INDEX_NX_UNICODE32_OFFSET (for UNORM_UNICODE_3_2)
|
||||
* [:^Age=3.2:]
|
||||
* =set with all code points that were not designated by the specified Unicode version
|
||||
*
|
||||
* _NORM_SET_INDEX_NX_RESERVED_OFFSET
|
||||
* This is an offset that points to where the next, future set would start.
|
||||
* Currently it indicates where the previous set ends, and thus its length.
|
||||
* The name for this enum constant may in the future be applied to different
|
||||
* index slots. In order to get the limit of a set, use its index slot and
|
||||
* the immediately following one regardless of that one's enum name.
|
||||
*/
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "filestrm.h"
|
||||
|
@ -36,6 +37,8 @@
|
|||
|
||||
#define DO_DEBUG_OUT 0
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
/*
|
||||
* The new implementation of the normalization code loads its data from
|
||||
* unorm.icu, which is generated with this gennorm tool.
|
||||
|
@ -74,7 +77,7 @@ static UDataInfo dataInfo={
|
|||
0,
|
||||
|
||||
{ 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
|
||||
{ 2, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
|
||||
{ 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
|
||||
{ 3, 2, 0, 0 } /* dataVersion (Unicode version) */
|
||||
};
|
||||
|
||||
|
@ -140,7 +143,8 @@ static uint16_t combiningTable[0x8000];
|
|||
static uint16_t combiningTableTop=0;
|
||||
|
||||
#define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
|
||||
static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH];
|
||||
static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
|
||||
+10000]; /* +10000 for exclusion sets */
|
||||
static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
|
||||
static int32_t canonSetsCount=0;
|
||||
|
||||
|
@ -1722,6 +1726,9 @@ generateData(const char *dataDir) {
|
|||
|
||||
#else
|
||||
|
||||
U_STRING_DECL(nxCJKCompatPattern, "[[:Ideographic:]&[:NFD_QC=No:]]", 31);
|
||||
U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
|
||||
USet *set;
|
||||
int32_t normTrieSize, fcdTrieSize, auxTrieSize;
|
||||
|
||||
normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
|
||||
|
@ -1757,6 +1764,38 @@ generateData(const char *dataDir) {
|
|||
}
|
||||
canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
|
||||
|
||||
/* create the normalization exclusion sets */
|
||||
U_STRING_INIT(nxCJKCompatPattern, "[[:Ideographic:]&[:NFD_QC=No:]]", 31);
|
||||
U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
|
||||
|
||||
canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop;
|
||||
set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
uset_close(set);
|
||||
|
||||
canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop;
|
||||
set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
uset_close(set);
|
||||
|
||||
canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop;
|
||||
|
||||
/* make sure that the FCD trie is 4-aligned */
|
||||
if((utm_countItems(extraMem)+combiningTableTop)&1) {
|
||||
combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
|
||||
|
@ -1789,6 +1828,7 @@ generateData(const char *dataDir) {
|
|||
printf(" number of sets %5d\n", (int)canonSetsCount);
|
||||
printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
|
||||
printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
|
||||
printf(" length of exclusion sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]);
|
||||
printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue