mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-8804 Normalizer2::getRawDecomposition(c) with added data in .nrm formatVersion 2
X-SVN-Rev: 30980
This commit is contained in:
parent
0ff00f0a40
commit
03748b07f1
15 changed files with 629 additions and 99 deletions
|
@ -155,6 +155,11 @@ FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) c
|
|||
return set.contains(c) && norm2.getDecomposition(c, decomposition);
|
||||
}
|
||||
|
||||
// Raw (non-recursive) decomposition lookup, restricted by the filter set.
// Code points outside of the set are reported as having no raw decomposition
// without consulting norm2; all others are delegated to norm2.
// Returns TRUE only if c is in the set and norm2 reports a raw decomposition for it.
UBool
FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
    if(!set.contains(c)) {
        return FALSE;  // filtered out: norm2 is not called
    }
    if(norm2.getRawDecomposition(c, decomposition)) {
        return TRUE;
    }
    return FALSE;
}
|
||||
|
||||
uint8_t
|
||||
FilteredNormalizer2::getCombiningClass(UChar32 c) const {
|
||||
return set.contains(c) ? norm2.getCombiningClass(c) : 0;
|
||||
|
|
|
@ -35,6 +35,18 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
Normalizer2::~Normalizer2() {}
|
||||
|
||||
// Default implementation of the raw-decomposition lookup:
// reports that there is no raw decomposition for any code point
// and does not touch the output string.
UBool
Normalizer2::getRawDecomposition(UChar32 /*c*/, UnicodeString & /*decomposition*/) const {
    return FALSE;
}
|
||||
|
||||
// Default implementation: reports combining class 0 for every code point.
uint8_t
Normalizer2::getCombiningClass(UChar32) const {
    return 0;
}
|
||||
|
||||
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(Normalizer2)
|
||||
|
||||
// Normalizer2 implementation for the old UNORM_NONE.
|
||||
class NoopNormalizer2 : public Normalizer2 {
|
||||
virtual ~NoopNormalizer2();
|
||||
|
@ -82,6 +94,7 @@ class NoopNormalizer2 : public Normalizer2 {
|
|||
getDecomposition(UChar32, UnicodeString &) const {
|
||||
return FALSE;
|
||||
}
|
||||
// No need to override the default getRawDecomposition().
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &, UErrorCode &) const {
|
||||
return TRUE;
|
||||
|
@ -195,6 +208,21 @@ public:
|
|||
}
|
||||
return TRUE;
|
||||
}
|
||||
// Gets the raw decomposition of c from impl and stores it in decomposition.
// Returns FALSE (leaving decomposition untouched) if impl reports no decomposition,
// otherwise TRUE.
virtual UBool
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
    // Scratch space for mappings that impl has to write out itself;
    // impl.getRawDecomposition() is declared with a 30-unit buffer.
    UChar scratch[30];
    int32_t mappingLength;
    const UChar *mapping=impl.getRawDecomposition(c, scratch, mappingLength);
    if(mapping!=NULL) {
        if(mapping!=scratch) {
            // The result points somewhere other than our local array.
            decomposition.setTo(FALSE, mapping, mappingLength);  // read-only alias
        } else {
            // The result lives in our local array, which goes out of scope on return.
            decomposition.setTo(scratch, mappingLength);  // copy the string
        }
        return TRUE;
    }
    return FALSE;
}
|
||||
|
||||
virtual uint8_t
|
||||
getCombiningClass(UChar32 c) const {
|
||||
|
@ -656,13 +684,6 @@ Normalizer2::getInstance(const char *packageName,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
// Default implementation: reports combining class 0 for every code point.
// NOTE(review): an identical definition of this function appears earlier in this
// listing (right after the Normalizer2 destructor); only one of the two can remain
// in a single translation unit -- confirm which one belongs to the current revision.
uint8_t
Normalizer2::getCombiningClass(UChar32) const {
    return 0;
}
|
||||
|
||||
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(Normalizer2)
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// C API ------------------------------------------------------------------- ***
|
||||
|
@ -813,6 +834,25 @@ unorm2_getDecomposition(const UNormalizer2 *norm2,
|
|||
}
|
||||
}
|
||||
|
||||
// C API wrapper for Normalizer2::getRawDecomposition().
// Returns 0 without doing anything if *pErrorCode already indicates a failure,
// and 0 with U_ILLEGAL_ARGUMENT_ERROR for an inconsistent buffer/capacity pair.
// Returns -1 if the Normalizer2 reports no raw decomposition for c;
// otherwise returns the result of extracting the mapping into decomposition.
U_DRAFT int32_t U_EXPORT2
unorm2_getRawDecomposition(const UNormalizer2 *norm2,
                           UChar32 c, UChar *decomposition, int32_t capacity,
                           UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // A NULL buffer is only acceptable together with capacity 0;
    // a non-NULL buffer must not come with a negative capacity.
    if((decomposition==NULL && capacity!=0) || (decomposition!=NULL && capacity<0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // Wrap the caller's buffer so that the C++ method can write into it.
    UnicodeString destString(decomposition, 0, capacity);
    const Normalizer2 *n2=reinterpret_cast<const Normalizer2 *>(norm2);
    if(!n2->getRawDecomposition(c, destString)) {
        return -1;  // no raw decomposition
    }
    return destString.extract(decomposition, capacity, *pErrorCode);
}
|
||||
|
||||
U_DRAFT uint8_t U_EXPORT2
|
||||
unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
|
||||
return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
|
||||
|
|
|
@ -270,7 +270,7 @@ Normalizer2Impl::isAcceptable(void *context,
|
|||
pInfo->dataFormat[1]==0x72 &&
|
||||
pInfo->dataFormat[2]==0x6d &&
|
||||
pInfo->dataFormat[3]==0x32 &&
|
||||
pInfo->formatVersion[0]==1
|
||||
pInfo->formatVersion[0]==2
|
||||
) {
|
||||
Normalizer2Impl *me=(Normalizer2Impl *)context;
|
||||
uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
|
||||
|
@ -315,8 +315,30 @@ Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &err
|
|||
}
|
||||
|
||||
offset=nextOffset;
|
||||
nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
|
||||
maybeYesCompositions=(const uint16_t *)(inBytes+offset);
|
||||
extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
|
||||
|
||||
// smallFCD: new in formatVersion 2
|
||||
offset=nextOffset;
|
||||
smallFCD=inBytes+offset;
|
||||
|
||||
// Build tccc180[].
|
||||
// gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
|
||||
uint8_t bits=0;
|
||||
for(UChar c=0; c<0x180; bits>>=1) {
|
||||
if((c&0xff)==0) {
|
||||
bits=smallFCD[c>>8]; // one byte per 0x100 code points
|
||||
}
|
||||
if(bits&1) {
|
||||
for(int i=0; i<0x20; ++i, ++c) {
|
||||
tccc180[c]=(uint8_t)getFCD16FromNormData(c);
|
||||
}
|
||||
} else {
|
||||
uprv_memset(tccc180+c, 0, 0x20);
|
||||
c+=0x20;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
|
||||
|
@ -524,16 +546,16 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
|
|||
} else {
|
||||
// c decomposes, get everything from the variable-length extra data
|
||||
const uint16_t *mapping=getMapping(norm16);
|
||||
uint16_t firstUnit=*mapping++;
|
||||
uint16_t firstUnit=*mapping;
|
||||
int32_t length=firstUnit&MAPPING_LENGTH_MASK;
|
||||
uint8_t leadCC, trailCC;
|
||||
trailCC=(uint8_t)(firstUnit>>8);
|
||||
if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
|
||||
leadCC=(uint8_t)(*mapping++>>8);
|
||||
leadCC=(uint8_t)(*(mapping-1)>>8);
|
||||
} else {
|
||||
leadCC=0;
|
||||
}
|
||||
return buffer.append((const UChar *)mapping, length, leadCC, trailCC, errorCode);
|
||||
return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -558,12 +580,57 @@ Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) c
|
|||
} else {
|
||||
// c decomposes, get everything from the variable-length extra data
|
||||
const uint16_t *mapping=getMapping(norm16);
|
||||
uint16_t firstUnit=*mapping++;
|
||||
length=firstUnit&MAPPING_LENGTH_MASK;
|
||||
if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
|
||||
++mapping;
|
||||
length=*mapping&MAPPING_LENGTH_MASK;
|
||||
return (const UChar *)mapping+1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
|
||||
// so that a raw mapping fits that consists of one unit ("rm0")
|
||||
// plus all but the first two code units of the normal mapping.
|
||||
// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
|
||||
const UChar *
|
||||
Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
|
||||
// We do not loop in this method because an algorithmic mapping itself
|
||||
// becomes a final result rather than having to be decomposed recursively.
|
||||
uint16_t norm16;
|
||||
if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
|
||||
// c does not decompose
|
||||
return NULL;
|
||||
} else if(isHangul(norm16)) {
|
||||
// Hangul syllable: decompose algorithmically
|
||||
Hangul::getRawDecomposition(c, buffer);
|
||||
length=2;
|
||||
return buffer;
|
||||
} else if(isDecompNoAlgorithmic(norm16)) {
|
||||
c=mapAlgorithmic(c, norm16);
|
||||
length=0;
|
||||
U16_APPEND_UNSAFE(buffer, length, c);
|
||||
return buffer;
|
||||
} else {
|
||||
// c decomposes, get everything from the variable-length extra data
|
||||
const uint16_t *mapping=getMapping(norm16);
|
||||
uint16_t firstUnit=*mapping;
|
||||
int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
|
||||
if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
|
||||
// Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
|
||||
// Bit 7=MAPPING_HAS_CCC_LCCC_WORD
|
||||
const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
|
||||
uint16_t rm0=*rawMapping;
|
||||
if(rm0<=MAPPING_LENGTH_MASK) {
|
||||
length=rm0;
|
||||
return rawMapping-rm0;
|
||||
} else {
|
||||
// Copy the normal mapping and replace its first two code units with rm0.
|
||||
buffer[0]=(UChar)rm0;
|
||||
u_memcpy(buffer+1, mapping+1+2, mLength-2);
|
||||
length=mLength-1;
|
||||
return buffer;
|
||||
}
|
||||
return (const UChar *)mapping;
|
||||
} else {
|
||||
length=mLength;
|
||||
return (const UChar *)mapping+1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -611,7 +678,7 @@ UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
|
|||
} else {
|
||||
// c decomposes, get everything from the variable-length extra data
|
||||
const uint16_t *mapping=getMapping(norm16);
|
||||
uint16_t firstUnit=*mapping++;
|
||||
uint16_t firstUnit=*mapping;
|
||||
if((firstUnit&MAPPING_LENGTH_MASK)==0) {
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -627,7 +694,7 @@ UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
|
|||
// if(trailCC==1) test leadCC==0, same as checking for before-boundary
|
||||
}
|
||||
// TRUE if leadCC==0 (hasFCDBoundaryBefore())
|
||||
return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*mapping&0xff00)==0;
|
||||
return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1327,14 +1394,14 @@ UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
|
|||
} else {
|
||||
// c decomposes, get everything from the variable-length extra data
|
||||
const uint16_t *mapping=getMapping(norm16);
|
||||
uint16_t firstUnit=*mapping++;
|
||||
uint16_t firstUnit=*mapping;
|
||||
if((firstUnit&MAPPING_LENGTH_MASK)==0) {
|
||||
return FALSE;
|
||||
}
|
||||
if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*mapping++&0xff00)) {
|
||||
if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
|
||||
return FALSE; // non-zero leadCC
|
||||
}
|
||||
int32_t i=0;
|
||||
int32_t i=1; // skip over the firstUnit
|
||||
UChar32 c;
|
||||
U16_NEXT_UNSAFE(mapping, i, c);
|
||||
return isCompYesAndZeroCC(getNorm16(c));
|
||||
|
@ -1348,7 +1415,8 @@ UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBo
|
|||
if(isInert(norm16)) {
|
||||
return TRUE;
|
||||
} else if(norm16<=minYesNo) {
|
||||
// Hangul LVT (==minYesNo) has a boundary after it.
|
||||
// Hangul: norm16==minYesNo
|
||||
// Hangul LVT has a boundary after it.
|
||||
// Hangul LV and non-inert yesYes characters combine forward.
|
||||
return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
|
||||
} else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
|
||||
|
@ -1362,12 +1430,13 @@ UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBo
|
|||
const uint16_t *mapping=getMapping(norm16);
|
||||
uint16_t firstUnit=*mapping;
|
||||
// TRUE if
|
||||
// c is not deleted, and
|
||||
// it and its decomposition do not combine forward, and it has a starter, and
|
||||
// if FCC then trailCC<=1
|
||||
// not MAPPING_NO_COMP_BOUNDARY_AFTER
|
||||
// (which is set if
|
||||
// c is not deleted, and
|
||||
// it and its decomposition do not combine forward, and it has a starter)
|
||||
// and if FCC then trailCC<=1
|
||||
return
|
||||
(firstUnit&MAPPING_LENGTH_MASK)!=0 &&
|
||||
(firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 &&
|
||||
(firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
|
||||
(!onlyContiguous || firstUnit<=0x1ff);
|
||||
}
|
||||
}
|
||||
|
@ -1462,7 +1531,7 @@ void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCod
|
|||
void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
|
||||
UTrie2 *newFCDTrie, UErrorCode &errorCode) const {
|
||||
// Only loops for 1:1 algorithmic mappings.
|
||||
for(;;) { /* loop doesn't iterate */
|
||||
for(;;) {
|
||||
if(norm16>=MIN_NORMAL_MAYBE_YES) {
|
||||
norm16&=0xff;
|
||||
norm16|=norm16<<8;
|
||||
|
@ -1492,12 +1561,10 @@ void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t no
|
|||
// characters on both sides will become adjacent.
|
||||
norm16=0x1ff;
|
||||
} else {
|
||||
norm16=firstUnit>>8; // tccc
|
||||
if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
|
||||
norm16=mapping[1]&0xff00; // lccc
|
||||
} else {
|
||||
norm16=0;
|
||||
norm16|=*(mapping-1)&0xff00; // lccc
|
||||
}
|
||||
norm16|=firstUnit>>8; // tccc
|
||||
}
|
||||
}
|
||||
utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode);
|
||||
|
@ -1511,6 +1578,42 @@ const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const {
|
|||
return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode);
|
||||
}
|
||||
|
||||
// Gets the FCD value from the regular normalization data.
|
||||
uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
|
||||
// Only loops for 1:1 algorithmic mappings.
|
||||
for(;;) {
|
||||
uint16_t norm16=getNorm16(c);
|
||||
if(norm16<=minYesNo) {
|
||||
// no decomposition or Hangul syllable, all zeros
|
||||
return 0;
|
||||
} else if(norm16>=MIN_NORMAL_MAYBE_YES) {
|
||||
// combining mark
|
||||
norm16&=0xff;
|
||||
return norm16|(norm16<<8);
|
||||
} else if(norm16>=minMaybeYes) {
|
||||
return 0;
|
||||
} else if(isDecompNoAlgorithmic(norm16)) {
|
||||
c=mapAlgorithmic(c, norm16);
|
||||
} else {
|
||||
// c decomposes, get everything from the variable-length extra data
|
||||
const uint16_t *mapping=getMapping(norm16);
|
||||
uint16_t firstUnit=*mapping;
|
||||
if((firstUnit&MAPPING_LENGTH_MASK)==0) {
|
||||
// A character that is deleted (maps to an empty string) must
|
||||
// get the worst-case lccc and tccc values because arbitrary
|
||||
// characters on both sides will become adjacent.
|
||||
return 0x1ff;
|
||||
} else {
|
||||
norm16=firstUnit>>8; // tccc
|
||||
if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
|
||||
norm16|=*(mapping-1)&0xff00; // lccc
|
||||
}
|
||||
return norm16;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Dual functionality:
|
||||
// buffer!=NULL: normalize
|
||||
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
|
||||
|
@ -1836,16 +1939,16 @@ void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, ui
|
|||
if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
|
||||
// c decomposes, get everything from the variable-length extra data
|
||||
const uint16_t *mapping=getMapping(norm16_2);
|
||||
uint16_t firstUnit=*mapping++;
|
||||
uint16_t firstUnit=*mapping;
|
||||
int32_t length=firstUnit&MAPPING_LENGTH_MASK;
|
||||
if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
|
||||
if(c==c2 && (*mapping&0xff)!=0) {
|
||||
if(c==c2 && (*(mapping-1)&0xff)!=0) {
|
||||
newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
|
||||
}
|
||||
++mapping;
|
||||
}
|
||||
// Skip empty mappings (no characters in the decomposition).
|
||||
if(length!=0) {
|
||||
++mapping; // skip over the firstUnit
|
||||
// add c to first code point's start set
|
||||
int32_t i=0;
|
||||
U16_NEXT_UNSAFE(mapping, i, c2);
|
||||
|
@ -1954,7 +2057,7 @@ unorm2_swap(const UDataSwapper *ds,
|
|||
pInfo->dataFormat[1]==0x72 &&
|
||||
pInfo->dataFormat[2]==0x6d &&
|
||||
pInfo->dataFormat[3]==0x32 &&
|
||||
pInfo->formatVersion[0]==1
|
||||
(pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
|
||||
)) {
|
||||
udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
|
||||
pInfo->dataFormat[0], pInfo->dataFormat[1],
|
||||
|
@ -2013,10 +2116,14 @@ unorm2_swap(const UDataSwapper *ds,
|
|||
offset=nextOffset;
|
||||
|
||||
/* swap the uint16_t extraData[] */
|
||||
nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET+1];
|
||||
nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
|
||||
ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
|
||||
offset=nextOffset;
|
||||
|
||||
/* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
|
||||
nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
|
||||
offset=nextOffset;
|
||||
|
||||
U_ASSERT(offset==size);
|
||||
}
|
||||
|
||||
|
|
|
@ -86,6 +86,24 @@ public:
|
|||
return 3;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Decomposes c, which must be a Hangul syllable, into buffer.
|
||||
* This is the raw, not recursive, decomposition. Its length is always 2.
|
||||
*/
|
||||
static inline void getRawDecomposition(UChar32 c, UChar buffer[2]) {
|
||||
UChar32 orig=c;
|
||||
c-=HANGUL_BASE;
|
||||
UChar32 c2=c%JAMO_T_COUNT;
|
||||
if(c2==0) {
|
||||
c/=JAMO_T_COUNT;
|
||||
buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
|
||||
buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
|
||||
} else {
|
||||
buffer[0]=orig-c2; // LV syllable
|
||||
buffer[1]=(UChar)(JAMO_T_BASE+c2);
|
||||
}
|
||||
}
|
||||
private:
|
||||
Hangul(); // no instantiation
|
||||
};
|
||||
|
@ -253,6 +271,8 @@ public:
|
|||
return getFCD16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2));
|
||||
}
|
||||
|
||||
uint16_t getFCD16FromNormData(UChar32 c) const;
|
||||
|
||||
void setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
|
||||
UTrie2 *newFCDTrie, UErrorCode &errorCode) const;
|
||||
|
||||
|
@ -260,7 +280,7 @@ public:
|
|||
CanonIterData &newData, UErrorCode &errorCode) const;
|
||||
|
||||
/**
|
||||
* Get the decomposition for one code point.
|
||||
* Gets the decomposition for one code point.
|
||||
* @param c code point
|
||||
* @param buffer out-only buffer for algorithmic decompositions
|
||||
* @param length out-only, takes the length of the decomposition, if any
|
||||
|
@ -268,6 +288,15 @@ public:
|
|||
*/
|
||||
const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const;
|
||||
|
||||
/**
|
||||
* Gets the raw decomposition for one code point.
|
||||
* @param c code point
|
||||
* @param buffer out-only buffer for Hangul and algorithmic decompositions, and for raw mappings that have to be assembled from separately stored parts
|
||||
* @param length out-only, takes the length of the decomposition, if any
|
||||
* @return pointer to the decomposition, or NULL if none
|
||||
*/
|
||||
const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const;
|
||||
|
||||
UBool isCanonSegmentStarter(UChar32 c) const;
|
||||
UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
|
||||
|
||||
|
@ -287,7 +316,7 @@ public:
|
|||
// Byte offsets from the start of the data, after the generic header.
|
||||
IX_NORM_TRIE_OFFSET,
|
||||
IX_EXTRA_DATA_OFFSET,
|
||||
IX_RESERVED2_OFFSET,
|
||||
IX_SMALL_FCD_OFFSET,
|
||||
IX_RESERVED3_OFFSET,
|
||||
IX_RESERVED4_OFFSET,
|
||||
IX_RESERVED5_OFFSET,
|
||||
|
@ -311,7 +340,7 @@ public:
|
|||
|
||||
// Bit fields in the first unit of a mapping in the extraData, listed from low bits to high bits.
enum {
    // Low five bits: length of the mapping in code units; 0 means the mapping is empty.
    MAPPING_LENGTH_MASK=0x1f,
    // Tested by hasCompBoundaryAfter(): if set, there is no composition boundary after the character.
    MAPPING_NO_COMP_BOUNDARY_AFTER=0x20,
    // Same bit value as MAPPING_HAS_RAW_MAPPING; per the format notes this bit
    // was repurposed for MAPPING_HAS_RAW_MAPPING in format version 2.
    MAPPING_PLUS_COMPOSITION_LIST=0x40,
    // A raw mapping is stored before the first unit (and before the optional ccc/lccc word).
    MAPPING_HAS_RAW_MAPPING=0x40,
    // Bit 7: a ccc/lccc word is stored immediately before the first unit.
    MAPPING_HAS_CCC_LCCC_WORD=0x80
};
|
||||
|
@ -414,7 +443,7 @@ private:
|
|||
uint8_t getCCFromNoNo(uint16_t norm16) const {
|
||||
const uint16_t *mapping=getMapping(norm16);
|
||||
if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
|
||||
return (uint8_t)mapping[1];
|
||||
return (uint8_t)*(mapping-1);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
|
@ -442,8 +471,7 @@ private:
|
|||
const uint16_t *list=extraData+norm16; // composite has both mapping & compositions list
|
||||
return list+ // mapping pointer
|
||||
1+ // +1 to skip the first unit with the mapping lenth
|
||||
(*list&MAPPING_LENGTH_MASK)+ // + mapping length
|
||||
((*list>>7)&1); // +1 if MAPPING_HAS_CCC_LCCC_WORD
|
||||
(*list&MAPPING_LENGTH_MASK); // + mapping length
|
||||
}
|
||||
/**
|
||||
* @param c code point must have compositions
|
||||
|
@ -497,6 +525,8 @@ private:
|
|||
UTrie2 *normTrie;
|
||||
const uint16_t *maybeYesCompositions;
|
||||
const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
|
||||
const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
|
||||
uint8_t tccc180[0x180]; // tccc values for U+0000..U+017F
|
||||
|
||||
SimpleSingleton fcdTrieSingleton;
|
||||
SimpleSingleton canonIterDataSingleton;
|
||||
|
@ -650,7 +680,7 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
|||
|
||||
/**
|
||||
* Format of Normalizer2 .nrm data files.
|
||||
* Format version 1.0.
|
||||
* Format version 2.0.
|
||||
*
|
||||
* Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
|
||||
* ICU ships with data files for standard Unicode Normalization Forms
|
||||
|
@ -747,6 +777,29 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
|||
* The norm16 values of those characters are directly indexes into the extraData array.
|
||||
*
|
||||
* The data structures for composition lists and mappings are described in the design doc.
|
||||
*
|
||||
* uint8_t smallFCD[0x100]; -- new in format version 2
|
||||
*
|
||||
* This is a bit set to help speed up FCD value lookups in the absence of a full
|
||||
* UTrie2 or other large data structure with the full FCD value mapping.
|
||||
*
|
||||
* Each smallFCD bit is set if any of the corresponding 32 BMP code points
|
||||
* has a non-zero FCD value (lccc!=0 or tccc!=0).
|
||||
* Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF.
|
||||
* A bit for 32 lead surrogates is set if any of the 32k corresponding
|
||||
* _supplementary_ code points has a non-zero FCD value.
|
||||
*
|
||||
* This bit set is most useful for the large blocks of CJK characters with FCD=0.
|
||||
*
|
||||
* Changes from format version 1 to format version 2 ---------------------------
|
||||
*
|
||||
* - Addition of data for raw (not recursively decomposed) mappings.
|
||||
* + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when
|
||||
* the mapping is to an empty string or when the character combines-forward.
|
||||
* This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
|
||||
* is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
|
||||
* + For details see the design doc.
|
||||
* - Addition of the smallFCD[] bit set.
|
||||
*/
|
||||
|
||||
#endif /* !UCONFIG_NO_NORMALIZATION */
|
||||
|
|
|
@ -196,6 +196,33 @@ public:
|
|||
virtual UBool
|
||||
getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
|
||||
|
||||
/**
|
||||
* Gets the raw decomposition mapping of c.
|
||||
*
|
||||
* This is similar to the getDecomposition() method but returns the
|
||||
* raw decomposition mapping as specified in UnicodeData.txt or
|
||||
* (for custom data) in the mapping files processed by the gennorm2 tool.
|
||||
* By contrast, getDecomposition() returns the processed,
|
||||
* recursively-decomposed version of this mapping.
|
||||
*
|
||||
* When used on a standard NFKC Normalizer2 instance,
|
||||
* getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
|
||||
*
|
||||
* When used on a standard NFC Normalizer2 instance,
|
||||
* it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
|
||||
* in this case, the result contains either one or two code points (=1..4 UChars).
|
||||
*
|
||||
* This function is independent of the mode of the Normalizer2.
|
||||
* The default implementation returns FALSE.
|
||||
* @param c code point
|
||||
* @param decomposition String object which will be set to c's
|
||||
* raw decomposition mapping, if there is one.
|
||||
* @return TRUE if c has a decomposition, otherwise FALSE
|
||||
* @draft ICU 49
|
||||
*/
|
||||
virtual UBool
|
||||
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
|
||||
|
||||
/**
|
||||
* Gets the combining class of c.
|
||||
* The default implementation returns 0
|
||||
|
@ -405,8 +432,9 @@ public:
|
|||
UErrorCode &errorCode) const;
|
||||
|
||||
/**
|
||||
* Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))
|
||||
* on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
|
||||
* Gets the decomposition mapping of c.
|
||||
* For details see the base class documentation.
|
||||
*
|
||||
* This function is independent of the mode of the Normalizer2.
|
||||
* @param c code point
|
||||
* @param decomposition String object which will be set to c's
|
||||
|
@ -417,6 +445,20 @@ public:
|
|||
virtual UBool
|
||||
getDecomposition(UChar32 c, UnicodeString &decomposition) const;
|
||||
|
||||
/**
|
||||
* Gets the raw decomposition mapping of c.
|
||||
* For details see the base class documentation.
|
||||
*
|
||||
* This function is independent of the mode of the Normalizer2.
|
||||
* @param c code point
|
||||
* @param decomposition String object which will be set to c's
|
||||
* raw decomposition mapping, if there is one.
|
||||
* @return TRUE if c has a decomposition, otherwise FALSE
|
||||
* @draft ICU 49
|
||||
*/
|
||||
virtual UBool
|
||||
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
|
||||
|
||||
/**
|
||||
* Gets the combining class of c.
|
||||
* The default implementation returns 0
|
||||
|
|
|
@ -261,8 +261,11 @@ unorm2_append(const UNormalizer2 *norm2,
|
|||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Gets the decomposition mapping of c. Equivalent to unorm2_normalize(string(c))
|
||||
* on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster.
|
||||
* Gets the decomposition mapping of c.
|
||||
* Roughly equivalent to normalizing the String form of c
|
||||
* on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster, and except that this function
|
||||
* returns a negative value and does not write a string
|
||||
* if c does not have a decomposition mapping in this instance's data.
|
||||
* This function is independent of the mode of the UNormalizer2.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param c code point
|
||||
|
@ -281,6 +284,40 @@ unorm2_getDecomposition(const UNormalizer2 *norm2,
|
|||
UChar32 c, UChar *decomposition, int32_t capacity,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Gets the raw decomposition mapping of c.
|
||||
*
|
||||
* This is similar to the unorm2_getDecomposition() function but returns the
|
||||
* raw decomposition mapping as specified in UnicodeData.txt or
|
||||
* (for custom data) in the mapping files processed by the gennorm2 tool.
|
||||
* By contrast, unorm2_getDecomposition() returns the processed,
|
||||
* recursively-decomposed version of this mapping.
|
||||
*
|
||||
* When used on a standard NFKC Normalizer2 instance,
|
||||
* unorm2_getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
|
||||
*
|
||||
* When used on a standard NFC Normalizer2 instance,
|
||||
* it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
|
||||
* in this case, the result contains either one or two code points (=1..4 UChars).
|
||||
*
|
||||
* This function is independent of the mode of the UNormalizer2.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param c code point
|
||||
* @param decomposition String buffer which will be set to c's
|
||||
* raw decomposition mapping, if there is one.
|
||||
* @param capacity number of UChars that can be written to decomposition
|
||||
* @param pErrorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return the non-negative length of c's raw decomposition, if there is one; otherwise a negative value
|
||||
* @draft ICU 49
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
unorm2_getRawDecomposition(const UNormalizer2 *norm2,
|
||||
UChar32 c, UChar *decomposition, int32_t capacity,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Gets the combining class of c.
|
||||
* The default implementation returns 0
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -65,6 +65,9 @@ TestFCD(void);
|
|||
static void
|
||||
TestGetDecomposition(void);
|
||||
|
||||
static void
|
||||
TestGetRawDecomposition(void);
|
||||
|
||||
static void TestAppendRestoreMiddle(void);
|
||||
|
||||
static const char* const canonTests[][3] = {
|
||||
|
@ -154,6 +157,7 @@ void addNormTest(TestNode** root)
|
|||
addTest(root, &TestFCNFKCClosure, "tsnorm/cnormtst/TestFCNFKCClosure");
|
||||
addTest(root, &TestComposition, "tsnorm/cnormtst/TestComposition");
|
||||
addTest(root, &TestGetDecomposition, "tsnorm/cnormtst/TestGetDecomposition");
|
||||
addTest(root, &TestGetRawDecomposition, "tsnorm/cnormtst/TestGetRawDecomposition");
|
||||
addTest(root, &TestAppendRestoreMiddle, "tsnorm/cnormtst/TestAppendRestoreMiddle");
|
||||
}
|
||||
|
||||
|
@ -1486,32 +1490,97 @@ TestGetDecomposition() {
|
|||
|
||||
length=unorm2_getDecomposition(n2, 0x20, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length>=0) {
|
||||
log_err("unorm2_getDecomposition(space) failed\n");
|
||||
log_err("unorm2_getDecomposition(fcc, space) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getDecomposition(n2, 0xe4, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length!=2 || decomp[0]!=0x61 || decomp[1]!=0x308 || decomp[2]!=0) {
|
||||
log_err("unorm2_getDecomposition(a-umlaut) failed\n");
|
||||
log_err("unorm2_getDecomposition(fcc, a-umlaut) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getDecomposition(n2, 0xac01, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length!=3 || decomp[0]!=0x1100 || decomp[1]!=0x1161 || decomp[2]!=0x11a8 || decomp[3]!=0) {
|
||||
log_err("unorm2_getDecomposition(Hangul syllable U+AC01) failed\n");
|
||||
log_err("unorm2_getDecomposition(fcc, Hangul syllable U+AC01) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getDecomposition(n2, 0xac01, NULL, 0, &errorCode);
|
||||
if(errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=3) {
|
||||
log_err("unorm2_getDecomposition(Hangul syllable U+AC01) overflow failed\n");
|
||||
log_err("unorm2_getDecomposition(fcc, Hangul syllable U+AC01) overflow failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getDecomposition(n2, 0xac01, decomp, -1, &errorCode);
|
||||
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) {
|
||||
log_err("unorm2_getDecomposition(capacity<0) failed\n");
|
||||
log_err("unorm2_getDecomposition(fcc, capacity<0) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getDecomposition(n2, 0xac01, NULL, 4, &errorCode);
|
||||
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) {
|
||||
log_err("unorm2_getDecomposition(decomposition=NULL) failed\n");
|
||||
log_err("unorm2_getDecomposition(fcc, decomposition=NULL) failed\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
TestGetRawDecomposition() {
|
||||
UChar decomp[32];
|
||||
int32_t length;
|
||||
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
const UNormalizer2 *n2=unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
log_err_status(errorCode, "unorm2_getInstance(nfkc) failed: %s\n", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* Raw decompositions from NFKC data are the Unicode Decomposition_Mapping values,
|
||||
* without recursive decomposition.
|
||||
*/
|
||||
|
||||
length=unorm2_getRawDecomposition(n2, 0x20, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length>=0) {
|
||||
log_err("unorm2_getDecomposition(nfkc, space) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getRawDecomposition(n2, 0xe4, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length!=2 || decomp[0]!=0x61 || decomp[1]!=0x308 || decomp[2]!=0) {
|
||||
log_err("unorm2_getDecomposition(nfkc, a-umlaut) failed\n");
|
||||
}
|
||||
/* U+1E08 LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE */
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getRawDecomposition(n2, 0x1e08, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length!=2 || decomp[0]!=0xc7 || decomp[1]!=0x301 || decomp[2]!=0) {
|
||||
log_err("unorm2_getDecomposition(nfkc, c-cedilla-acute) failed\n");
|
||||
}
|
||||
/* U+212B ANGSTROM SIGN */
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getRawDecomposition(n2, 0x212b, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length!=1 || decomp[0]!=0xc5 || decomp[1]!=0) {
|
||||
log_err("unorm2_getDecomposition(nfkc, angstrom sign) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getRawDecomposition(n2, 0xac00, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length!=2 || decomp[0]!=0x1100 || decomp[1]!=0x1161 || decomp[2]!=0) {
|
||||
log_err("unorm2_getDecomposition(nfkc, Hangul syllable U+AC00) failed\n");
|
||||
}
|
||||
/* A Hangul LVT syllable has a raw decomposition of an LV syllable + T. */
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getRawDecomposition(n2, 0xac01, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length!=2 || decomp[0]!=0xac00 || decomp[1]!=0x11a8 || decomp[2]!=0) {
|
||||
log_err("unorm2_getDecomposition(nfkc, Hangul syllable U+AC01) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getRawDecomposition(n2, 0xac01, NULL, 0, &errorCode);
|
||||
if(errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=2) {
|
||||
log_err("unorm2_getDecomposition(nfkc, Hangul syllable U+AC01) overflow failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getRawDecomposition(n2, 0xac01, decomp, -1, &errorCode);
|
||||
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) {
|
||||
log_err("unorm2_getDecomposition(nfkc, capacity<0) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getRawDecomposition(n2, 0xac01, NULL, 4, &errorCode);
|
||||
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) {
|
||||
log_err("unorm2_getDecomposition(nfkc, decomposition=NULL) failed\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -929,6 +929,7 @@ typedef struct UnicodeDataContext {
|
|||
#if UCONFIG_NO_NORMALIZATION
|
||||
const void *dummy;
|
||||
#else
|
||||
const UNormalizer2 *nfc;
|
||||
const UNormalizer2 *nfkc;
|
||||
#endif
|
||||
} UnicodeDataContext;
|
||||
|
@ -952,14 +953,18 @@ unicodeDataLineFn(void *context,
|
|||
UErrorCode *pErrorCode)
|
||||
{
|
||||
char buffer[100];
|
||||
const char *d;
|
||||
char *end;
|
||||
uint32_t value;
|
||||
UChar32 c;
|
||||
int32_t i;
|
||||
int8_t type;
|
||||
int32_t dt;
|
||||
UChar dm[32], s[32];
|
||||
int32_t dmLength, length;
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
const UNormalizer2 *nfkc;
|
||||
const UNormalizer2 *nfc, *nfkc;
|
||||
#endif
|
||||
|
||||
/* get the character code, field 0 */
|
||||
|
@ -1010,6 +1015,77 @@ unicodeDataLineFn(void *context,
|
|||
log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
|
||||
}
|
||||
|
||||
/* get Decomposition_Type & Decomposition_Mapping, field 5 */
|
||||
if(fields[5][0]==fields[5][1]) {
|
||||
/* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
|
||||
if(c==0xac00 || c==0xd7a3) {
|
||||
dt=U_DT_CANONICAL;
|
||||
} else {
|
||||
dt=U_DT_NONE;
|
||||
}
|
||||
} else {
|
||||
d=fields[5][0];
|
||||
*fields[5][1]=0;
|
||||
dt=UCHAR_INVALID_CODE;
|
||||
if(*d=='<') {
|
||||
end=strchr(++d, '>');
|
||||
if(end!=NULL) {
|
||||
*end=0;
|
||||
dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
|
||||
d=u_skipWhitespace(end+1);
|
||||
}
|
||||
} else {
|
||||
dt=U_DT_CANONICAL;
|
||||
}
|
||||
}
|
||||
if(dt>U_DT_NONE) {
|
||||
if(c==0xac00) {
|
||||
dm[0]=0x1100;
|
||||
dm[1]=0x1161;
|
||||
dm[2]=0;
|
||||
dmLength=2;
|
||||
} else if(c==0xd7a3) {
|
||||
dm[0]=0xd788;
|
||||
dm[1]=0x11c2;
|
||||
dm[2]=0;
|
||||
dmLength=2;
|
||||
} else {
|
||||
dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
|
||||
}
|
||||
} else {
|
||||
dmLength=-1;
|
||||
}
|
||||
if(dt<0 || U_FAILURE(*pErrorCode)) {
|
||||
log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
|
||||
return;
|
||||
}
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
|
||||
if(i!=dt) {
|
||||
log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
|
||||
}
|
||||
/* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
|
||||
length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
|
||||
log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
|
||||
"or the Decomposition_Mapping is different (%s)\n",
|
||||
c, length, dmLength, u_errorName(*pErrorCode));
|
||||
return;
|
||||
}
|
||||
/* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
|
||||
if(dt!=U_DT_CANONICAL) {
|
||||
dmLength=-1;
|
||||
}
|
||||
nfc=((UnicodeDataContext *)context)->nfc;
|
||||
length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
|
||||
log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
|
||||
"or the Decomposition_Mapping is different (%s)\n",
|
||||
c, length, dmLength, u_errorName(*pErrorCode));
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* get ISO Comment, field 11 */
|
||||
*fields[11][1]=0;
|
||||
i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
|
||||
|
@ -1231,9 +1307,10 @@ static void TestUnicodeData()
|
|||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
context.nfc=unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, &errorCode);
|
||||
context.nfkc=unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
log_data_err("error: unable to open an NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
|
||||
log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1167,6 +1167,14 @@ BasicNormalizerTest::TestCompare() {
|
|||
errln("NFC.getDecomposition() returns TRUE for characters which do not have decompositions");
|
||||
}
|
||||
|
||||
// test getRawDecomposition() for some characters that do not decompose
|
||||
if( nfcNorm2->getRawDecomposition(0x20, s2) ||
|
||||
nfcNorm2->getRawDecomposition(0x4e00, s2) ||
|
||||
nfcNorm2->getRawDecomposition(0x20002, s2)
|
||||
) {
|
||||
errln("NFC.getRawDecomposition() returns TRUE for characters which do not have decompositions");
|
||||
}
|
||||
|
||||
// test FilteredNormalizer2::getDecomposition()
|
||||
UnicodeSet filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff]"), errorCode);
|
||||
FilteredNormalizer2 fn2(*nfcNorm2, filter);
|
||||
|
@ -1175,6 +1183,13 @@ BasicNormalizerTest::TestCompare() {
|
|||
) {
|
||||
errln("FilteredNormalizer2(NFC, ^A0-FF).getDecomposition() failed");
|
||||
}
|
||||
|
||||
// test FilteredNormalizer2::getRawDecomposition()
|
||||
if( fn2.getRawDecomposition(0xe4, s1) || !fn2.getRawDecomposition(0x100, s2) ||
|
||||
s2.length()!=2 || s2[0]!=0x41 || s2[1]!=0x304
|
||||
) {
|
||||
errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
|
||||
}
|
||||
}
|
||||
|
||||
// verify that case-folding does not un-FCD strings
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2010, International Business Machines
|
||||
* Copyright (C) 2009-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -55,7 +55,7 @@ static UDataInfo dataInfo={
|
|||
0,
|
||||
|
||||
{ 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
|
||||
{ 1, 0, 0, 0 }, /* formatVersion */
|
||||
{ 2, 0, 0, 0 }, /* formatVersion */
|
||||
{ 5, 2, 0, 0 } /* dataVersion (Unicode version) */
|
||||
};
|
||||
|
||||
|
@ -121,6 +121,7 @@ struct Norm {
|
|||
}
|
||||
|
||||
UnicodeString *mapping;
|
||||
UnicodeString *rawMapping; // non-NULL if the mapping is further decomposed
|
||||
UChar32 mappingCP; // >=0 if mapping to 1 code point
|
||||
int32_t mappingPhase;
|
||||
MappingType mappingType;
|
||||
|
@ -165,6 +166,7 @@ Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
|
|||
normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
|
||||
norms=allocNorm(); // unused Norm struct at index 0
|
||||
memset(indexes, 0, sizeof(indexes));
|
||||
memset(smallFCD, 0, sizeof(smallFCD));
|
||||
}
|
||||
|
||||
Normalizer2DataBuilder::~Normalizer2DataBuilder() {
|
||||
|
@ -172,6 +174,7 @@ Normalizer2DataBuilder::~Normalizer2DataBuilder() {
|
|||
int32_t normsLength=utm_countItems(normMem);
|
||||
for(int32_t i=1; i<normsLength; ++i) {
|
||||
delete norms[i].mapping;
|
||||
delete norms[i].rawMapping;
|
||||
delete norms[i].compositions;
|
||||
}
|
||||
utm_close(normMem);
|
||||
|
@ -421,7 +424,8 @@ public:
|
|||
UBool
|
||||
Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
|
||||
if(norms[value].hasMapping()) {
|
||||
const UnicodeString &m=*norms[value].mapping;
|
||||
Norm &norm=norms[value];
|
||||
const UnicodeString &m=*norm.mapping;
|
||||
UnicodeString *decomposed=NULL;
|
||||
const UChar *s=m.getBuffer();
|
||||
int32_t length=m.length();
|
||||
|
@ -438,7 +442,7 @@ Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
|
|||
}
|
||||
const Norm &cNorm=getNormRef(c);
|
||||
if(cNorm.hasMapping()) {
|
||||
if(norms[value].mappingType==Norm::ROUND_TRIP) {
|
||||
if(norm.mappingType==Norm::ROUND_TRIP) {
|
||||
if(prev==0) {
|
||||
if(cNorm.mappingType!=Norm::ROUND_TRIP) {
|
||||
fprintf(stderr,
|
||||
|
@ -480,7 +484,7 @@ Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
|
|||
} else if(Hangul::isHangul(c)) {
|
||||
UChar buffer[3];
|
||||
int32_t hangulLength=Hangul::decompose(c, buffer);
|
||||
if(norms[value].mappingType==Norm::ROUND_TRIP && prev!=0) {
|
||||
if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
"U+%04lX's round-trip mapping's non-starter "
|
||||
|
@ -498,9 +502,14 @@ Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
|
|||
}
|
||||
}
|
||||
if(decomposed!=NULL) {
|
||||
delete norms[value].mapping;
|
||||
norms[value].mapping=decomposed;
|
||||
// Not norms[value].setMappingCP(); because the original mapping
|
||||
if(norm.rawMapping==NULL) {
|
||||
// Remember the original mapping when decomposing recursively.
|
||||
norm.rawMapping=norm.mapping;
|
||||
} else {
|
||||
delete norm.mapping;
|
||||
}
|
||||
norm.mapping=decomposed;
|
||||
// Not norm.setMappingCP(); because the original mapping
|
||||
// is most likely to be encodable as a delta.
|
||||
return TRUE;
|
||||
}
|
||||
|
@ -585,9 +594,17 @@ Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
|
||||
* A starter character with a mapping does not have a composition boundary after it
|
||||
* if the character itself combines-forward (which is tested by the caller of this function),
|
||||
* or it is deleted (mapped to the empty string),
|
||||
* or its mapping contains no starter,
|
||||
* or the last starter combines-forward.
|
||||
*/
|
||||
UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
|
||||
if(buffer.isEmpty()) {
|
||||
return TRUE; // maps-to-empty string is no boundary of any kind
|
||||
return TRUE; // maps-to-empty-string is no boundary of any kind
|
||||
}
|
||||
int32_t lastStarterIndex=buffer.lastStarterIndex();
|
||||
if(lastStarterIndex<0) {
|
||||
|
@ -602,7 +619,7 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
|
|||
// otherwise it is blocked.
|
||||
return lastStarterIndex==buffer.length()-1;
|
||||
}
|
||||
// no Hangul in fully decomposed mapping
|
||||
// Note: There can be no Hangul syllable in the fully decomposed mapping.
|
||||
const Norm *starterNorm=&getNormRef(starter);
|
||||
if(starterNorm->compositions==NULL) {
|
||||
return FALSE; // the last starter does not combine forward
|
||||
|
@ -632,7 +649,9 @@ UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &bu
|
|||
}
|
||||
|
||||
// Requires p->hasMapping().
|
||||
void Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
|
||||
// Returns the offset of the "first unit" from the beginning of the extraData for c.
|
||||
// That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
|
||||
int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
|
||||
UnicodeString &m=*p->mapping;
|
||||
int32_t length=m.length();
|
||||
if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
|
||||
|
@ -656,22 +675,59 @@ void Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeStrin
|
|||
(long)c);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
int32_t firstUnit=length|(trailCC<<8);
|
||||
int32_t secondUnit=p->cc|(leadCC<<8);
|
||||
if(secondUnit!=0) {
|
||||
firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
|
||||
// Write small-FCD data.
|
||||
if((leadCC|trailCC)!=0) {
|
||||
UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
|
||||
smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
|
||||
}
|
||||
if(p->compositions!=NULL) {
|
||||
firstUnit|=Normalizer2Impl::MAPPING_PLUS_COMPOSITION_LIST;
|
||||
// Write the mapping & raw mapping extraData.
|
||||
int32_t firstUnit=length|(trailCC<<8);
|
||||
int32_t preMappingLength=0;
|
||||
if(p->rawMapping!=NULL) {
|
||||
UnicodeString &rm=*p->rawMapping;
|
||||
int32_t rmLength=rm.length();
|
||||
if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
"raw mapping for U+%04lX longer than maximum of %d\n",
|
||||
(long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
UChar rm0=rm.charAt(0);
|
||||
if( rmLength==length-1 &&
|
||||
// 99: overlong substring lengths get pinned to remainder lengths anyway
|
||||
0==rm.compare(1, 99, m, 2, 99) &&
|
||||
rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
|
||||
) {
|
||||
// Compression:
|
||||
// rawMapping=rm0+mapping.substring(2) -> store only rm0
|
||||
//
|
||||
// The raw mapping is the same as the final mapping after replacing
|
||||
// the final mapping's first two code units with the raw mapping's first one.
|
||||
// In this case, we store only that first unit, rm0.
|
||||
// This helps with a few hundred mappings.
|
||||
dataString.append(rm0);
|
||||
preMappingLength=1;
|
||||
} else {
|
||||
// Store the raw mapping with its length.
|
||||
dataString.append(rm);
|
||||
dataString.append((UChar)rmLength);
|
||||
preMappingLength=rmLength+1;
|
||||
}
|
||||
firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
|
||||
}
|
||||
int32_t cccLccc=p->cc|(leadCC<<8);
|
||||
if(cccLccc!=0) {
|
||||
dataString.append((UChar)cccLccc);
|
||||
++preMappingLength;
|
||||
firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
|
||||
}
|
||||
if(p->hasNoCompBoundaryAfter) {
|
||||
firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
|
||||
}
|
||||
dataString.append((UChar)firstUnit);
|
||||
if(secondUnit!=0) {
|
||||
dataString.append((UChar)secondUnit);
|
||||
}
|
||||
dataString.append(m);
|
||||
return preMappingLength;
|
||||
}
|
||||
|
||||
// Requires p->compositions!=NULL.
|
||||
|
@ -751,6 +807,21 @@ public:
|
|||
|
||||
void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
|
||||
Norm *p=norms+value;
|
||||
if(!p->hasMapping()) {
|
||||
// Write small-FCD data.
|
||||
// There is similar code in writeMapping() for characters that do have a mapping.
|
||||
if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
"U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
|
||||
(long)c);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
if(p->cc!=0) {
|
||||
UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
|
||||
smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
|
||||
}
|
||||
}
|
||||
if(p->combinesBack) {
|
||||
if(p->hasMapping()) {
|
||||
fprintf(stderr,
|
||||
|
@ -773,10 +844,8 @@ void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraData
|
|||
writeCompositions(c, p, writer.yesYesCompositions);
|
||||
}
|
||||
} else if(p->mappingType==Norm::ROUND_TRIP) {
|
||||
p->offset=
|
||||
(writer.yesNoData.length()<<Norm::OFFSET_SHIFT)|
|
||||
Norm::OFFSET_YES_NO;
|
||||
writeMapping(c, p, writer.yesNoData);
|
||||
int32_t offset=writer.yesNoData.length()+writeMapping(c, p, writer.yesNoData);
|
||||
p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO;
|
||||
if(p->compositions!=NULL) {
|
||||
writeCompositions(c, p, writer.yesNoData);
|
||||
}
|
||||
|
@ -791,8 +860,15 @@ void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraData
|
|||
}
|
||||
if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
|
||||
// Try a compact, algorithmic encoding.
|
||||
// Only for ccc=0, because we can't store additional information.
|
||||
if(p->mappingCP>=0) {
|
||||
// Only for ccc=0, because we can't store additional information
|
||||
// and we do not recursively follow an algorithmic encoding for access to the ccc.
|
||||
//
|
||||
// Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
|
||||
// if the mappingCP decomposes further, to ensure that there is a place to store it.
|
||||
// We want to see that the final mapping does not have exactly 1 code point,
|
||||
// or else we would have to recursively ensure that the final mapping is stored
|
||||
// in normal extraData.
|
||||
if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
|
||||
int32_t delta=p->mappingCP-c;
|
||||
if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
|
||||
p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
|
||||
|
@ -801,22 +877,18 @@ void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraData
|
|||
}
|
||||
if(p->offset==0) {
|
||||
int32_t oldNoNoLength=writer.noNoMappings.length();
|
||||
writeMapping(c, p, writer.noNoMappings);
|
||||
int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
|
||||
UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
|
||||
int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
|
||||
if(previousOffset!=0) {
|
||||
// Duplicate, remove the new units and point to the old ones.
|
||||
writer.noNoMappings.truncate(oldNoNoLength);
|
||||
p->offset=
|
||||
((previousOffset-1)<<Norm::OFFSET_SHIFT)|
|
||||
Norm::OFFSET_NO_NO;
|
||||
p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
|
||||
} else {
|
||||
// Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
|
||||
IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
|
||||
writer.previousNoNoMappings.puti(newMapping, oldNoNoLength+1, errorCode);
|
||||
p->offset=
|
||||
(oldNoNoLength<<Norm::OFFSET_SHIFT)|
|
||||
Norm::OFFSET_NO_NO;
|
||||
writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
|
||||
p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -947,10 +1019,18 @@ void Normalizer2DataBuilder::processData() {
|
|||
BuilderReorderingBuffer buffer;
|
||||
int32_t normsLength=utm_countItems(normMem);
|
||||
for(int32_t i=1; i<normsLength; ++i) {
|
||||
if(norms[i].hasMapping()) {
|
||||
buffer.reset();
|
||||
reorder(norms+i, buffer);
|
||||
norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
|
||||
// Set the hasNoCompBoundaryAfter flag for use by the last code branch
|
||||
// in Normalizer2Impl::hasCompBoundaryAfter().
|
||||
// For details see the comments on hasNoCompBoundaryAfter(buffer).
|
||||
const Norm &norm=norms[i];
|
||||
if(norm.hasMapping()) {
|
||||
if(norm.compositions!=NULL) {
|
||||
norms[i].hasNoCompBoundaryAfter=TRUE;
|
||||
} else {
|
||||
buffer.reset();
|
||||
reorder(norms+i, buffer);
|
||||
norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1051,14 +1131,18 @@ void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
|
|||
indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
|
||||
offset+=norm16TrieLength;
|
||||
indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
|
||||
int32_t totalSize=offset+=extraData.length()*2;
|
||||
for(int32_t i=Normalizer2Impl::IX_RESERVED2_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
|
||||
offset+=extraData.length()*2;
|
||||
indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
|
||||
offset+=sizeof(smallFCD);
|
||||
int32_t totalSize=offset;
|
||||
for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
|
||||
indexes[i]=totalSize;
|
||||
}
|
||||
|
||||
if(beVerbose) {
|
||||
printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength);
|
||||
printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length());
|
||||
printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD));
|
||||
printf("size of binary data file contents: %5ld bytes\n", (long)totalSize);
|
||||
printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
|
||||
printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
|
||||
|
@ -1080,7 +1164,7 @@ void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
|
|||
udata_writeBlock(pData, indexes, sizeof(indexes));
|
||||
udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
|
||||
udata_writeUString(pData, extraData.getBuffer(), extraData.length());
|
||||
|
||||
udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
|
||||
int32_t writtenSize=udata_finish(pData, errorCode);
|
||||
if(errorCode.isFailure()) {
|
||||
fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2010, International Business Machines
|
||||
* Copyright (C) 2009-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -90,7 +90,7 @@ private:
|
|||
void reorder(Norm *p, BuilderReorderingBuffer &buffer);
|
||||
UBool hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer);
|
||||
void setHangulData();
|
||||
void writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString);
|
||||
int32_t writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString);
|
||||
void writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString);
|
||||
void writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer);
|
||||
int32_t getCenterNoNoDelta() {
|
||||
|
@ -111,6 +111,7 @@ private:
|
|||
int32_t indexes[Normalizer2Impl::IX_COUNT];
|
||||
UTrie2 *norm16Trie;
|
||||
UnicodeString extraData;
|
||||
uint8_t smallFCD[0x100];
|
||||
|
||||
UVersionInfo unicodeVersion;
|
||||
};
|
||||
|
|
Loading…
Add table
Reference in a new issue