mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 05:55:35 +00:00
ICU-8942 use smaller/simpler FCD data rather than building an FCD trie
X-SVN-Rev: 30985
This commit is contained in:
parent
8ffe8b6439
commit
524fd241c5
8 changed files with 152 additions and 333 deletions
|
@ -523,12 +523,7 @@ const Normalizer2 *Normalizer2Factory::getNFDInstance(UErrorCode &errorCode) {
|
|||
|
||||
const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
|
||||
Norm2AllModes *allModes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
|
||||
if(allModes!=NULL) {
|
||||
allModes->impl.getFCDTrie(errorCode);
|
||||
return &allModes->fcd;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
return allModes!=NULL ? &allModes->fcd : NULL;
|
||||
}
|
||||
|
||||
const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
|
||||
|
@ -605,17 +600,6 @@ Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
|
|||
return &((Normalizer2WithImpl *)norm2)->impl;
|
||||
}
|
||||
|
||||
const UTrie2 *
|
||||
Normalizer2Factory::getFCDTrie(UErrorCode &errorCode) {
|
||||
Norm2AllModes *allModes=
|
||||
Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
|
||||
if(allModes!=NULL) {
|
||||
return allModes->impl.getFCDTrie(errorCode);
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
const Normalizer2 *
|
||||
Normalizer2::getInstance(const char *packageName,
|
||||
const char *name,
|
||||
|
@ -682,7 +666,6 @@ Normalizer2::getInstance(const char *packageName,
|
|||
case UNORM2_DECOMPOSE:
|
||||
return &allModes->decomp;
|
||||
case UNORM2_FCD:
|
||||
allModes->impl.getFCDTrie(errorCode);
|
||||
return &allModes->fcd;
|
||||
case UNORM2_COMPOSE_CONTIGUOUS:
|
||||
return &allModes->fcc;
|
||||
|
@ -960,25 +943,14 @@ unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
|
|||
}
|
||||
|
||||
U_CFUNC uint16_t
|
||||
unorm_getFCD16Simple(UChar32 c) {
|
||||
unorm_getFCD16(UChar32 c) {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
const UTrie2 *trie=Normalizer2Factory::getFCDTrie(errorCode);
|
||||
const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
return UTRIE2_GET16(trie, c);
|
||||
return impl->getFCD16(c);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI const uint16_t * U_EXPORT2
|
||||
unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode) {
|
||||
const UTrie2 *trie=Normalizer2Factory::getFCDTrie(*pErrorCode);
|
||||
if(U_SUCCESS(*pErrorCode)) {
|
||||
fcdHighStart=trie->highStart;
|
||||
return trie->index;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
|
|
|
@ -254,7 +254,6 @@ struct CanonIterData : public UMemory {
|
|||
Normalizer2Impl::~Normalizer2Impl() {
|
||||
udata_close(memory);
|
||||
utrie2_close(normTrie);
|
||||
UTrie2Singleton(fcdTrieSingleton).deleteInstance();
|
||||
delete (CanonIterData *)canonIterDataSingleton.fInstance;
|
||||
}
|
||||
|
||||
|
@ -1507,121 +1506,13 @@ const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *
|
|||
return iter.codePointStart;
|
||||
}
|
||||
|
||||
class FCDTrieSingleton : public UTrie2Singleton {
|
||||
public:
|
||||
FCDTrieSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
|
||||
UTrie2Singleton(s), impl(ni), errorCode(ec) {}
|
||||
UTrie2 *getInstance(UErrorCode &errorCode) {
|
||||
return UTrie2Singleton::getInstance(createInstance, this, errorCode);
|
||||
}
|
||||
static void *createInstance(const void *context, UErrorCode &errorCode);
|
||||
UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
|
||||
if(value!=0) {
|
||||
impl.setFCD16FromNorm16(start, end, (uint16_t)value, newFCDTrie, errorCode);
|
||||
}
|
||||
return U_SUCCESS(errorCode);
|
||||
}
|
||||
|
||||
Normalizer2Impl &impl;
|
||||
UTrie2 *newFCDTrie;
|
||||
UErrorCode &errorCode;
|
||||
};
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
// Set the FCD value for a range of same-norm16 characters.
|
||||
static UBool U_CALLCONV
|
||||
enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
|
||||
return ((FCDTrieSingleton *)context)->rangeHandler(start, end, value);
|
||||
}
|
||||
|
||||
// Collect (OR together) the FCD values for a range of supplementary characters,
|
||||
// for their lead surrogate code unit.
|
||||
static UBool U_CALLCONV
|
||||
enumRangeOrValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
|
||||
*((uint32_t *)context)|=value;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
void *FCDTrieSingleton::createInstance(const void *context, UErrorCode &errorCode) {
|
||||
FCDTrieSingleton *me=(FCDTrieSingleton *)context;
|
||||
me->newFCDTrie=utrie2_open(0, 0, &errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
utrie2_enum(me->impl.getNormTrie(), NULL, enumRangeHandler, me);
|
||||
for(UChar lead=0xd800; lead<0xdc00; ++lead) {
|
||||
uint32_t oredValue=utrie2_get32(me->newFCDTrie, lead);
|
||||
utrie2_enumForLeadSurrogate(me->newFCDTrie, lead, NULL, enumRangeOrValue, &oredValue);
|
||||
if(oredValue!=0) {
|
||||
// Set a "bad" value for makeFCD() to break the quick check loop
|
||||
// and look up the value for the supplementary code point.
|
||||
// If there is any lccc, then set the worst-case lccc of 1.
|
||||
// The ORed-together value's tccc is already the worst case.
|
||||
if(oredValue>0xff) {
|
||||
oredValue=0x100|(oredValue&0xff);
|
||||
}
|
||||
utrie2_set32ForLeadSurrogateCodeUnit(me->newFCDTrie, lead, oredValue, &errorCode);
|
||||
}
|
||||
}
|
||||
utrie2_freeze(me->newFCDTrie, UTRIE2_16_VALUE_BITS, &errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
return me->newFCDTrie;
|
||||
}
|
||||
}
|
||||
utrie2_close(me->newFCDTrie);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void Normalizer2Impl::setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
|
||||
UTrie2 *newFCDTrie, UErrorCode &errorCode) const {
|
||||
// Only loops for 1:1 algorithmic mappings.
|
||||
for(;;) {
|
||||
if(norm16>=MIN_NORMAL_MAYBE_YES) {
|
||||
norm16&=0xff;
|
||||
norm16|=norm16<<8;
|
||||
} else if(norm16<=minYesNo || minMaybeYes<=norm16) {
|
||||
// no decomposition or Hangul syllable, all zeros
|
||||
break;
|
||||
} else if(limitNoNo<=norm16) {
|
||||
int32_t delta=norm16-(minMaybeYes-MAX_DELTA-1);
|
||||
if(start==end) {
|
||||
start+=delta;
|
||||
norm16=getNorm16(start);
|
||||
} else {
|
||||
// the same delta leads from different original characters to different mappings
|
||||
do {
|
||||
UChar32 c=start+delta;
|
||||
setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie, errorCode);
|
||||
} while(++start<=end);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// c decomposes, get everything from the variable-length extra data
|
||||
const uint16_t *mapping=getMapping(norm16);
|
||||
uint16_t firstUnit=*mapping;
|
||||
if((firstUnit&MAPPING_LENGTH_MASK)==0) {
|
||||
// A character that is deleted (maps to an empty string) must
|
||||
// get the worst-case lccc and tccc values because arbitrary
|
||||
// characters on both sides will become adjacent.
|
||||
norm16=0x1ff;
|
||||
} else {
|
||||
norm16=firstUnit>>8; // tccc
|
||||
if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
|
||||
norm16|=*(mapping-1)&0xff00; // lccc
|
||||
}
|
||||
}
|
||||
}
|
||||
utrie2_setRange32(newFCDTrie, start, end, norm16, TRUE, &errorCode);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const UTrie2 *Normalizer2Impl::getFCDTrie(UErrorCode &errorCode) const {
|
||||
// Logically const: Synchronized instantiation.
|
||||
Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
|
||||
return FCDTrieSingleton(me->fcdTrieSingleton, *me, errorCode).getInstance(errorCode);
|
||||
}
|
||||
// Note: normalizer2impl.cpp r30982 (2011-nov-27)
|
||||
// still had getFCDTrie() which built and cached an FCD trie.
|
||||
// That provided faster access to FCD data than getFCD16FromNormData()
|
||||
// but required synchronization and consumed some 10kB of heap memory
|
||||
// in any process that uses FCD (e.g., via collation).
|
||||
// tccc180[] and smallFCD[] are intended to help with any loss of performance,
|
||||
// at least for Latin & CJK.
|
||||
|
||||
// Gets the FCD value from the regular normalization data.
|
||||
uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
|
||||
|
@ -1679,7 +1570,7 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
|
|||
prevBoundary=src;
|
||||
// We know that the previous character's lccc==0.
|
||||
// Fetching the fcd16 value was deferred for this below-U+0300 code point.
|
||||
prevFCD16=getFCD16FromSingleLead(*(src-1));
|
||||
prevFCD16=getFCD16(*(src-1));
|
||||
if(prevFCD16>1) {
|
||||
--prevBoundary;
|
||||
}
|
||||
|
@ -1693,8 +1584,6 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
|
|||
// The exception is the call to decomposeShort() which uses the buffer
|
||||
// in the normal way.
|
||||
|
||||
const UTrie2 *trie=fcdTrie();
|
||||
|
||||
const UChar *prevSrc;
|
||||
UChar32 c=0;
|
||||
uint16_t fcd16=0;
|
||||
|
@ -1705,24 +1594,24 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
|
|||
if((c=*src)<MIN_CCC_LCCC_CP) {
|
||||
prevFCD16=~c;
|
||||
++src;
|
||||
} else if((fcd16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c))<=0xff) {
|
||||
prevFCD16=fcd16;
|
||||
} else if(!singleLeadMightHaveNonZeroFCD16(c)) {
|
||||
prevFCD16=0;
|
||||
++src;
|
||||
} else if(!U16_IS_SURROGATE(c)) {
|
||||
break;
|
||||
} else {
|
||||
UChar c2;
|
||||
if(U16_IS_SURROGATE_LEAD(c)) {
|
||||
if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
|
||||
c=U16_GET_SUPPLEMENTARY(c, c2);
|
||||
}
|
||||
} else /* trail surrogate */ {
|
||||
if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
|
||||
--src;
|
||||
c=U16_GET_SUPPLEMENTARY(c2, c);
|
||||
if(U16_IS_SURROGATE(c)) {
|
||||
UChar c2;
|
||||
if(U16_IS_SURROGATE_LEAD(c)) {
|
||||
if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
|
||||
c=U16_GET_SUPPLEMENTARY(c, c2);
|
||||
}
|
||||
} else /* trail surrogate */ {
|
||||
if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
|
||||
--src;
|
||||
c=U16_GET_SUPPLEMENTARY(c2, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
if((fcd16=getFCD16(c))<=0xff) {
|
||||
if((fcd16=getFCD16FromNormData(c))<=0xff) {
|
||||
prevFCD16=fcd16;
|
||||
src+=U16_LENGTH(c);
|
||||
} else {
|
||||
|
@ -1742,7 +1631,8 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
|
|||
// We know that the previous character's lccc==0.
|
||||
if(prevFCD16<0) {
|
||||
// Fetching the fcd16 value was deferred for this below-U+0300 code point.
|
||||
prevFCD16=getFCD16FromSingleLead((UChar)~prevFCD16);
|
||||
UChar32 prev=~prevFCD16;
|
||||
prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
|
||||
if(prevFCD16>1) {
|
||||
--prevBoundary;
|
||||
}
|
||||
|
@ -1752,7 +1642,7 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
|
|||
--p;
|
||||
// Need to fetch the previous character's FCD value because
|
||||
// prevFCD16 was just for the trail surrogate code point.
|
||||
prevFCD16=getFCD16FromSurrogatePair(p[0], p[1]);
|
||||
prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
|
||||
// Still known to have lccc==0 because its lead surrogate unit had lccc==0.
|
||||
}
|
||||
if(prevFCD16>1) {
|
||||
|
@ -1840,21 +1730,18 @@ void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
|
|||
}
|
||||
|
||||
const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
|
||||
BackwardUTrie2StringIterator iter(fcdTrie(), start, p);
|
||||
uint16_t fcd16;
|
||||
do {
|
||||
fcd16=iter.previous16();
|
||||
} while(fcd16>0xff);
|
||||
return iter.codePointStart;
|
||||
while(start<p && previousFCD16(start, p)>0xff) {}
|
||||
return p;
|
||||
}
|
||||
|
||||
const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
|
||||
ForwardUTrie2StringIterator iter(fcdTrie(), p, limit);
|
||||
uint16_t fcd16;
|
||||
do {
|
||||
fcd16=iter.next16();
|
||||
} while(fcd16>0xff);
|
||||
return iter.codePointStart;
|
||||
while(p<limit) {
|
||||
const UChar *codePointStart=p;
|
||||
if(nextFCD16(p, limit)<=0xff) {
|
||||
return codePointStart;
|
||||
}
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
// CanonicalIterator data -------------------------------------------------- ***
|
||||
|
|
|
@ -216,7 +216,6 @@ private:
|
|||
class U_COMMON_API Normalizer2Impl : public UMemory {
|
||||
public:
|
||||
Normalizer2Impl() : memory(NULL), normTrie(NULL) {
|
||||
fcdTrieSingleton.fInstance=NULL;
|
||||
canonIterDataSingleton.fInstance=NULL;
|
||||
}
|
||||
~Normalizer2Impl();
|
||||
|
@ -229,7 +228,6 @@ public:
|
|||
// low-level properties ------------------------------------------------ ***
|
||||
|
||||
const UTrie2 *getNormTrie() const { return normTrie; }
|
||||
const UTrie2 *getFCDTrie(UErrorCode &errorCode) const ;
|
||||
|
||||
UBool ensureCanonIterData(UErrorCode &errorCode) const;
|
||||
|
||||
|
@ -260,22 +258,80 @@ public:
|
|||
return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0;
|
||||
}
|
||||
|
||||
uint16_t getFCD16(UChar32 c) const { return UTRIE2_GET16(fcdTrie(), c); }
|
||||
uint16_t getFCD16FromSingleLead(UChar c) const {
|
||||
return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(fcdTrie(), c);
|
||||
/**
|
||||
* Returns the FCD data for code point c.
|
||||
* @param c A Unicode code point.
|
||||
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
|
||||
*/
|
||||
uint16_t getFCD16(UChar32 c) const {
|
||||
if(c<0) {
|
||||
return 0;
|
||||
} else if(c<0x180) {
|
||||
return tccc180[c];
|
||||
} else if(c<=0xffff) {
|
||||
if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
|
||||
}
|
||||
return getFCD16FromNormData(c);
|
||||
}
|
||||
uint16_t getFCD16FromSupplementary(UChar32 c) const {
|
||||
return UTRIE2_GET16_FROM_SUPP(fcdTrie(), c);
|
||||
/**
|
||||
* Returns the FCD data for the next code point (post-increment).
|
||||
* Might skip only a lead surrogate rather than the whole surrogate pair if none of
|
||||
* the supplementary code points associated with the lead surrogate have non-zero FCD data.
|
||||
* @param s A valid pointer into a string. Requires s!=limit.
|
||||
* @param limit The end of the string, or NULL.
|
||||
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
|
||||
*/
|
||||
uint16_t nextFCD16(const UChar *&s, const UChar *limit) const {
|
||||
UChar32 c=*s++;
|
||||
if(c<0x180) {
|
||||
return tccc180[c];
|
||||
} else if(!singleLeadMightHaveNonZeroFCD16(c)) {
|
||||
return 0;
|
||||
}
|
||||
UChar c2;
|
||||
if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) {
|
||||
c=U16_GET_SUPPLEMENTARY(c, c2);
|
||||
++s;
|
||||
}
|
||||
return getFCD16FromNormData(c);
|
||||
}
|
||||
uint16_t getFCD16FromSurrogatePair(UChar c, UChar c2) const {
|
||||
return getFCD16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2));
|
||||
/**
|
||||
* Returns the FCD data for the previous code point (pre-decrement).
|
||||
* @param start The start of the string.
|
||||
* @param s A valid pointer into a string. Requires start<s.
|
||||
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
|
||||
*/
|
||||
uint16_t previousFCD16(const UChar *start, const UChar *&s) const {
|
||||
UChar32 c=*--s;
|
||||
if(c<0x180) {
|
||||
return tccc180[c];
|
||||
}
|
||||
if(!U16_IS_TRAIL(c)) {
|
||||
if(!singleLeadMightHaveNonZeroFCD16(c)) {
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
UChar c2;
|
||||
if(start<s && U16_IS_LEAD(c2=*(s-1))) {
|
||||
c=U16_GET_SUPPLEMENTARY(c2, c);
|
||||
--s;
|
||||
}
|
||||
}
|
||||
return getFCD16FromNormData(c);
|
||||
}
|
||||
|
||||
/** Returns the FCD data for U+0000<=c<U+0180. */
|
||||
uint16_t getFCD16FromBelow180(UChar32 c) const { return tccc180[c]; }
|
||||
/** Returns TRUE if the single-or-lead code unit `lead` might have non-zero FCD data. */
|
||||
UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const {
|
||||
// 0<=lead<=0xffff
|
||||
uint8_t bits=smallFCD[lead>>8];
|
||||
if(bits==0) { return false; }
|
||||
return (UBool)((bits>>((lead>>5)&7))&1);
|
||||
}
|
||||
/** Returns the FCD value from the regular normalization data. */
|
||||
uint16_t getFCD16FromNormData(UChar32 c) const;
|
||||
|
||||
void setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
|
||||
UTrie2 *newFCDTrie, UErrorCode &errorCode) const;
|
||||
|
||||
void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
|
||||
CanonIterData &newData, UErrorCode &errorCode) const;
|
||||
|
||||
|
@ -504,8 +560,6 @@ private:
|
|||
const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const;
|
||||
const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const;
|
||||
|
||||
const UTrie2 *fcdTrie() const { return (const UTrie2 *)fcdTrieSingleton.fInstance; }
|
||||
|
||||
const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const;
|
||||
const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const;
|
||||
|
||||
|
@ -532,7 +586,6 @@ private:
|
|||
const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
|
||||
uint8_t tccc180[0x180]; // tccc values for U+0000..U+017F
|
||||
|
||||
SimpleSingleton fcdTrieSingleton;
|
||||
SimpleSingleton canonIterDataSingleton;
|
||||
};
|
||||
|
||||
|
@ -565,8 +618,6 @@ public:
|
|||
// Get the Impl instance of the Normalizer2.
|
||||
// Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
|
||||
static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
|
||||
|
||||
static const UTrie2 *getFCDTrie(UErrorCode &errorCode);
|
||||
private:
|
||||
Normalizer2Factory(); // No instantiation.
|
||||
};
|
||||
|
@ -586,101 +637,11 @@ U_CFUNC UNormalizationCheckResult
|
|||
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
|
||||
|
||||
/**
|
||||
* Get the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
|
||||
* Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC uint16_t
|
||||
unorm_getFCD16Simple(UChar32 c);
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get access to the internal FCD trie table to be able to perform
|
||||
* incremental, per-code unit, FCD checks in collation.
|
||||
* One pointer is sufficient because the trie index values are offset
|
||||
* by the index size, so that the same pointer is used to access the trie data.
|
||||
* Code points at fcdHighStart and above have a zero FCD value.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI const uint16_t * U_EXPORT2
|
||||
unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get the FCD value for a code unit, with
|
||||
* bits 15..8 lead combining class
|
||||
* bits 7..0 trail combining class
|
||||
*
|
||||
* If c is a lead surrogate and the value is not 0,
|
||||
* then some of c's associated supplementary code points have a non-zero FCD value.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
static inline uint16_t
|
||||
unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
|
||||
return fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get the FCD value of the next code point (post-increment), with
|
||||
* bits 15..8 lead combining class
|
||||
* bits 7..0 trail combining class
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
static inline uint16_t
|
||||
unorm_nextFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
||||
const UChar *&s, const UChar *limit) {
|
||||
UChar32 c=*s++;
|
||||
uint16_t fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
|
||||
if(fcd!=0 && U16_IS_LEAD(c)) {
|
||||
UChar c2;
|
||||
if(s!=limit && U16_IS_TRAIL(c2=*s)) {
|
||||
++s;
|
||||
c=U16_GET_SUPPLEMENTARY(c, c2);
|
||||
if(c<fcdHighStart) {
|
||||
fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)];
|
||||
} else {
|
||||
fcd=0;
|
||||
}
|
||||
} else /* unpaired lead surrogate */ {
|
||||
fcd=0;
|
||||
}
|
||||
}
|
||||
return fcd;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get the FCD value of the previous code point (pre-decrement), with
|
||||
* bits 15..8 lead combining class
|
||||
* bits 7..0 trail combining class
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
static inline uint16_t
|
||||
unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
||||
const UChar *start, const UChar *&s) {
|
||||
UChar32 c=*--s;
|
||||
uint16_t fcd;
|
||||
if(!U16_IS_SURROGATE(c)) {
|
||||
fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
|
||||
} else {
|
||||
UChar c2;
|
||||
if(U16_IS_SURROGATE_TRAIL(c) && s!=start && U16_IS_LEAD(c2=*(s-1))) {
|
||||
--s;
|
||||
c=U16_GET_SUPPLEMENTARY(c2, c);
|
||||
if(c<fcdHighStart) {
|
||||
fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)];
|
||||
} else {
|
||||
fcd=0;
|
||||
}
|
||||
} else /* unpaired surrogate */ {
|
||||
fcd=0;
|
||||
}
|
||||
}
|
||||
return fcd;
|
||||
}
|
||||
unorm_getFCD16(UChar32 c);
|
||||
|
||||
/**
|
||||
* Format of Normalizer2 .nrm data files.
|
||||
|
|
|
@ -401,7 +401,7 @@ static int32_t getLeadCombiningClass(const IntProperty &, UChar32, UProperty) {
|
|||
}
|
||||
#else
|
||||
static int32_t getLeadCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
|
||||
return unorm_getFCD16Simple(c)>>8;
|
||||
return unorm_getFCD16(c)>>8;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -411,7 +411,7 @@ static int32_t getTrailCombiningClass(const IntProperty &, UChar32, UProperty) {
|
|||
}
|
||||
#else
|
||||
static int32_t getTrailCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
|
||||
return unorm_getFCD16Simple(c)&0xff;
|
||||
return unorm_getFCD16(c)&0xff;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -52,13 +52,11 @@ U_NAMESPACE_USE
|
|||
|
||||
#define ZERO_CC_LIMIT_ 0xC0
|
||||
|
||||
// this is static pointer to the normalizer fcdTrieIndex
|
||||
// This is a static pointer to the NFC implementation instance.
|
||||
// it is always the same between calls to u_cleanup
|
||||
// and therefore writing to it is not synchronized.
|
||||
// It is cleaned in ucol_cleanup
|
||||
static const uint16_t *fcdTrieIndex=NULL;
|
||||
// Code points at fcdHighStart and above have a zero FCD value.
|
||||
static UChar32 fcdHighStart = 0;
|
||||
static const Normalizer2Impl *g_nfcImpl = NULL;
|
||||
|
||||
// These are values from UCA required for
|
||||
// implicit generation and suppressing sort key compression
|
||||
|
@ -72,7 +70,7 @@ U_CDECL_BEGIN
|
|||
static UBool U_CALLCONV
|
||||
ucol_cleanup(void)
|
||||
{
|
||||
fcdTrieIndex = NULL;
|
||||
g_nfcImpl = NULL;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
@ -86,11 +84,13 @@ U_CDECL_END
|
|||
// init FCD data
|
||||
static inline
|
||||
UBool initializeFCD(UErrorCode *status) {
|
||||
if (fcdTrieIndex != NULL) {
|
||||
if (g_nfcImpl != NULL) {
|
||||
return TRUE;
|
||||
} else {
|
||||
// The result is constant, until the library is reloaded.
|
||||
fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
|
||||
g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
|
||||
// Note: Alternatively, we could also store this pointer in each collIterate struct,
|
||||
// same as Normalizer2Factory::getImpl(collIterate->nfd).
|
||||
ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
|
||||
return U_SUCCESS(*status);
|
||||
}
|
||||
|
@ -1433,10 +1433,8 @@ inline UBool collIterFCD(collIterate *collationSource) {
|
|||
endP = NULL;
|
||||
}
|
||||
|
||||
// Get the trailing combining class of the current character. If it's zero,
|
||||
// we are OK.
|
||||
/* trie access */
|
||||
fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
|
||||
// Get the trailing combining class of the current character. If it's zero, we are OK.
|
||||
fcd = g_nfcImpl->nextFCD16(srcP, endP);
|
||||
if (fcd != 0) {
|
||||
prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
|
||||
|
||||
|
@ -1447,8 +1445,7 @@ inline UBool collIterFCD(collIterate *collationSource) {
|
|||
{
|
||||
const UChar *savedSrcP = srcP;
|
||||
|
||||
/* trie access */
|
||||
fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
|
||||
fcd = g_nfcImpl->nextFCD16(srcP, endP);
|
||||
leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
|
||||
if (leadingCC == 0) {
|
||||
srcP = savedSrcP; // Hit char that is not part of combining sequence.
|
||||
|
@ -1809,7 +1806,7 @@ inline UBool collPrevIterFCD(collIterate *data)
|
|||
src = data->pos + 1;
|
||||
|
||||
/* Get the trailing combining class of the current character. */
|
||||
fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
|
||||
fcd = g_nfcImpl->previousFCD16(start, src);
|
||||
|
||||
leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
|
||||
|
||||
|
@ -1825,7 +1822,7 @@ inline UBool collPrevIterFCD(collIterate *data)
|
|||
return result;
|
||||
}
|
||||
|
||||
fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
|
||||
fcd = g_nfcImpl->previousFCD16(start, src);
|
||||
|
||||
trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
|
||||
|
||||
|
|
|
@ -845,8 +845,7 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
|
|||
UColToken *tok = lh->first;
|
||||
UColToken *expt = NULL;
|
||||
uint32_t i = 0, j = 0;
|
||||
UChar32 fcdHighStart;
|
||||
const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
|
||||
const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
|
||||
|
||||
while(tok != NULL && U_SUCCESS(*status)) {
|
||||
/* first, check if there are any expansions */
|
||||
|
@ -942,7 +941,7 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
|
|||
if (!src->buildCCTabFlag && el.cSize > 0) {
|
||||
// Check the trailing canonical combining class (tccc) of the last character.
|
||||
const UChar *s = el.cPoints + el.cSize;
|
||||
uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s);
|
||||
uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
|
||||
if ((fcd & 0xff) != 0) {
|
||||
src->buildCCTabFlag = TRUE;
|
||||
}
|
||||
|
|
|
@ -743,15 +743,12 @@ uprv_uca_copyCMTable(tempUCATable *t, UChar *cm, uint16_t *index) {
|
|||
static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) {
|
||||
|
||||
UChar c;
|
||||
uint16_t fcd; // Hi byte is lead combining class.
|
||||
// lo byte is trailing combining class.
|
||||
const uint16_t *fcdTrieIndex;
|
||||
UChar32 fcdHighStart;
|
||||
uint16_t fcd; // Hi byte is lead combining class. lo byte is trailing combining class.
|
||||
UBool buildCMTable = (t->cmLookup==NULL); // flag for building combining class table
|
||||
UChar *cm=NULL;
|
||||
uint16_t index[256];
|
||||
int32_t count=0;
|
||||
fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
|
||||
const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
|
@ -767,7 +764,18 @@ static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) {
|
|||
uprv_memset(index, 0, sizeof(index));
|
||||
}
|
||||
for (c=0; c<0xffff; c++) {
|
||||
fcd = unorm_getFCD16(fcdTrieIndex, c);
|
||||
if (U16_IS_LEAD(c)) {
|
||||
fcd = 0;
|
||||
if (nfcImpl->singleLeadMightHaveNonZeroFCD16(c)) {
|
||||
UChar32 supp = U16_GET_SUPPLEMENTARY(c, 0xdc00);
|
||||
UChar32 suppLimit = supp + 0x400;
|
||||
while (supp < suppLimit) {
|
||||
fcd |= nfcImpl->getFCD16FromNormData(supp++);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fcd = nfcImpl->getFCD16(c);
|
||||
}
|
||||
if (fcd >= 0x100 || // if the leading combining class(c) > 0 ||
|
||||
(U16_IS_LEAD(c) && fcd != 0)) {// c is a leading surrogate with some FCD data
|
||||
if (buildCMTable) {
|
||||
|
@ -1785,12 +1793,11 @@ uprv_uca_addMultiCMContractions(tempUCATable *t,
|
|||
CombinClassTable *cmLookup = t->cmLookup;
|
||||
UChar newDecomp[256];
|
||||
int32_t maxComp, newDecLen;
|
||||
UChar32 fcdHighStart;
|
||||
const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
|
||||
const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
int16_t curClass = (unorm_getFCD16(fcdTrieIndex, c->tailoringCM) & 0xff);
|
||||
int16_t curClass = nfcImpl->getFCD16(c->tailoringCM) & 0xff;
|
||||
CompData *precomp = c->precomp;
|
||||
int32_t compLen = c->compLen;
|
||||
UChar *comp = c->comp;
|
||||
|
@ -1855,12 +1862,11 @@ uprv_uca_addTailCanonicalClosures(tempUCATable *t,
|
|||
UCAElements *el,
|
||||
UErrorCode *status) {
|
||||
CombinClassTable *cmLookup = t->cmLookup;
|
||||
UChar32 fcdHighStart;
|
||||
const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
|
||||
const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
int16_t maxIndex = (unorm_getFCD16(fcdTrieIndex, cMark) & 0xff );
|
||||
int16_t maxIndex = nfcImpl->getFCD16(cMark) & 0xff;
|
||||
UCAElements element;
|
||||
uint16_t *index;
|
||||
UChar decomp[256];
|
||||
|
@ -1874,8 +1880,8 @@ uprv_uca_addTailCanonicalClosures(tempUCATable *t,
|
|||
return;
|
||||
}
|
||||
index = cmLookup->index;
|
||||
int32_t cClass=(unorm_getFCD16(fcdTrieIndex, cMark) & 0xff);
|
||||
maxIndex = (int32_t)index[(unorm_getFCD16(fcdTrieIndex, cMark) & 0xff)-1];
|
||||
int32_t cClass=nfcImpl->getFCD16(cMark) & 0xff;
|
||||
maxIndex = (int32_t)index[(nfcImpl->getFCD16(cMark) & 0xff)-1];
|
||||
c.comp = comp;
|
||||
c.decomp = decomp;
|
||||
c.precomp = precomp;
|
||||
|
@ -1898,7 +1904,7 @@ uprv_uca_addTailCanonicalClosures(tempUCATable *t,
|
|||
// other combining mark combinations.
|
||||
precomp[precompLen].cp=comp[0];
|
||||
curClass = precomp[precompLen].cClass =
|
||||
index[unorm_getFCD16(fcdTrieIndex, decomp[1]) & 0xff];
|
||||
index[nfcImpl->getFCD16(decomp[1]) & 0xff];
|
||||
precompLen++;
|
||||
replacedPos=0;
|
||||
for (decompLen=0; decompLen< (int32_t)el->cSize; decompLen++) {
|
||||
|
@ -1938,7 +1944,7 @@ uprv_uca_addTailCanonicalClosures(tempUCATable *t,
|
|||
// This is a fix for tailoring contractions with accented
|
||||
// character at the end of contraction string.
|
||||
if ((len>2) &&
|
||||
(unorm_getFCD16(fcdTrieIndex, comp[len-2]) & 0xff00)==0) {
|
||||
(nfcImpl->getFCD16(comp[len-2]) & 0xff00)==0) {
|
||||
uprv_uca_addFCD4AccentedContractions(t, colEl, comp, len, &element, status);
|
||||
}
|
||||
|
||||
|
@ -1967,8 +1973,6 @@ uprv_uca_canonicalClosure(tempUCATable *t,
|
|||
UColToken *tok;
|
||||
uint32_t i = 0, j = 0;
|
||||
UChar baseChar, firstCM;
|
||||
UChar32 fcdHighStart;
|
||||
const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
|
||||
context.nfcImpl=Normalizer2Factory::getNFCImpl(*status);
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
|
@ -2039,7 +2043,7 @@ uprv_uca_canonicalClosure(tempUCATable *t,
|
|||
}
|
||||
if(src->UCA != NULL) {
|
||||
for(j = 0; j<el.cSize; j++) {
|
||||
int16_t fcd = unorm_getFCD16(fcdTrieIndex, el.cPoints[j]);
|
||||
int16_t fcd = context.nfcImpl->getFCD16(el.cPoints[j]);
|
||||
if ( (fcd & 0xff) == 0 ) {
|
||||
baseChar = el.cPoints[j]; // last base character
|
||||
firstCM=0; // reset combining mark value
|
||||
|
|
|
@ -37,8 +37,7 @@ U_NAMESPACE_USE
|
|||
#define SECOND_LAST_BYTE_SHIFT_ 8
|
||||
#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
|
||||
|
||||
static const uint16_t *fcdTrieIndex = NULL;
|
||||
static UChar32 fcdHighStart = 0;
|
||||
static const Normalizer2Impl *g_nfcImpl = NULL;
|
||||
|
||||
// internal methods -------------------------------------------------
|
||||
|
||||
|
@ -103,7 +102,7 @@ inline int hash(uint32_t ce)
|
|||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV
|
||||
usearch_cleanup(void) {
|
||||
fcdTrieIndex = NULL;
|
||||
g_nfcImpl = NULL;
|
||||
return TRUE;
|
||||
}
|
||||
U_CDECL_END
|
||||
|
@ -117,8 +116,8 @@ U_CDECL_END
|
|||
static
|
||||
inline void initializeFCD(UErrorCode *status)
|
||||
{
|
||||
if (fcdTrieIndex == NULL) {
|
||||
fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
|
||||
if (g_nfcImpl == NULL) {
|
||||
g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
|
||||
ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
|
||||
}
|
||||
}
|
||||
|
@ -138,7 +137,7 @@ uint16_t getFCD(const UChar *str, int32_t *offset,
|
|||
int32_t strlength)
|
||||
{
|
||||
const UChar *temp = str + *offset;
|
||||
uint16_t result = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, temp, str + strlength);
|
||||
uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
|
||||
*offset = (int32_t)(temp - str);
|
||||
return result;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue