mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-2481 prototype tailored normalization; remove old ignore_hangul
X-SVN-Rev: 11065
This commit is contained in:
parent
a36de8f446
commit
d2966f1a4d
5 changed files with 470 additions and 106 deletions
|
@ -672,7 +672,7 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_
|
|||
int32_t tempLen = inputLen + bufLen;
|
||||
|
||||
UChar trial[bufSize];
|
||||
unorm_decompose(trial, bufSize, temp, tempLen, FALSE, FALSE, &status);
|
||||
unorm_decompose(trial, bufSize, temp, tempLen, FALSE, 0, &status);
|
||||
|
||||
/* Test for buffer overflows */
|
||||
if(U_FAILURE(status)) {
|
||||
|
|
|
@ -18,12 +18,6 @@
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef ICU_UNORM_USE_DEPRECATES
|
||||
enum {
|
||||
IGNORE_HANGUL=1
|
||||
};
|
||||
#endif /* ICU_UNORM_USE_DEPRECATES */
|
||||
|
||||
const char Normalizer::fgClassID=0;
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
|
@ -197,7 +191,7 @@ Normalizer::normalize(const UnicodeString& source,
|
|||
UChar *buffer=dest->getBuffer(source.length());
|
||||
int32_t length=unorm_internalNormalize(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
mode, (options&IGNORE_HANGUL)!=0,
|
||||
mode, options,
|
||||
&status);
|
||||
dest->releaseBuffer(length);
|
||||
if(status==U_BUFFER_OVERFLOW_ERROR) {
|
||||
|
@ -205,7 +199,7 @@ Normalizer::normalize(const UnicodeString& source,
|
|||
buffer=dest->getBuffer(length);
|
||||
length=unorm_internalNormalize(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
mode, (options&IGNORE_HANGUL)!=0,
|
||||
mode, options,
|
||||
&status);
|
||||
dest->releaseBuffer(length);
|
||||
}
|
||||
|
@ -243,7 +237,7 @@ Normalizer::compose(const UnicodeString& source,
|
|||
UChar *buffer=dest->getBuffer(source.length());
|
||||
int32_t length=unorm_compose(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
compat, (options&IGNORE_HANGUL)!=0,
|
||||
compat, options,
|
||||
&status);
|
||||
dest->releaseBuffer(length);
|
||||
if(status==U_BUFFER_OVERFLOW_ERROR) {
|
||||
|
@ -251,7 +245,7 @@ Normalizer::compose(const UnicodeString& source,
|
|||
buffer=dest->getBuffer(length);
|
||||
length=unorm_compose(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
compat, (options&IGNORE_HANGUL)!=0,
|
||||
compat, options,
|
||||
&status);
|
||||
dest->releaseBuffer(length);
|
||||
}
|
||||
|
@ -289,7 +283,7 @@ Normalizer::decompose(const UnicodeString& source,
|
|||
UChar *buffer=dest->getBuffer(source.length());
|
||||
int32_t length=unorm_decompose(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
compat, (options&IGNORE_HANGUL)!=0,
|
||||
compat, options,
|
||||
&status);
|
||||
dest->releaseBuffer(length);
|
||||
if(status==U_BUFFER_OVERFLOW_ERROR) {
|
||||
|
@ -297,7 +291,7 @@ Normalizer::decompose(const UnicodeString& source,
|
|||
buffer=dest->getBuffer(length);
|
||||
length=unorm_decompose(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
compat, (options&IGNORE_HANGUL)!=0,
|
||||
compat, options,
|
||||
&status);
|
||||
dest->releaseBuffer(length);
|
||||
}
|
||||
|
|
|
@ -173,18 +173,6 @@ typedef enum {
|
|||
* @obsolete ICU 2.4. Use UNORM_NFKC instead since this API will be removed in that release.
|
||||
*/
|
||||
UCOL_DECOMP_COMPAT_COMP_CAN =5,
|
||||
|
||||
/**
|
||||
* Do not normalize Hangul.
|
||||
* @obsolete ICU 2.2. Obsolete option, to be removed (or moved to private for documentation) in that release.
|
||||
*/
|
||||
UCOL_IGNORE_HANGUL = 16,
|
||||
|
||||
/**
|
||||
* Do not normalize Hangul.
|
||||
* @obsolete ICU 2.2. Obsolete option, to be removed (or moved to private for documentation) in that release.
|
||||
*/
|
||||
UNORM_IGNORE_HANGUL = 16
|
||||
#endif /* ICU_UNORM_USE_DEPRECATES */
|
||||
} UNormalizationMode;
|
||||
|
||||
|
|
|
@ -35,6 +35,24 @@
|
|||
#include "unicode/uset.h"
|
||||
#include "unormimp.h"
|
||||
|
||||
/* ### TODO: These depend on whether tailored normalization becomes permanent. */
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/usetiter.h"
|
||||
|
||||
/*
|
||||
* ### TODO: status of prototype for tailored normalization
|
||||
*
|
||||
* My main thrust so far was for unorm_normalize() and unorm_quickCheck().
|
||||
* isNormalized() should work, I think.
|
||||
* I have not yet thought about iterative normalization at all.
|
||||
*
|
||||
* Generally, any function that searches for a safe boundary has not been touched,
|
||||
* which means that these functions will be over-pessimistic when
|
||||
* exclusions are applied.
|
||||
* This may not matter because subsequent checks and normalizations do apply the exclusions.
|
||||
*/
|
||||
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
/*
|
||||
* This new implementation of the normalization code loads its data from
|
||||
* unorm.dat, which is generated with the gennorm tool.
|
||||
|
@ -47,12 +65,6 @@ enum {
|
|||
_STACK_BUFFER_CAPACITY=100
|
||||
};
|
||||
|
||||
#ifndef ICU_UNORM_USE_DEPRECATES
|
||||
enum {
|
||||
UNORM_IGNORE_HANGUL=16
|
||||
};
|
||||
#endif /* ICU_UNORM_USE_DEPRECATES */
|
||||
|
||||
/* Korean Hangul and Jamo constants */
|
||||
enum {
|
||||
JAMO_L_BASE=0x1100, /* "lead" jamo */
|
||||
|
@ -112,6 +124,8 @@ isJamoVTNorm32JamoV(uint32_t norm32) {
|
|||
return norm32<_NORM_JAMO_V_TOP;
|
||||
}
|
||||
|
||||
/* some prototypes ---------------------------------------------------------- */
|
||||
|
||||
static const UChar *
|
||||
_findPreviousStarter(const UChar *start, const UChar *src,
|
||||
uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe);
|
||||
|
@ -124,6 +138,7 @@ static const UChar *
|
|||
_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
|
||||
const UChar *prevStarter, const UChar *src,
|
||||
uint32_t qcMask, uint8_t &prevCC,
|
||||
const UnicodeSet *dx,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/* load unorm.dat ----------------------------------------------------------- */
|
||||
|
@ -151,10 +166,15 @@ static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
|
|||
/* the Unicode version of the normalization data */
|
||||
static UVersionInfo dataVersion={ 3, 1, 0, 0 };
|
||||
|
||||
/* ### TODO: prototype ### cache UnicodeSets for each combination of exclusion flags */
|
||||
static UnicodeSet *dxCache[UNORM_DX_MASK+1]={ NULL };
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
UBool
|
||||
unorm_cleanup() {
|
||||
int32_t i;
|
||||
|
||||
if(normData!=NULL) {
|
||||
udata_close(normData);
|
||||
normData=NULL;
|
||||
|
@ -162,6 +182,10 @@ unorm_cleanup() {
|
|||
dataErrorCode=U_ZERO_ERROR;
|
||||
haveNormData=0;
|
||||
|
||||
for(i=0; i<LENGTHOF(dxCache); ++i) {
|
||||
delete dxCache[i];
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
@ -374,6 +398,201 @@ _getExtraData(uint32_t norm32) {
|
|||
return extraData+(norm32>>_NORM_EXTRA_SHIFT);
|
||||
}
|
||||
|
||||
/* decomposition exclusion sets --------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Decomposition exclusion UnicodeSets are used for tailored normalization,
|
||||
* Unicode public review issue number 7. (http://www.unicode.org/review/)
|
||||
*
|
||||
* By specifying one or several sets of code points,
|
||||
* those do not get decomposed in normalization, even though Unicode might
|
||||
* otherwise define a decomposition for them.
|
||||
*
|
||||
* ### TODO: This is a prototype. Assess if it should become a permanent part of ICU.
|
||||
*/
|
||||
|
||||
/*
 * Return the cached decomposition exclusion set for UNORM_DX_HANGUL,
 * building it on first use. The set covers the single code point range
 * U+AC00..U+D7A3.
 *
 * @param errorCode Set to U_MEMORY_ALLOCATION_ERROR if the set cannot be
 *                  allocated; not otherwise modified.
 * @return The cached set, or NULL if the allocation failed.
 */
static const UnicodeSet *
internalGetDXHangul(UErrorCode &errorCode) {
    /* internal function, does not check for incoming U_FAILURE */

    if(dxCache[UNORM_DX_HANGUL]!=NULL) {
        /* already built */
        return dxCache[UNORM_DX_HANGUL];
    }

    UnicodeSet *hangul=new UnicodeSet(0xac00, 0xd7a3);
    if(hangul==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    /* publish the new set unless the cache slot was filled in the meantime */
    umtx_lock(NULL);
    if(dxCache[UNORM_DX_HANGUL]==NULL) {
        dxCache[UNORM_DX_HANGUL]=hangul;
        hangul=NULL; /* ownership moved into the cache */
    }
    umtx_unlock(NULL);

    /* non-NULL only if the slot was already occupied; discard our copy */
    delete hangul;

    return dxCache[UNORM_DX_HANGUL];
}
|
||||
|
||||
/*
 * Return the cached decomposition exclusion set for UNORM_DX_CJK_COMPAT,
 * building it on first use.
 *
 * The set is computed as [:Ideographic:] minus every ideograph whose
 * norm32 value has the _NORM_QC_NFD bit set. The normalization data
 * (normTrie) is read while building.
 *
 * NOTE(review): the result holds the ideographs that do NOT carry the
 * _NORM_QC_NFD flag. Confirm that the difference, rather than the
 * intersection, is what an exclusion set for CJK compatibility needs.
 *
 * @param errorCode Set to U_MEMORY_ALLOCATION_ERROR on allocation failure,
 *                  or to whatever error the UnicodeSet pattern constructor
 *                  reports.
 * @return The cached set, or NULL on failure.
 */
static const UnicodeSet *
internalGetDXCJKCompat(UErrorCode &errorCode) {
    /* internal function, does not check for incoming U_FAILURE */

    if(dxCache[UNORM_DX_CJK_COMPAT]==NULL) {
        /* build a set from [CJK Ideographs]-[has canonical decomposition] */
        UnicodeSet *set, *hasDecomp;

        set=new UnicodeSet(UNICODE_STRING("[:Ideographic:]", 15), errorCode);
        if(set==NULL) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        if(U_FAILURE(errorCode)) {
            delete set;
            return NULL;
        }

        /* start with an empty set for [has canonical decomposition] */
        hasDecomp=new UnicodeSet();
        if(hasDecomp==NULL) {
            delete set;
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }

        /* iterate over all ideographs and remember which canonically decompose */
        UnicodeSetIterator it(*set);
        UChar32 start, end;
        uint32_t norm32;

        while(it.nextRange() && !it.isString()) {
            start=it.getCodepoint();
            end=it.getCodepointEnd();
            while(start<=end) {
                UTRIE_GET32(&normTrie, start, norm32);
                if(norm32&_NORM_QC_NFD) {
                    hasDecomp->add(start);
                }
                ++start;
            }
        }

        /* compute set difference */
        set->removeAll(*hasDecomp);

        /* the helper set is only needed for the difference; free it (was leaked) */
        delete hasDecomp;

        /* publish the new set unless the cache slot was filled in the meantime */
        umtx_lock(NULL);
        if(dxCache[UNORM_DX_CJK_COMPAT]==NULL) {
            dxCache[UNORM_DX_CJK_COMPAT]=set;
            set=NULL; /* ownership moved into the cache */
        }
        umtx_unlock(NULL);

        /* non-NULL only if the slot was already occupied; discard our copy */
        delete set;
    }

    return dxCache[UNORM_DX_CJK_COMPAT];
}
|
||||
|
||||
/*
 * Return the cached decomposition exclusion set for UNORM_DX_A_UMLAUT,
 * building it on first use. The set contains only the code point U+00E4.
 *
 * @param errorCode Set to U_MEMORY_ALLOCATION_ERROR if the set cannot be
 *                  allocated; not otherwise modified.
 * @return The cached set, or NULL if the allocation failed.
 */
static const UnicodeSet *
internalGetDXAUmlaut(UErrorCode &errorCode) {
    /* internal function, does not check for incoming U_FAILURE */

    if(dxCache[UNORM_DX_A_UMLAUT]!=NULL) {
        /* already built */
        return dxCache[UNORM_DX_A_UMLAUT];
    }

    UnicodeSet *aUmlaut=new UnicodeSet(0xe4, 0xe4);
    if(aUmlaut==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    /* publish the new set unless the cache slot was filled in the meantime */
    umtx_lock(NULL);
    if(dxCache[UNORM_DX_A_UMLAUT]==NULL) {
        dxCache[UNORM_DX_A_UMLAUT]=aUmlaut;
        aUmlaut=NULL; /* ownership moved into the cache */
    }
    umtx_unlock(NULL);

    /* non-NULL only if the slot was already occupied; discard our copy */
    delete aUmlaut;

    return dxCache[UNORM_DX_A_UMLAUT];
}
|
||||
|
||||
/* Get a decomposition exclusion set. The data must be loaded. */
|
||||
static const UnicodeSet *
|
||||
internalGetDX(int32_t options, UErrorCode &errorCode) {
|
||||
if(dxCache[options]==NULL) {
|
||||
/* return basic sets */
|
||||
if(options==UNORM_DX_HANGUL) {
|
||||
return internalGetDXHangul(errorCode);
|
||||
}
|
||||
if(options==UNORM_DX_CJK_COMPAT) {
|
||||
return internalGetDXCJKCompat(errorCode);
|
||||
}
|
||||
if(options==UNORM_DX_A_UMLAUT) {
|
||||
return internalGetDXCJKCompat(errorCode);
|
||||
}
|
||||
|
||||
/* build a set from multiple subsets */
|
||||
UnicodeSet *set;
|
||||
const UnicodeSet *other;
|
||||
|
||||
set=new UnicodeSet();
|
||||
if(set==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if((options&UNORM_DX_HANGUL)!=0 && NULL!=(other=internalGetDXHangul(errorCode))) {
|
||||
set->addAll(*other);
|
||||
}
|
||||
if((options&UNORM_DX_CJK_COMPAT)!=0 && NULL!=(other=internalGetDXCJKCompat(errorCode))) {
|
||||
set->addAll(*other);
|
||||
}
|
||||
if((options&UNORM_DX_A_UMLAUT)!=0 && NULL!=(other=internalGetDXAUmlaut(errorCode))) {
|
||||
set->addAll(*other);
|
||||
}
|
||||
|
||||
if(U_FAILURE(errorCode)) {
|
||||
delete set;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
umtx_lock(NULL);
|
||||
if(dxCache[options]==NULL) {
|
||||
dxCache[options]=set;
|
||||
set=NULL;
|
||||
}
|
||||
umtx_unlock(NULL);
|
||||
|
||||
delete set;
|
||||
}
|
||||
|
||||
return dxCache[options];
|
||||
}
|
||||
|
||||
/*
 * Map normalization options to a decomposition exclusion set.
 *
 * @param options   Option bits; only the bits in UNORM_DX_MASK are used.
 * @param errorCode In/out error code; nothing is done if it already
 *                  indicates a failure.
 * @return NULL on incoming failure or when no exclusion bits are set,
 *         otherwise the result of internalGetDX() for the masked options.
 */
static inline const UnicodeSet *
getDX(int32_t options, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        /* incoming failure */
        return NULL;
    }

    options&=UNORM_DX_MASK;
    if(options==0) {
        /* no decomposition exclusions requested */
        return NULL;
    }

    return internalGetDX(options, errorCode);
}
|
||||
|
||||
/*
 * Test whether code point c is in the exclusion set dx.
 * A NULL set is treated as empty.
 */
static inline UBool
dx_contains(const UnicodeSet *dx, UChar32 c) {
    UBool excluded=FALSE;

    if(dx!=NULL && dx->contains(c)) {
        excluded=TRUE;
    }
    return excluded;
}
|
||||
|
||||
/*
 * Test whether the code point given as code units (c, c2) is in the
 * exclusion set dx. If c2==0 then c alone is the code point; otherwise
 * the two units are combined with U16_GET_SUPPLEMENTARY().
 * A NULL set is treated as empty.
 */
static inline UBool
dx_contains(const UnicodeSet *dx, UChar c, UChar c2) {
    if(dx==NULL) {
        return FALSE;
    }

    UChar32 cp;
    if(c2==0) {
        cp=c;
    } else {
        cp=U16_GET_SUPPLEMENTARY(c, c2);
    }
    return dx->contains(cp) ? TRUE : FALSE;
}
|
||||
|
||||
/* other normalization primitives ------------------------------------------- */
|
||||
|
||||
/* get the canonical or compatibility decomposition for one character */
|
||||
static inline const UChar *
|
||||
_decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
|
||||
|
@ -1013,7 +1232,7 @@ _mergeOrdered(UChar *start, UChar *current,
|
|||
/* quick check functions ---------------------------------------------------- */
|
||||
|
||||
static UBool
|
||||
unorm_checkFCD(const UChar *src, int32_t srcLength) {
|
||||
unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *dx) {
|
||||
const UChar *limit;
|
||||
UChar c, c2;
|
||||
uint16_t fcd16;
|
||||
|
@ -1076,8 +1295,27 @@ unorm_checkFCD(const UChar *src, int32_t srcLength) {
|
|||
++src;
|
||||
fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
|
||||
} else {
|
||||
c2=0;
|
||||
fcd16=0;
|
||||
}
|
||||
} else {
|
||||
c2=0;
|
||||
}
|
||||
|
||||
/*
|
||||
* If (c, c2) is excluded, then replace the code point's FCD data
|
||||
* with the regular UCD cc because it does not decompose.
|
||||
*/
|
||||
if(dx!=NULL) {
|
||||
UChar32 cp;
|
||||
|
||||
cp= c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2);
|
||||
if(dx->contains(cp)) {
|
||||
uint32_t norm32;
|
||||
UTRIE_GET32(&normTrie, cp, norm32);
|
||||
/* This depends on knowing that _NORM_CC_MASK==0xff00 */
|
||||
fcd16=(uint16_t)(norm32&0xff00)|(((uint16_t)norm32)>>8);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1093,7 +1331,11 @@ unorm_checkFCD(const UChar *src, int32_t srcLength) {
|
|||
if(cc!=0) {
|
||||
if(prevCC<0) {
|
||||
/* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
|
||||
prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
|
||||
if(!dx_contains(dx, (UChar32)-prevCC)) {
|
||||
prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
|
||||
} else {
|
||||
prevCC=0; /* excluded; UCD cc's of code points <U+0300 are all 0 */
|
||||
}
|
||||
}
|
||||
|
||||
if(cc<prevCC) {
|
||||
|
@ -1109,6 +1351,7 @@ _quickCheck(const UChar *src,
|
|||
int32_t srcLength,
|
||||
UNormalizationMode mode,
|
||||
UBool allowMaybe,
|
||||
const UnicodeSet *dx,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar stackBuffer[_STACK_BUFFER_CAPACITY];
|
||||
UChar *buffer;
|
||||
|
@ -1153,7 +1396,7 @@ _quickCheck(const UChar *src,
|
|||
qcMask=_NORM_QC_NFKD;
|
||||
break;
|
||||
case UNORM_FCD:
|
||||
return unorm_checkFCD(src, srcLength) ? UNORM_YES : UNORM_NO;
|
||||
return unorm_checkFCD(src, srcLength, dx) ? UNORM_YES : UNORM_NO;
|
||||
default:
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return UNORM_MAYBE;
|
||||
|
@ -1210,8 +1453,11 @@ _quickCheck(const UChar *src,
|
|||
++src;
|
||||
norm32=_getNorm32FromSurrogatePair(norm32, c2);
|
||||
} else {
|
||||
c2=0;
|
||||
norm32=0;
|
||||
}
|
||||
} else {
|
||||
c2=0;
|
||||
}
|
||||
|
||||
/* check the combining order */
|
||||
|
@ -1223,6 +1469,11 @@ _quickCheck(const UChar *src,
|
|||
prevCC=cc;
|
||||
|
||||
/* check for "no" or "maybe" quick check flags */
|
||||
if(dx_contains(dx, c, c2)) {
|
||||
/* excluded: treat like "yes" */
|
||||
continue;
|
||||
}
|
||||
|
||||
qcNorm32=norm32&qcMask;
|
||||
if(qcNorm32&_NORM_QC_ANY_NO) {
|
||||
result=UNORM_NO;
|
||||
|
@ -1255,7 +1506,7 @@ _quickCheck(const UChar *src,
|
|||
prevStarter,
|
||||
src,
|
||||
qcMask,
|
||||
prevCC, pErrorCode);
|
||||
prevCC, dx, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
result=UNORM_MAYBE; /* error (out of memory) */
|
||||
break;
|
||||
|
@ -1283,16 +1534,23 @@ endloop:
|
|||
U_CAPI UNormalizationCheckResult U_EXPORT2
|
||||
unorm_quickCheck(const UChar *src,
|
||||
int32_t srcLength,
|
||||
UNormalizationMode mode,
|
||||
UNormalizationMode mode,
|
||||
UErrorCode *pErrorCode) {
|
||||
return _quickCheck(src, srcLength, mode, TRUE, pErrorCode);
|
||||
return _quickCheck(src, srcLength, mode, TRUE, NULL, pErrorCode);
|
||||
}
|
||||
|
||||
/*
 * Quick check with tailored normalization: like unorm_quickCheck(), but
 * the code points selected by the UNORM_DX_... bits in options are passed
 * to _quickCheck() as a decomposition exclusion set.
 *
 * The normalization data is loaded before the exclusion set is requested,
 * because building an exclusion set may read the normalization trie
 * (same order as in unorm_compose()).
 *
 * @param src        Input text.
 * @param srcLength  Length of src, passed through to _quickCheck().
 * @param mode       Normalization mode to check against.
 * @param options    Option bits; getDX() uses the bits in UNORM_DX_MASK.
 * @param pErrorCode In/out error code.
 * @return The _quickCheck() result; UNORM_MAYBE if pErrorCode is NULL or
 *         the normalization data is not available.
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckTailored(const UChar *src, int32_t srcLength,
                         UNormalizationMode mode, int32_t options,
                         UErrorCode *pErrorCode) {
    /* getDX() requires loaded data; also avoid dereferencing a NULL pErrorCode */
    if(pErrorCode==NULL || !_haveData(*pErrorCode)) {
        return UNORM_MAYBE;
    }

    return _quickCheck(src, srcLength, mode, TRUE, getDX(options, *pErrorCode), pErrorCode);
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
unorm_isNormalized(const UChar *src, int32_t srcLength,
|
||||
UNormalizationMode mode,
|
||||
UErrorCode *pErrorCode) {
|
||||
return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, pErrorCode));
|
||||
return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode));
|
||||
}
|
||||
|
||||
/* make NFD & NFKD ---------------------------------------------------------- */
|
||||
|
@ -1386,7 +1644,7 @@ unorm_getDecomposition(UChar32 c, UBool compat,
|
|||
static int32_t
|
||||
_decompose(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UBool compat, const UnicodeSet *dx,
|
||||
uint8_t &outTrailCC) {
|
||||
UChar buffer[3];
|
||||
const UChar *limit, *prevSrc, *p;
|
||||
|
@ -1470,7 +1728,7 @@ _decompose(UChar *dest, int32_t destCapacity,
|
|||
* otherwise, p[length] is merged in with _mergeOrdered()
|
||||
*/
|
||||
if(isNorm32HangulOrJamo(norm32)) {
|
||||
if(ignoreHangul) {
|
||||
if(dx_contains(dx, c)) {
|
||||
c2=0;
|
||||
p=NULL;
|
||||
length=1;
|
||||
|
@ -1511,7 +1769,7 @@ _decompose(UChar *dest, int32_t destCapacity,
|
|||
}
|
||||
|
||||
/* get the decomposition and the lead and trail cc's */
|
||||
if((norm32&qcMask)==0) {
|
||||
if((norm32&qcMask)==0 || dx_contains(dx, c, c2)) {
|
||||
/* c does not decompose */
|
||||
cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
|
||||
p=NULL;
|
||||
|
@ -1575,8 +1833,9 @@ _decompose(UChar *dest, int32_t destCapacity,
|
|||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_decompose(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UBool compat, int32_t options,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UnicodeSet *dx;
|
||||
int32_t destIndex;
|
||||
uint8_t trailCC;
|
||||
|
||||
|
@ -1584,9 +1843,14 @@ unorm_decompose(UChar *dest, int32_t destCapacity,
|
|||
return 0;
|
||||
}
|
||||
|
||||
dx=getDX(options, *pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
destIndex=_decompose(dest, destCapacity,
|
||||
src, srcLength,
|
||||
compat, ignoreHangul,
|
||||
compat, dx,
|
||||
trailCC);
|
||||
|
||||
return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
|
||||
|
@ -1648,7 +1912,8 @@ _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
|
|||
|
||||
static uint8_t
|
||||
_decomposeFCD(const UChar *src, const UChar *decompLimit,
|
||||
UChar *dest, int32_t &destIndex, int32_t destCapacity) {
|
||||
UChar *dest, int32_t &destIndex, int32_t destCapacity,
|
||||
const UnicodeSet *dx) {
|
||||
const UChar *p;
|
||||
uint32_t norm32;
|
||||
int32_t reorderStartIndex, length;
|
||||
|
@ -1694,7 +1959,7 @@ _decomposeFCD(const UChar *src, const UChar *decompLimit,
|
|||
}
|
||||
|
||||
/* get the decomposition and the lead and trail cc's */
|
||||
if((norm32&_NORM_QC_NFD)==0) {
|
||||
if((norm32&_NORM_QC_NFD)==0 || dx_contains(dx, c, c2)) {
|
||||
/* c does not decompose */
|
||||
cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
|
||||
p=NULL;
|
||||
|
@ -1756,6 +2021,7 @@ _decomposeFCD(const UChar *src, const UChar *decompLimit,
|
|||
static int32_t
|
||||
unorm_makeFCD(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
const UnicodeSet *dx,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UChar *limit, *prevSrc, *decompStart;
|
||||
int32_t destIndex, length;
|
||||
|
@ -1839,7 +2105,11 @@ unorm_makeFCD(UChar *dest, int32_t destCapacity,
|
|||
/* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
|
||||
if(prevCC<0) {
|
||||
/* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
|
||||
prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
|
||||
if(!dx_contains(dx, (UChar32)-prevCC)) {
|
||||
prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
|
||||
} else {
|
||||
prevCC=0; /* excluded; UCD cc's of code points <U+0300 are all 0 */
|
||||
}
|
||||
|
||||
/*
|
||||
* set a pointer to this below-U+0300 character;
|
||||
|
@ -1883,6 +2153,22 @@ unorm_makeFCD(UChar *dest, int32_t destCapacity,
|
|||
|
||||
/* we are looking at the character (c, c2) at [prevSrc..src[ */
|
||||
|
||||
/*
|
||||
* If (c, c2) is excluded, then replace the code point's FCD data
|
||||
* with the regular UCD cc because it does not decompose.
|
||||
*/
|
||||
if(dx!=NULL) {
|
||||
UChar32 cp;
|
||||
|
||||
cp= c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2);
|
||||
if(dx->contains(cp)) {
|
||||
uint32_t norm32;
|
||||
UTRIE_GET32(&normTrie, cp, norm32);
|
||||
/* This depends on knowing that _NORM_CC_MASK==0xff00 */
|
||||
fcd16=(uint16_t)(norm32&0xff00)|(((uint16_t)norm32)>>8);
|
||||
}
|
||||
}
|
||||
|
||||
/* check the combining order, get the lead cc */
|
||||
cc=(int16_t)(fcd16>>8);
|
||||
if(cc==0 || cc>=prevCC) {
|
||||
|
@ -1921,7 +2207,8 @@ unorm_makeFCD(UChar *dest, int32_t destCapacity,
|
|||
* decompose and reorder a limited piece of the text
|
||||
*/
|
||||
prevCC=_decomposeFCD(decompStart, src,
|
||||
dest, destIndex, destCapacity);
|
||||
dest, destIndex, destCapacity,
|
||||
dx);
|
||||
decompStart=src;
|
||||
}
|
||||
}
|
||||
|
@ -1935,7 +2222,8 @@ unorm_makeFCD(UChar *dest, int32_t destCapacity,
|
|||
static inline uint32_t
|
||||
_getNextCombining(UChar *&p, const UChar *limit,
|
||||
UChar &c, UChar &c2,
|
||||
uint16_t &combiningIndex, uint8_t &cc) {
|
||||
uint16_t &combiningIndex, uint8_t &cc,
|
||||
const UnicodeSet *dx) {
|
||||
uint32_t norm32, combineFlags;
|
||||
|
||||
c=*p++;
|
||||
|
@ -1951,9 +2239,14 @@ _getNextCombining(UChar *&p, const UChar *limit,
|
|||
} else if(isNorm32HangulOrJamo(norm32)) {
|
||||
/* a compatibility decomposition contained Jamos */
|
||||
c2=0;
|
||||
combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
|
||||
cc=0;
|
||||
return norm32&_NORM_COMBINES_ANY;
|
||||
if(!dx_contains(dx, c)) {
|
||||
combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
|
||||
return norm32&_NORM_COMBINES_ANY;
|
||||
} else {
|
||||
combiningIndex=0;
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
/* c is a lead surrogate, get the real norm32 */
|
||||
if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
|
||||
|
@ -1967,13 +2260,19 @@ _getNextCombining(UChar *&p, const UChar *limit,
|
|||
}
|
||||
}
|
||||
|
||||
combineFlags=norm32&_NORM_COMBINES_ANY;
|
||||
if(combineFlags!=0) {
|
||||
combiningIndex=*(_getExtraData(norm32)-1);
|
||||
}
|
||||
|
||||
cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
|
||||
return combineFlags;
|
||||
|
||||
if(!dx_contains(dx, c, c2)) {
|
||||
combineFlags=norm32&_NORM_COMBINES_ANY;
|
||||
if(combineFlags!=0) {
|
||||
combiningIndex=*(_getExtraData(norm32)-1);
|
||||
}
|
||||
|
||||
return combineFlags;
|
||||
} else {
|
||||
combiningIndex=0;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2073,7 +2372,7 @@ _combine(const uint16_t *table, uint16_t combineBackIndex,
|
|||
* while the combining mark that is removed has at least one code unit
|
||||
*/
|
||||
static uint8_t
|
||||
_recompose(UChar *p, UChar *&limit) {
|
||||
_recompose(UChar *p, UChar *&limit, const UnicodeSet *dx) {
|
||||
UChar *starter, *pRemove, *q, *r;
|
||||
uint32_t combineFlags;
|
||||
UChar c, c2;
|
||||
|
@ -2090,7 +2389,7 @@ _recompose(UChar *p, UChar *&limit) {
|
|||
prevCC=0;
|
||||
|
||||
for(;;) {
|
||||
combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc);
|
||||
combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, dx);
|
||||
if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
|
||||
if(combineBackIndex&0x8000) {
|
||||
/* c is a Jamo V/T, see if we can compose it with the previous character */
|
||||
|
@ -2102,18 +2401,27 @@ _recompose(UChar *p, UChar *&limit) {
|
|||
if(c2<JAMO_L_COUNT) {
|
||||
pRemove=p-1;
|
||||
c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
|
||||
if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
|
||||
if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT && !dx_contains(dx, c2)) {
|
||||
++p;
|
||||
c+=c2;
|
||||
}
|
||||
*starter=c;
|
||||
if(!dx_contains(dx, c)) {
|
||||
*starter=c;
|
||||
} else {
|
||||
/* excluded */
|
||||
if(!isHangulWithoutJamoT(c)) {
|
||||
--p; /* undo the ++p from reading the Jamo T */
|
||||
}
|
||||
/* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
|
||||
pRemove=NULL;
|
||||
}
|
||||
}
|
||||
#if 0
|
||||
/*
|
||||
* The following is disabled with #if 0 because it can not occur:
|
||||
* Since the input is in NFD, there are no Hangul LV syllables that
|
||||
* a Jamo T could combine with.
|
||||
* All Jamo Ts are combined above when handling Jamo Ls.
|
||||
* All Jamo Ts are combined above when handling Jamo Vs.
|
||||
*/
|
||||
} else {
|
||||
/* Jamo T, compose with previous Hangul that does not have a Jamo T */
|
||||
|
@ -2150,8 +2458,10 @@ _recompose(UChar *p, UChar *&limit) {
|
|||
!(combineFwdIndex&0x8000) &&
|
||||
/* the combining mark is not blocked and */
|
||||
(prevCC<cc || prevCC==0) &&
|
||||
/* the starter and the combining mark (c, c2) do combine */
|
||||
0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2))
|
||||
/* the starter and the combining mark (c, c2) do combine and */
|
||||
0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
|
||||
/* the composition result is not excluded */
|
||||
!dx_contains(dx, value, value2)
|
||||
) {
|
||||
/* replace the starter with the composition, remove the combining mark */
|
||||
pRemove= c2==0 ? p-1 : p-2; /* pointer to the combining mark */
|
||||
|
@ -2224,7 +2534,7 @@ _recompose(UChar *p, UChar *&limit) {
|
|||
|
||||
/* if (c, c2) did not combine, then check if it is a starter */
|
||||
if(cc==0) {
|
||||
/* found a new starter */
|
||||
/* found a new starter; combineFlags==0 if (c, c2) is excluded */
|
||||
if(combineFlags&_NORM_COMBINES_FWD) {
|
||||
/* it may combine with something, prepare for it */
|
||||
if(c2==0) {
|
||||
|
@ -2321,6 +2631,7 @@ static const UChar *
|
|||
_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
|
||||
const UChar *prevStarter, const UChar *src,
|
||||
uint32_t qcMask, uint8_t &prevCC,
|
||||
const UnicodeSet *dx,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar *recomposeLimit;
|
||||
uint8_t trailCC;
|
||||
|
@ -2331,7 +2642,7 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_
|
|||
/* decompose [prevStarter..src[ */
|
||||
length=_decompose(buffer, bufferCapacity,
|
||||
prevStarter, src-prevStarter,
|
||||
compat, FALSE,
|
||||
compat, dx,
|
||||
trailCC);
|
||||
if(length>bufferCapacity) {
|
||||
if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
|
||||
|
@ -2340,14 +2651,14 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_
|
|||
}
|
||||
length=_decompose(buffer, bufferCapacity,
|
||||
prevStarter, src-prevStarter,
|
||||
compat, FALSE,
|
||||
compat, dx,
|
||||
trailCC);
|
||||
}
|
||||
|
||||
/* recompose the decomposition */
|
||||
recomposeLimit=buffer+length;
|
||||
if(length>=2) {
|
||||
prevCC=_recompose(buffer, recomposeLimit);
|
||||
prevCC=_recompose(buffer, recomposeLimit, dx);
|
||||
}
|
||||
|
||||
/* return with a pointer to the recomposition and its length */
|
||||
|
@ -2357,7 +2668,10 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_
|
|||
|
||||
static inline UBool
|
||||
_composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
|
||||
UBool compat, UChar *dest) {
|
||||
UBool compat, UChar *dest, const UnicodeSet *dx) {
|
||||
if(dx!=NULL && (dx->contains(prev) || dx->contains(c))) {
|
||||
return FALSE;
|
||||
}
|
||||
if(isJamoVTNorm32JamoV(norm32)) {
|
||||
/* c is a Jamo V, compose with previous Jamo L and following Jamo T */
|
||||
prev=(UChar)(prev-JAMO_L_BASE);
|
||||
|
@ -2369,7 +2683,9 @@ _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UC
|
|||
UChar next, t;
|
||||
|
||||
next=*src;
|
||||
if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
|
||||
if(dx_contains(dx, next)) {
|
||||
/* excluded */
|
||||
} else if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
|
||||
/* normal Jamo T */
|
||||
++src;
|
||||
c+=t;
|
||||
|
@ -2390,6 +2706,12 @@ _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UC
|
|||
}
|
||||
}
|
||||
}
|
||||
if(dx_contains(dx, c)) {
|
||||
if(!isHangulWithoutJamoT(c)) {
|
||||
--src; /* undo ++src from reading the Jamo T */
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
if(dest!=0) {
|
||||
*dest=c;
|
||||
}
|
||||
|
@ -2397,8 +2719,12 @@ _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UC
|
|||
}
|
||||
} else if(isHangulWithoutJamoT(prev)) {
|
||||
/* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
|
||||
c=(UChar)(prev+(c-JAMO_T_BASE));
|
||||
if(dx_contains(dx, c)) {
|
||||
return FALSE;
|
||||
}
|
||||
if(dest!=0) {
|
||||
*dest=(UChar)(prev+(c-JAMO_T_BASE));
|
||||
*dest=c;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
@ -2408,7 +2734,7 @@ _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UC
|
|||
static int32_t
|
||||
_compose(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UBool compat, UBool /* ### TODO: need to do this? -- ignoreHangul -- ### */,
|
||||
UBool compat, const UnicodeSet *dx,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar stackBuffer[_STACK_BUFFER_CAPACITY];
|
||||
UChar *buffer;
|
||||
|
@ -2553,11 +2879,12 @@ _compose(UChar *dest, int32_t destCapacity,
|
|||
prevCC=cc=0;
|
||||
reorderStartIndex=destIndex;
|
||||
|
||||
if( /* ### TODO: do we need to do this? !ignoreHangul && ### */
|
||||
if(
|
||||
destIndex>0 &&
|
||||
_composeHangul(
|
||||
*(prevSrc-1), c, norm32, src, limit, compat,
|
||||
destIndex<=destCapacity ? dest+(destIndex-1) : 0)
|
||||
destIndex<=destCapacity ? dest+(destIndex-1) : 0,
|
||||
dx)
|
||||
) {
|
||||
prevStarter=src;
|
||||
continue;
|
||||
|
@ -2586,7 +2913,7 @@ _compose(UChar *dest, int32_t destCapacity,
|
|||
}
|
||||
|
||||
/* we are looking at the character (c, c2) at [prevSrc..src[ */
|
||||
if((norm32&qcMask)==0) {
|
||||
if((norm32&qcMask)==0 || dx_contains(dx, c, c2)) {
|
||||
cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
|
||||
} else {
|
||||
const UChar *p;
|
||||
|
@ -2628,6 +2955,7 @@ _compose(UChar *dest, int32_t destCapacity,
|
|||
prevStarter, src,
|
||||
qcMask,
|
||||
prevCC, /* output */
|
||||
dx,
|
||||
pErrorCode);
|
||||
|
||||
if(p==NULL) {
|
||||
|
@ -2688,29 +3016,28 @@ _compose(UChar *dest, int32_t destCapacity,
|
|||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_compose(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UBool compat, int32_t options,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UnicodeSet *dx;
|
||||
int32_t destIndex;
|
||||
|
||||
if(!_haveData(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
dx=getDX(options, *pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
destIndex=_compose(dest, destCapacity,
|
||||
src, srcLength,
|
||||
compat, ignoreHangul,
|
||||
compat, dx,
|
||||
pErrorCode);
|
||||
|
||||
return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
|
||||
}
|
||||
|
||||
/*
|
||||
### TODO
|
||||
task items:
|
||||
- 2.0 Java sample code from unicode.org compare vs. JNI around C implementation - do monkey test
|
||||
- 2.1 port that sample code to C/C++ and run as part of regular test suite
|
||||
*/
|
||||
|
||||
/* normalize() API ---------------------------------------------------------- */
|
||||
|
||||
/**
|
||||
|
@ -2721,32 +3048,39 @@ unorm_compose(UChar *dest, int32_t destCapacity,
|
|||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_internalNormalize(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UNormalizationMode mode, UBool ignoreHangul,
|
||||
UNormalizationMode mode, int32_t options,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UnicodeSet *dx;
|
||||
|
||||
switch(mode) {
|
||||
case UNORM_NFD:
|
||||
return unorm_decompose(dest, destCapacity,
|
||||
src, srcLength,
|
||||
FALSE, ignoreHangul,
|
||||
FALSE, options,
|
||||
pErrorCode);
|
||||
case UNORM_NFKD:
|
||||
return unorm_decompose(dest, destCapacity,
|
||||
src, srcLength,
|
||||
TRUE, ignoreHangul,
|
||||
TRUE, options,
|
||||
pErrorCode);
|
||||
case UNORM_NFC:
|
||||
return unorm_compose(dest, destCapacity,
|
||||
src, srcLength,
|
||||
FALSE, ignoreHangul,
|
||||
FALSE, options,
|
||||
pErrorCode);
|
||||
case UNORM_NFKC:
|
||||
return unorm_compose(dest, destCapacity,
|
||||
src, srcLength,
|
||||
TRUE, ignoreHangul,
|
||||
TRUE, options,
|
||||
pErrorCode);
|
||||
case UNORM_FCD:
|
||||
dx=getDX(options, *pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
return unorm_makeFCD(dest, destCapacity,
|
||||
src, srcLength,
|
||||
dx,
|
||||
pErrorCode);
|
||||
case UNORM_NONE:
|
||||
/* just copy the string */
|
||||
|
@ -2766,7 +3100,7 @@ unorm_internalNormalize(UChar *dest, int32_t destCapacity,
|
|||
/** Public API for normalizing. */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_normalize(const UChar *src, int32_t srcLength,
|
||||
UNormalizationMode mode, int32_t option,
|
||||
UNormalizationMode mode, int32_t options,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
UErrorCode *pErrorCode) {
|
||||
/* check argument values */
|
||||
|
@ -2792,7 +3126,7 @@ unorm_normalize(const UChar *src, int32_t srcLength,
|
|||
|
||||
return unorm_internalNormalize(dest, destCapacity,
|
||||
src, srcLength,
|
||||
mode, (UBool)((option&(UNORM_IGNORE_HANGUL|1))!=0),
|
||||
mode, options,
|
||||
pErrorCode);
|
||||
}
|
||||
|
||||
|
@ -3026,7 +3360,7 @@ unorm_previous(UCharIterator *src,
|
|||
if(doNormalize) {
|
||||
destLength=unorm_internalNormalize(dest, destCapacity,
|
||||
buffer+startIndex, bufferLength,
|
||||
mode, (UBool)((options&(UNORM_IGNORE_HANGUL|1))!=0),
|
||||
mode, options,
|
||||
pErrorCode);
|
||||
if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
|
||||
*pNeededToNormalize=
|
||||
|
@ -3272,7 +3606,7 @@ unorm_next(UCharIterator *src,
|
|||
if(doNormalize) {
|
||||
destLength=unorm_internalNormalize(dest, destCapacity,
|
||||
buffer, bufferLength,
|
||||
mode, (UBool)((options&(UNORM_IGNORE_HANGUL|1))!=0),
|
||||
mode, options,
|
||||
pErrorCode);
|
||||
if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
|
||||
*pNeededToNormalize=
|
||||
|
@ -3422,12 +3756,12 @@ unorm_concatenate(const UChar *left, int32_t leftLength,
|
|||
if(destCapacity>destLength) {
|
||||
destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
|
||||
buffer, bufferLength,
|
||||
mode, (UBool)((options&(UNORM_IGNORE_HANGUL|1))!=0),
|
||||
mode, options,
|
||||
pErrorCode);
|
||||
} else {
|
||||
destLength+=unorm_internalNormalize(NULL, 0,
|
||||
buffer, bufferLength,
|
||||
mode, (UBool)((options&(UNORM_IGNORE_HANGUL|1))!=0),
|
||||
mode, options,
|
||||
pErrorCode);
|
||||
}
|
||||
/*
|
||||
|
@ -3906,6 +4240,7 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
UErrorCode *pErrorCode) {
|
||||
UChar fold1[300], fold2[300], fcd1[300], fcd2[300];
|
||||
UChar *f1, *f2, *d1, *d2;
|
||||
const UnicodeSet *dx;
|
||||
int32_t result;
|
||||
|
||||
/* argument checking */
|
||||
|
@ -3924,17 +4259,22 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
return 0;
|
||||
}
|
||||
|
||||
dx=getDX((int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT), *pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
f1=f2=d1=d2=0;
|
||||
options|=_COMPARE_EQUIV;
|
||||
result=0;
|
||||
|
||||
|
||||
if(!(options&UNORM_INPUT_IS_FCD)) {
|
||||
int32_t _len1, _len2;
|
||||
UBool isFCD1, isFCD2;
|
||||
|
||||
// check if s1 and/or s2 fulfill the FCD conditions
|
||||
isFCD1=unorm_checkFCD(s1, length1);
|
||||
isFCD2=unorm_checkFCD(s2, length2);
|
||||
isFCD1=unorm_checkFCD(s1, length1, dx);
|
||||
isFCD2=unorm_checkFCD(s2, length2, dx);
|
||||
|
||||
if((options&U_COMPARE_IGNORE_CASE)!=0 && !(isFCD1 && isFCD2)) {
|
||||
// case-fold first to keep the order of operations as in UAX 21 2.5
|
||||
|
@ -3992,8 +4332,8 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
|
||||
// turn off U_COMPARE_IGNORE_CASE and re-check FCD
|
||||
options&=~U_COMPARE_IGNORE_CASE;
|
||||
isFCD1=unorm_checkFCD(s1, length1);
|
||||
isFCD2=unorm_checkFCD(s2, length2);
|
||||
isFCD1=unorm_checkFCD(s1, length1, dx);
|
||||
isFCD2=unorm_checkFCD(s2, length2, dx);
|
||||
}
|
||||
|
||||
if(!isFCD1 && !isFCD2) {
|
||||
|
@ -4005,7 +4345,7 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
|
||||
_len1=_decompose(fcd1, sizeof(fcd1)/U_SIZEOF_UCHAR,
|
||||
s1, length1,
|
||||
FALSE, FALSE,
|
||||
FALSE, dx,
|
||||
trailCC);
|
||||
if(_len1<=(int32_t)(sizeof(fcd1)/U_SIZEOF_UCHAR)) {
|
||||
s1=fcd1;
|
||||
|
@ -4018,7 +4358,7 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
|
||||
_len1=_decompose(d1, _len1,
|
||||
s1, length1,
|
||||
FALSE, FALSE,
|
||||
FALSE, dx,
|
||||
trailCC);
|
||||
|
||||
s1=d1;
|
||||
|
@ -4027,7 +4367,7 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
|
||||
_len2=_decompose(fcd2, sizeof(fcd2)/U_SIZEOF_UCHAR,
|
||||
s2, length2,
|
||||
FALSE, FALSE,
|
||||
FALSE, dx,
|
||||
trailCC);
|
||||
if(_len2<=(int32_t)(sizeof(fcd2)/U_SIZEOF_UCHAR)) {
|
||||
s2=fcd2;
|
||||
|
@ -4040,7 +4380,7 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
|
||||
_len2=_decompose(d2, _len2,
|
||||
s2, length2,
|
||||
FALSE, FALSE,
|
||||
FALSE, dx,
|
||||
trailCC);
|
||||
|
||||
s2=d2;
|
||||
|
@ -4055,6 +4395,7 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
if(!isFCD1) {
|
||||
_len1=unorm_makeFCD(fcd1, sizeof(fcd1)/U_SIZEOF_UCHAR,
|
||||
s1, length1,
|
||||
dx,
|
||||
pErrorCode);
|
||||
if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
|
||||
s1=fcd1;
|
||||
|
@ -4068,6 +4409,7 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
*pErrorCode=U_ZERO_ERROR;
|
||||
_len1=unorm_makeFCD(d1, _len1,
|
||||
s1, length1,
|
||||
dx,
|
||||
pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
goto cleanup;
|
||||
|
@ -4081,6 +4423,7 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
if(!isFCD2) {
|
||||
_len2=unorm_makeFCD(fcd2, sizeof(fcd2)/U_SIZEOF_UCHAR,
|
||||
s2, length2,
|
||||
dx,
|
||||
pErrorCode);
|
||||
if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
|
||||
s2=fcd2;
|
||||
|
@ -4094,6 +4437,7 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
*pErrorCode=U_ZERO_ERROR;
|
||||
_len2=unorm_makeFCD(d2, _len2,
|
||||
s2, length2,
|
||||
dx,
|
||||
pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
goto cleanup;
|
||||
|
|
|
@ -147,6 +147,44 @@ enum {
|
|||
_NORM_DECOMP_LENGTH_MASK=0x7f
|
||||
};
|
||||
|
||||
/* Option flag constants for tailored normalization; each UNORM_DX_* bit names one decomposition exclusion. ### TODO prototype, see unorm.cpp */
|
||||
enum {
|
||||
/** Options bit 0 (value 1), do not decompose Hangul syllables. @draft ICU 2.6 */
|
||||
UNORM_DX_HANGUL=1,
|
||||
/** Options bit 1 (value 2), do not decompose CJK compatibility characters. @draft ICU 2.6 */
|
||||
UNORM_DX_CJK_COMPAT=2,
|
||||
/** Options bit 2 (value 4), do not decompose a-umlaut, only for testing. @internal */
|
||||
UNORM_DX_A_UMLAUT=4,
|
||||
/** This many of the least significant options bits are used to specify decomposition exclusions. NOTE(review): 4 bits are reserved but only bits 0..2 are assigned above; bit 3 looks unused -- confirm. @draft ICU 2.6 */
|
||||
UNORM_DX_COUNT=4,
|
||||
/** Options bit mask for decomposition exclusions: the low UNORM_DX_COUNT bits, i.e. 0xf. @draft ICU 2.6 */
|
||||
UNORM_DX_MASK=(1<<UNORM_DX_COUNT)-1
|
||||
};
|
||||
|
||||
/**
|
||||
* Lowest-order bit number of unorm_compare() options bits corresponding to
|
||||
* normalization options bits.
|
||||
*
|
||||
* The options parameter for unorm_compare() uses most bits for
|
||||
* itself and for various comparison and folding flags.
|
||||
* The most significant bits, however, are shifted down and passed on
|
||||
* to the normalization implementation.
|
||||
* The normalization options are extracted as (options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT).
|
||||
*
|
||||
* ### TODO prototype, see unorm.cpp
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
#define UNORM_COMPARE_NORM_OPTIONS_SHIFT 20
|
||||
|
||||
/**
|
||||
* Quick-check variant that takes a normalization mode plus an options word and returns a UNormalizationCheckResult. NOTE(review): options presumably carries the UNORM_DX_* exclusion bits -- confirm in unorm.cpp. ### TODO prototype, see unorm.cpp
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
U_CAPI UNormalizationCheckResult U_EXPORT2
|
||||
unorm_quickCheckTailored(const UChar *src, int32_t srcLength,
|
||||
UNormalizationMode mode, int32_t options,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Is the normalizer data loaded?
|
||||
* This is used internally before other internal normalizer functions
|
||||
|
@ -170,7 +208,7 @@ unorm_haveData(UErrorCode *pErrorCode);
|
|||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_internalNormalize(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UNormalizationMode mode, UBool ignoreHangul,
|
||||
UNormalizationMode mode, int32_t options,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
|
@ -180,7 +218,7 @@ unorm_internalNormalize(UChar *dest, int32_t destCapacity,
|
|||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_decompose(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UBool compat, int32_t options,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
|
@ -190,7 +228,7 @@ unorm_decompose(UChar *dest, int32_t destCapacity,
|
|||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_compose(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
UBool compat, int32_t options,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
|
|
Loading…
Add table
Reference in a new issue