ICU-1007 replace old normalization implementation by new one, use unorm_quickCheck(UNORM_FCD) instead of the temporary checkFCD()

X-SVN-Rev: 5489
This commit is contained in:
Markus Scherer 2001-08-17 00:21:18 +00:00
parent 44283cc553
commit 6eb5998fc1
5 changed files with 258 additions and 1032 deletions
icu4c/source

View file

@ -151,59 +151,33 @@ Normalizer::normalize(const UnicodeString& source,
EMode mode,
int32_t options,
UnicodeString& result,
UErrorCode &status)
{
if (quickCheck(source, mode, status) == UNORM_YES)
{
result = source;
return;
}
/* ### TODO: begin new implementation */
if(unorm_usesNewImplementation()) {
if(source.isBogus()) {
UErrorCode &status) {
if(source.isBogus()) {
result.setToBogus();
} else {
/* make sure that we do not operate on the same buffer in source and result */
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
result.fLength=unorm_internalNormalize(result.fArray, result.fCapacity,
source.fArray, source.fLength,
getUNormalizationMode(mode, status), (options&IGNORE_HANGUL)!=0,
UnicodeString::growBuffer, &result,
&status);
if(U_FAILURE(status)) {
result.setToBogus();
} else {
/* make sure that we do not operate on the same buffer in source and result */
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
result.fLength=unorm_internalNormalize(result.fArray, result.fCapacity,
source.fArray, source.fLength,
getUNormalizationMode(mode, status), (options&IGNORE_HANGUL)!=0,
UnicodeString::growBuffer, &result,
&status);
if(U_FAILURE(status)) {
result.setToBogus();
}
}
return;
}
/* ### end new implementation */
switch (mode) {
case NO_OP:
result = source;
break;
case COMPOSE:
case COMPOSE_COMPAT:
compose(source, (mode & COMPAT_BIT) != 0, options, result, status);
break;
case DECOMP:
case DECOMP_COMPAT:
decompose(source, (mode & COMPAT_BIT) != 0, options, result, status);
break;
}
}
UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
Normalizer::EMode mode,
UErrorCode &status)
{
if (U_FAILURE(status))
return UNORM_MAYBE;
UErrorCode &status) {
if(U_FAILURE(status)) {
return UNORM_MAYBE;
}
return unorm_quickCheck(source.fArray, source.length(),
getUNormalizationMode(mode, status), &status);
return unorm_quickCheck(source.fArray, source.length(),
getUNormalizationMode(mode, status), &status);
}
//-------------------------------------------------------------------------
@ -239,165 +213,19 @@ Normalizer::compose(const UnicodeString& source,
UBool compat,
int32_t options,
UnicodeString& result,
UErrorCode &status)
{
/* ### TODO: begin new implementation */
if(unorm_usesNewImplementation()) {
if(source.isBogus()) {
UErrorCode &status) {
if(source.isBogus()) {
result.setToBogus();
} else {
/* make sure that we do not operate on the same buffer in source and result */
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
result.fLength=unorm_compose(result.fArray, result.fCapacity,
source.fArray, source.fLength,
compat, (options&IGNORE_HANGUL)!=0,
UnicodeString::growBuffer, &result,
&status);
if(U_FAILURE(status)) {
result.setToBogus();
} else {
/* make sure that we do not operate on the same buffer in source and result */
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
result.fLength=unorm_compose(result.fArray, result.fCapacity,
source.fArray, source.fLength,
compat, (options&IGNORE_HANGUL)!=0,
UnicodeString::growBuffer, &result,
&status);
if(U_FAILURE(status)) {
result.setToBogus();
}
}
return;
}
/* ### end new implementation */
if (U_FAILURE(status)) {
return;
}
result.truncate(0);
UnicodeString explodeBuf;
UTextOffset explodePos = EMPTY; // Position in input buffer
UTextOffset basePos = 0; // Position of last base in output string
uint16_t baseIndex = 0; // Index of last base in "actions" array
uint32_t classesSeen[2]; // Combining classes seen since last base
uint16_t action;
// Compatibility explosions have lower indices; skip them if necessary
uint16_t minExplode = (uint16_t)(compat ? 0 : ComposeData::MAX_COMPAT);
uint16_t minDecompLocal = (uint16_t)(compat ? 0 : DecompData::MAX_COMPAT);
UTextOffset i = 0;
emptyBitmask64(classesSeen);
while (i < source.length() || explodePos != EMPTY) {
// Get the next char from either the buffer or the source
UChar ch;
if (explodePos == EMPTY) {
ch = source[i++];
} else {
ch = explodeBuf[explodePos++];
if (explodePos >= explodeBuf.length()) {
explodePos = EMPTY;
explodeBuf.truncate(0);
}
}
// Get the basic info for the character
uint16_t charInfo = composeLookup(ch);
uint16_t type = (uint16_t)(charInfo & ComposeData::TYPE_MASK);
uint16_t index = (uint16_t)(charInfo >> ComposeData::INDEX_SHIFT);
if (type == ComposeData::BASE ||
(type == ComposeData::NON_COMPOSING_COMBINING && index < minExplode)) {
emptyBitmask64(classesSeen);
baseIndex = index;
basePos = result.length();
result += ch;
}
else if (type == ComposeData::COMBINING)
{
uint32_t cclass = ComposeData::typeBit[index]; // 0..63
// We can only combine a character with the base if we haven't
// already seen a combining character with the same canonical class.
// We also only combine characters with an index from
// 1..COMBINING_COUNT-1. Indices >= COMBINING_COUNT are
// non-combining; these formerly had an index of zero.
if (index < ComposeData::COMBINING_COUNT
&& !isSetBitmask64(classesSeen, cclass)
&& (action = composeAction(baseIndex, index)) > 0)
{
if (action > ComposeData::MAX_COMPOSED) {
// Pairwise explosion. Actions above this value are really
// indices into an array that in turn contains indices
// into the exploding string table
// TODO: What if there are unprocessed chars in the explode buffer?
UChar newBase = pairExplode(explodeBuf, action);
explodePos = 0;
result[basePos] = newBase;
baseIndex = (uint16_t)(composeLookup(newBase) >> ComposeData::INDEX_SHIFT);
} else {
// Normal pairwise combination. Replace the base char
UChar newBase = (UChar) action;
result[basePos] = newBase;
baseIndex = (uint16_t)(composeLookup(newBase) >> ComposeData::INDEX_SHIFT);
}
//
// Since there are Unicode characters that cannot be combined in arbitrary
// order, we have to re-process any combining marks that go with this
// base character. There are only four characters in Unicode that have
// this problem. If they are fixed in Unicode 3.0, this code can go away.
//
UTextOffset len = result.length();
if (len - basePos > 1) {
for (UTextOffset j = basePos+1; j < len; j++) {
explodeBuf += result[j];
}
result.truncate(basePos+1);
emptyBitmask64(classesSeen);
if (explodePos == EMPTY) explodePos = 0;
}
} else {
// No combination with this character
bubbleAppend(result, ch, cclass);
setBitmask64(classesSeen, cclass);
}
}
else if (index > minExplode) {
// Single exploding character
explode(explodeBuf, index);
explodePos = 0;
}
else if (type == ComposeData::HANGUL && minExplode == 0) {
// If we're in compatibility mode we need to decompose Hangul to Jamo,
// because some of the Jamo might have compatibility decompositions.
hangulToJamo(ch, explodeBuf, minDecompLocal);
explodePos = 0;
}
else if (type == ComposeData::INITIAL_JAMO) {
emptyBitmask64(classesSeen);
baseIndex = ComposeData::INITIAL_JAMO_INDEX;
basePos = result.length();
result += ch;
}
else if (type == ComposeData::MEDIAL_JAMO
&& isEmptyBitmask64(classesSeen)
&& baseIndex == ComposeData::INITIAL_JAMO_INDEX) {
// If the last character was an initial jamo, we can combine it with this
// one to create a Hangul character.
uint16_t l = (uint16_t)(result[basePos] - (UChar)JAMO_LBASE);
uint16_t v = (uint16_t)(ch - JAMO_VBASE);
result[basePos] = (UChar)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);
baseIndex = ComposeData::MEDIAL_JAMO_INDEX;
}
else if (type == ComposeData::FINAL_JAMO
&& isEmptyBitmask64(classesSeen)
&& baseIndex == ComposeData::MEDIAL_JAMO_INDEX) {
// If the last character was a medial jamo that we turned into Hangul,
// we can add this character too.
result[basePos] = (UChar)(result[basePos] + (ch - JAMO_TBASE));
baseIndex = 0;
basePos = -1;
emptyBitmask64(classesSeen);
} else {
baseIndex = 0;
basePos = -1;
emptyBitmask64(classesSeen);
result += ch;
}
}
}
@ -707,68 +535,21 @@ Normalizer::decompose(const UnicodeString& source,
UBool compat,
int32_t options,
UnicodeString& result,
UErrorCode &status)
{
/* ### TODO: begin new implementation */
if(unorm_usesNewImplementation()) {
if(source.isBogus()) {
UErrorCode &status) {
if(source.isBogus()) {
result.setToBogus();
} else {
/* make sure that we do not operate on the same buffer in source and result */
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
result.fLength=unorm_decompose(result.fArray, result.fCapacity,
source.fArray, source.fLength,
compat, (options&IGNORE_HANGUL)!=0,
UnicodeString::growBuffer, &result,
&status);
if(U_FAILURE(status)) {
result.setToBogus();
} else {
/* make sure that we do not operate on the same buffer in source and result */
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
result.fLength=unorm_decompose(result.fArray, result.fCapacity,
source.fArray, source.fLength,
compat, (options&IGNORE_HANGUL)!=0,
UnicodeString::growBuffer, &result,
&status);
if(U_FAILURE(status)) {
result.setToBogus();
}
}
return;
}
/* ### end new implementation */
if (U_FAILURE(status)) {
return;
}
UBool hangul = (options & IGNORE_HANGUL) == 0;
uint16_t minDecompLocal = (uint16_t)(compat ? 0 : DecompData::MAX_COMPAT);
UnicodeString buffer;
int32_t i = 0, bufPtr = -1;
result.truncate(0);
// Rewritten - Liu
while (i < source.length() || bufPtr >= 0) {
UChar ch;
if (bufPtr >= 0) {
ch = buffer.charAt(bufPtr++);
if (bufPtr == buffer.length()) {
bufPtr = -1;
}
} else {
ch = source[i++];
}
uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
uint16_t index = (uint16_t)(offset & DecompData::DECOMP_MASK);
if (index > minDecompLocal) {
if ((offset & DecompData::DECOMP_RECURSE) != 0) {
buffer.truncate(0);
doAppend((const UChar*)DecompData::contents, index, buffer);
bufPtr = 0;
} else {
doAppend((const UChar*)DecompData::contents, index, result);
}
} else if (ch >= HANGUL_BASE && ch < HANGUL_LIMIT && hangul) {
hangulToJamo(ch, result, minDecompLocal);
} else {
result += ch;
}
}
fixCanonical(result);
}
/**

View file

@ -16,6 +16,10 @@
* 02/23/01 synwee Modified quickcheck and checkFCE to run through
* string for codepoints < 0x300 for the normalization
* mode NFC.
* 06/20/01+ Markus Scherer total rewrite, implement all normalization here
* instead of just wrappers around normlzr.cpp,
* load unorm.dat, support Unicode 3.1 with
* supplementary code points, etc.
*/
#include "unicode/utypes.h"
@ -28,24 +32,7 @@
#include "umutex.h"
#include "unormimp.h"
/* added by synwee ### TODO: remove once the new implementation is finished */
#include "unicode/uchar.h"
#include "unicode/utf16.h"
/* ### TODO: remove this once the new implementation is finished */
static UBool useNewImplementation=FALSE;
/*
 * Temporary switch: records whether callers in this file should route to the
 * new normalization implementation (TRUE) or the old one (FALSE).
 * The value is stored in the file-level flag useNewImplementation.
 */
U_CAPI void U_EXPORT2
unorm_setNewImplementation(UBool useNew) {
    useNewImplementation = useNew;
}
/*
 * Temporary switch: reports the current value of the file-level flag
 * useNewImplementation, as last stored by unorm_setNewImplementation().
 */
U_CAPI UBool U_EXPORT2
unorm_usesNewImplementation() {
    return useNewImplementation;
}
/* new implementation ------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* Korean Hangul and Jamo constants */
enum {
@ -181,6 +168,15 @@ unorm_haveData(UErrorCode *pErrorCode) {
return _haveData(*pErrorCode);
}
/*
 * Returns fcdTrieIndex when _haveData() reports that the normalization data
 * is available for the given error code, otherwise NULL.
 */
U_CAPI const uint16_t * U_EXPORT2
unorm_getFCDTrie(UErrorCode *pErrorCode) {
    return _haveData(*pErrorCode) ? fcdTrieIndex : NULL;
}
/* data access primitives --------------------------------------------------- */
inline uint32_t
@ -625,8 +621,8 @@ unorm_checkFCD(const UChar *src, int32_t srcLength) {
}
}
static UNormalizationCheckResult
_unorm_quickCheck(const UChar *src,
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar *src,
int32_t srcLength,
UNormalizationMode mode,
UErrorCode *pErrorCode) {
@ -751,7 +747,7 @@ U_CFUNC int32_t
unorm_decompose(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBool compat, UBool ignoreHangul,
GrowBuffer *growBuffer, void *context,
UGrowBuffer *growBuffer, void *context,
UErrorCode *pErrorCode) {
UChar buffer[3];
const UChar *limit, *prevSrc, *p;
@ -1046,7 +1042,7 @@ _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
static uint8_t
_decomposeFCD(const UChar *src, const UChar *decompLimit, const UChar *limit,
UChar *dest, int32_t &destIndex, int32_t &destCapacity,
UBool canGrow, GrowBuffer *growBuffer, void *context) {
UBool canGrow, UGrowBuffer *growBuffer, void *context) {
UChar *reorderStart;
const UChar *p;
uint32_t norm32;
@ -1167,7 +1163,7 @@ _decomposeFCD(const UChar *src, const UChar *decompLimit, const UChar *limit,
static int32_t
unorm_makeFCD(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
GrowBuffer *growBuffer, void *context,
UGrowBuffer *growBuffer, void *context,
UErrorCode *pErrorCode) {
const UChar *limit, *prevSrc, *decompStart;
int32_t destIndex, length;
@ -1989,7 +1985,7 @@ U_CFUNC int32_t
unorm_compose(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBool compat, UBool /* ### TODO: need to do this? -- ignoreHangul -- ### */,
GrowBuffer *growBuffer, void *context,
UGrowBuffer *growBuffer, void *context,
UErrorCode *pErrorCode) {
UChar stackBuffer[_STACK_BUFFER_CAPACITY];
UChar *buffer;
@ -2271,7 +2267,7 @@ U_CFUNC int32_t
unorm_internalNormalize(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UNormalizationMode mode, UBool ignoreHangul,
GrowBuffer *growBuffer, void *context,
UGrowBuffer *growBuffer, void *context,
UErrorCode *pErrorCode) {
switch(mode) {
case UNORM_NFD:
@ -2329,638 +2325,36 @@ unorm_internalNormalize(UChar *dest, int32_t destCapacity,
}
}
/* old implementation ------------------------------------------------------- */
/* added by synwee for trie manipulation*/
#define STAGE_1_SHIFT_ 10
#define STAGE_2_SHIFT_ 4
#define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
#define STAGE_3_MASK_ 0xF
#define LAST_BYTE_MASK_ 0xFF
#define SECOND_LAST_BYTE_SHIFT_ 8
/* added by synwee for fast route in quickcheck and fcd */
#define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300
/*
* for a description of the file format,
* see icu/source/tools/genqchk/genqchk.c
*/
#define QCHK_DATA_NAME "qchk"
#define FCHK_DATA_NAME "fchk"
#define DATA_TYPE "dat"
static UDataMemory *quickcheckData = NULL;
static UDataMemory *fcdcheckData = NULL;
/**
* Authentication values
*/
static const uint8_t QCHK_DATA_FORMAT_[] = {0x71, 0x63, 0x68, 0x6b};
static const uint8_t FCHK_DATA_FORMAT_[] = {0x66, 0x63, 0x68, 0x6b};
static const uint8_t QCHK_FORMAT_VERSION_[] = {1, 0, 0, 0};
static const uint8_t FCHK_FORMAT_VERSION_[] = {1, 0, 0, 0};
/**
* index values loaded from qchk.dat.
* static uint16_t indexes[8];
*/
enum {
QCHK_INDEX_STAGE_2_BITS,
QCHK_INDEX_STAGE_3_BITS,
QCHK_INDEX_MIN_VALUES_SIZE,
QCHK_INDEX_STAGE_1_INDEX,
QCHK_INDEX_STAGE_2_INDEX,
QCHK_INDEX_STAGE_3_INDEX
};
/**
* index values loaded from fchk.dat.
* static uint16_t indexes[8];
*/
enum {
FCHK_INDEX_STAGE_2_BITS,
FCHK_INDEX_STAGE_3_BITS,
FCHK_INDEX_STAGE_1_INDEX,
FCHK_INDEX_STAGE_2_INDEX,
FCHK_INDEX_STAGE_3_INDEX
};
/**
* Array of mask for determining normalization quick check values.
* Indexes follows the values in UNormalizationMode
*/
static const uint8_t QCHK_MASK_[] = {0, 0, 0x11, 0x22, 0x44, 0x88};
/**
* Array of minimum codepoints that has UNORM_MAYBE or UNORM_NO quick check
* values. Indexes follows the values in UNormalizationMode.
* Generated values! Edit at your own risk.
*/
static const UChar32 *QCHK_MIN_VALUES_;
/**
* Flag to indicate if data has been loaded
*/
static UBool isQuickCheckLoaded = FALSE;
static UBool isFCDCheckLoaded = FALSE;
/**
* Minimum value to determine if quickcheck value contains a MAYBE
*/
static const uint8_t MIN_UNORM_MAYBE_ = 0x10;
/**
* Array of normalization form corresponding to the index code point.
* Hence codepoint 0xABCD will have normalization form QUICK_CHECK_DATA[0xABCD].
* UQUICK_CHECK_DATA[0xABCD] is a byte containing 2 sets of 4 bits information
* representing UNORM_MAYBE and UNORM_YES.<br>
* bits 1 2 3 4 5678<br>
* NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES<br>
* ie if UQUICK_CHECK_DATA[0xABCD] = 10000001, this means that 0xABCD is in
* NFD form and maybe in NFKC form
*/
static const uint16_t *QCHK_STAGE_1_;
static const uint16_t *QCHK_STAGE_2_;
static const uint8_t *QCHK_STAGE_3_;
/**
* Trie data for FCD.
* Each index corresponds to each code point.
* Trie value is the combining class of the first and the last character of the
* NFD of the codepoint.
* size uint16_t for the first 2 stages instead of uint32_t to reduce size.
*/
static const uint16_t *FCHK_STAGE_1_;
static const uint16_t *FCHK_STAGE_2_;
static const uint16_t *FCHK_STAGE_3_;
/** Public API for normalizing. */
U_CAPI int32_t
unorm_normalize(const UChar* src,
int32_t srcLength,
UNormalizationMode mode,
int32_t option,
UChar* dest,
int32_t destCapacity,
UErrorCode* pErrorCode)
{
if(useNewImplementation) {
/* check argument values */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return 0;
}
if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
src==NULL || srcLength<-1
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
/* check for overlapping src and destination */
/* ### TODO: real API may provide a temp buffer */
if( (src>=dest && src<(dest+destCapacity)) ||
(srcLength>0 && dest>=src && dest<(src+srcLength))
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
return unorm_internalNormalize(dest, destCapacity,
src, srcLength,
mode, (UBool)((option&UNORM_IGNORE_HANGUL)!=0),
NULL, NULL,
pErrorCode);
unorm_normalize(const UChar *src, int32_t srcLength,
UNormalizationMode mode, int32_t option,
UChar *dest, int32_t destCapacity,
UErrorCode *pErrorCode) {
/* check argument values */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return 0;
}
if(U_FAILURE(*pErrorCode)) return -1;
if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
src==NULL || srcLength<-1
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
/* synwee : removed hard coded conversion */
Normalizer::EMode normMode = Normalizer::getNormalizerEMode(mode, *pErrorCode);
if (U_FAILURE(*pErrorCode))
return -1;
/* check for overlapping src and destination */
/* ### TODO: real API may provide a temp buffer */
if( (src>=dest && src<(dest+destCapacity)) ||
(srcLength>0 && dest>=src && dest<(src+srcLength))
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
int32_t len = (srcLength == -1 ? u_strlen(src) : srcLength);
const UnicodeString source(srcLength == -1, src, len);
UnicodeString dst(dest, 0, destCapacity);
/* synwee : note quickcheck is added in C ++ normalize method */
if ((option & UNORM_IGNORE_HANGUL) != 0)
option = Normalizer::IGNORE_HANGUL;
Normalizer::normalize(source, normMode, option, dst, *pErrorCode);
return uprv_fillOutputString(dst, dest, destCapacity, pErrorCode);
}
/*
 * Acceptance callback handed to udata_openChoice() when opening the quick
 * check data. Accepts a data item only if its UDataInfo header
 *   - is at least 20 bytes,
 *   - matches this platform's endianness and charset family,
 *   - carries the QCHK_DATA_FORMAT_ signature, and
 *   - carries the QCHK_FORMAT_VERSION_ version bytes.
 * context, type and name are not consulted for the decision.
 */
static UBool U_CALLCONV
isQuickCheckAcceptable(void *context,
                       const char *type, const char *name,
                       const UDataInfo *pInfo) {
    /* platform properties of the data item must match this build */
    if (pInfo->size >= 20 &&
        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily == U_CHARSET_FAMILY) {
        /* signature and format version must match byte for byte */
        const UBool sameFormat =
            uprv_memcmp(pInfo->dataFormat, QCHK_DATA_FORMAT_,
                        sizeof(QCHK_DATA_FORMAT_)) == 0;
        if (sameFormat &&
            uprv_memcmp(pInfo->formatVersion, QCHK_FORMAT_VERSION_,
                        sizeof(QCHK_FORMAT_VERSION_)) == 0) {
            return TRUE;
        }
    }

    /* rejected; the assignments only touch the otherwise unused parameters */
    context = NULL;
    type = NULL;
    name = NULL;
    return FALSE;
}
/*
 * Makes sure the quick check data ("qchk" / "dat") has been opened and that the
 * file-level pointers QCHK_MIN_VALUES_, QCHK_STAGE_1_, QCHK_STAGE_2_ and
 * QCHK_STAGE_3_ point into it.
 *
 * Nothing is done if isQuickCheckLoaded is already TRUE or if *error already
 * indicates a failure on entry.
 *
 * @param error in/out error code; set by udata_openChoice() if opening fails
 * @return the value of isQuickCheckLoaded after the call
 */
static UBool
loadQuickCheckData(UErrorCode *error) {
/* load quickcheck data from file if necessary */
if (!isQuickCheckLoaded && U_SUCCESS(*error)) {
UDataMemory *data;
/* open the data outside the mutex block */
data = udata_openChoice(NULL, DATA_TYPE, QCHK_DATA_NAME,
isQuickCheckAcceptable, NULL, error);
if (U_FAILURE(*error)) {
return isQuickCheckLoaded = FALSE;
}
/* in the mutex block, set the data for this process */
umtx_lock(NULL);
if (quickcheckData == NULL) {
/* the item starts with the uint16_t index values (see QCHK_INDEX_*) */
const uint16_t *temp = (const uint16_t *)udata_getMemory(data);
const uint16_t *indexes = temp;
quickcheckData = data;
/* step over the first 8 uint16_t values; the minimum-value table starts here */
temp += 8;
QCHK_MIN_VALUES_ = (const UChar32 *)temp;
/* the stage offsets are counted in uint16_t units from this same position */
QCHK_STAGE_1_ = temp + indexes[QCHK_INDEX_STAGE_1_INDEX];
QCHK_STAGE_2_ = temp + indexes[QCHK_INDEX_STAGE_2_INDEX];
QCHK_STAGE_3_ = (const uint8_t *)(temp +
indexes[QCHK_INDEX_STAGE_3_INDEX]);
/* ownership moved to quickcheckData; do not close it below */
data = NULL;
}
umtx_unlock(NULL);
/* NOTE(review): this flag is written after the mutex has been released */
isQuickCheckLoaded = TRUE;
/* if a different thread set it first, then close the extra data */
if (data != NULL) {
udata_close(data); /* NULL if it was set correctly */
}
}
return isQuickCheckLoaded;
}
/**
 * Performs a quick check on a string to determine whether it is in a
 * particular normalization form.
 *
 * UNORM_YES means the string is in the requested form, UNORM_NO means it is
 * not, and UNORM_MAYBE means that a more thorough check is required: the
 * caller may have to normalize the string and compare the results.
 *
 * If the new implementation is switched on, the call is forwarded unchanged
 * to _unorm_quickCheck(). Otherwise the old table-driven check below is used.
 *
 * @param source string for determining if it is in a normalized format
 * @param sourcelength length of source to test; a negative value means that
 *                     source is NUL-terminated
 * @param mode normalization format from the enum UNormalizationMode
 * @param status A pointer to an UErrorCode to receive any errors;
 *               set to U_ILLEGAL_ARGUMENT_ERROR if mode is out of range
 * @return UNORM_YES, UNORM_NO or UNORM_MAYBE; UNORM_MAYBE is also returned
 *         when the data cannot be loaded, *status already indicates a
 *         failure, or mode is rejected
 */
U_CAPI UNormalizationCheckResult
unorm_quickCheck(const UChar *source,
                 int32_t sourcelength,
                 UNormalizationMode mode,
                 UErrorCode* status)
{
    uint8_t oldcombiningclass = 0;
    uint8_t combiningclass;
    uint8_t quickcheckvalue;
    uint8_t mask;
    UChar32 min;
    UChar32 codepoint;
    UNormalizationCheckResult result = UNORM_YES;
    const UChar *psource;
    const UChar *pend = 0;

    if(useNewImplementation) {
        return _unorm_quickCheck(source, sourcelength, mode, status);
    }

    if (!loadQuickCheckData(status) || U_FAILURE(*status)) {
        return UNORM_MAYBE;
    }

    /*
     * checking argument
     * This must happen before mode is used as an array index; the original
     * code read QCHK_MASK_[mode] and QCHK_MIN_VALUES_[mode] first.
     * The last condition additionally keeps mode inside QCHK_MASK_.
     */
    if (mode >= UNORM_MODE_COUNT || mode < UNORM_NONE ||
        (int32_t)mode >= (int32_t)(sizeof(QCHK_MASK_) / sizeof(QCHK_MASK_[0]))) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return UNORM_MAYBE;
    }

    mask = QCHK_MASK_[mode];
    min = QCHK_MIN_VALUES_[mode];

    /*
     * First pass, fast route: skip leading code units below min.
     * Code points < min have combining class 0 and quick check value YES;
     * looking at the minimum values, surrogates are not a problem.
     */
    if (sourcelength >= 0) {
        psource = source;
        pend = source + sourcelength;
        for (;;) {
            if (psource >= pend) {
                return UNORM_YES;
            }
            if (*psource >= min) {
                break;
            }
            psource ++;
        }
    }
    else {
        psource = source;
        for (;;) {
            if (*psource == 0) {
                return UNORM_YES;
            }
            if (*psource >= min) {
                break;
            }
            psource ++;
        }
    }

    /*
     * Second pass: examine each remaining code point.
     * The trie value is a byte containing 2 sets of 4 bits of information:
     *   high 4 bits: NFKC NFC NFKD NFD  "maybe" flags
     *   low  4 bits: NFKC NFC NFKD NFD  "yes" flags
     * e.g. the value 10000001 means that the code point is in NFD form
     * and maybe in NFKC form. mask selects the two bits for this mode.
     */
    if (sourcelength >= 0) {
        for (;;) {
            int count = 0;
            if (psource >= pend) {
                break;
            }
            UTF_NEXT_CHAR(psource, count, pend - psource, codepoint);
            combiningclass = u_getCombiningClass(codepoint);
            /* not in canonical order */
            if (oldcombiningclass > combiningclass && combiningclass != 0) {
                return UNORM_NO;
            }
            oldcombiningclass = combiningclass;

            /* trie access */
            quickcheckvalue = (uint8_t)(QCHK_STAGE_3_[
                QCHK_STAGE_2_[QCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] +
                ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] +
                (codepoint & STAGE_3_MASK_)] & mask);

            if (quickcheckvalue == 0) {
                return UNORM_NO;
            }
            if (quickcheckvalue >= MIN_UNORM_MAYBE_) {
                result = UNORM_MAYBE;
            }
            psource += count;
        }
    }
    else {
        for (;;) {
            int count = 0;
            /*
             * NOTE(review): pend is still 0 on this NUL-terminated path, so
             * the length argument (pend - psource) is not a real remaining
             * length - confirm how UTF_NEXT_CHAR treats surrogate pairs here.
             */
            UTF_NEXT_CHAR(psource, count, pend - psource, codepoint);
            if (codepoint == 0) {
                break;
            }
            combiningclass = u_getCombiningClass(codepoint);
            /* not in canonical order */
            if (oldcombiningclass > combiningclass && combiningclass != 0) {
                return UNORM_NO;
            }
            oldcombiningclass = combiningclass;

            /* trie access */
            quickcheckvalue = (uint8_t)(QCHK_STAGE_3_[
                QCHK_STAGE_2_[QCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] +
                ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] +
                (codepoint & STAGE_3_MASK_)] & mask);

            if (quickcheckvalue == 0) {
                return UNORM_NO;
            }
            if (quickcheckvalue >= MIN_UNORM_MAYBE_) {
                result = UNORM_MAYBE;
            }
            psource += count;
        }
    }
    return result;
}
/* private methods ---------------------------------------------------------- */
/*
 * Acceptance callback handed to udata_openChoice() when opening the FCD
 * check data. Accepts a data item only if its UDataInfo header
 *   - is at least 20 bytes,
 *   - matches this platform's endianness and charset family,
 *   - carries the FCHK_DATA_FORMAT_ signature, and
 *   - carries the FCHK_FORMAT_VERSION_ version bytes.
 * context, type and name are not consulted for the decision.
 */
static UBool U_CALLCONV
isFCDCheckAcceptable(void *context,
                     const char *type, const char *name,
                     const UDataInfo *pInfo) {
    /* platform properties of the data item must match this build */
    if (pInfo->size >= 20 &&
        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily == U_CHARSET_FAMILY) {
        /* signature and format version must match byte for byte */
        const UBool sameFormat =
            uprv_memcmp(pInfo->dataFormat, FCHK_DATA_FORMAT_,
                        sizeof(FCHK_DATA_FORMAT_)) == 0;
        if (sameFormat &&
            uprv_memcmp(pInfo->formatVersion, FCHK_FORMAT_VERSION_,
                        sizeof(FCHK_FORMAT_VERSION_)) == 0) {
            return TRUE;
        }
    }

    /* rejected; the assignments only touch the otherwise unused parameters */
    context = NULL;
    type = NULL;
    name = NULL;
    return FALSE;
}
/*
 * Makes sure the FCD check data ("fchk" / "dat") has been opened and that the
 * file-level pointers FCHK_STAGE_1_, FCHK_STAGE_2_ and FCHK_STAGE_3_ point
 * into it.
 *
 * Nothing is done if isFCDCheckLoaded is already TRUE or if *error already
 * indicates a failure on entry.
 *
 * @param error in/out error code; set by udata_openChoice() if opening fails
 * @return the value of isFCDCheckLoaded after the call
 */
static UBool
loadFCDCheckData(UErrorCode *error) {
/* load fcdcheck data from file if necessary */
if (!isFCDCheckLoaded && U_SUCCESS(*error)) {
UDataMemory *data;
/* open the data outside the mutex block */
data = udata_openChoice(NULL, DATA_TYPE, FCHK_DATA_NAME,
isFCDCheckAcceptable, NULL, error);
if (U_FAILURE(*error)) {
return isFCDCheckLoaded = FALSE;
}
/* in the mutex block, set the data for this process */
umtx_lock(NULL);
if (fcdcheckData == NULL) {
/* the item starts with the uint16_t index values (see FCHK_INDEX_*) */
const uint16_t *temp = (const uint16_t *)udata_getMemory(data);
const uint16_t *indexes = temp;
fcdcheckData = data;
/* step over the first 8 uint16_t values; stage offsets are relative to here */
temp += 8;
FCHK_STAGE_1_ = temp + indexes[FCHK_INDEX_STAGE_1_INDEX];
FCHK_STAGE_2_ = temp + indexes[FCHK_INDEX_STAGE_2_INDEX];
FCHK_STAGE_3_ = (const uint16_t *)(temp +
indexes[FCHK_INDEX_STAGE_3_INDEX]);
/* ownership moved to fcdcheckData; do not close it below */
data = NULL;
}
umtx_unlock(NULL);
/* NOTE(review): this flag is written after the mutex has been released */
isFCDCheckLoaded = TRUE;
/* if a different thread set it first, then close the extra data */
if (data != NULL) {
udata_close(data); /* NULL if it was set correctly */
}
}
return isFCDCheckLoaded;
}
/**
 * Returns the stage 1 table used by checkFCD, loading the FCD check data
 * first if that has not happened yet.
 * @param error status, passed through to loadFCDCheckData()
 * @return FCHK_STAGE_1_, or NULL if the data cannot be loaded
 */
U_CAPI const uint16_t * getFCHK_STAGE_1_(UErrorCode *error)
{
    return loadFCDCheckData(error) ? FCHK_STAGE_1_ : NULL;
}
/**
 * Returns the stage 2 table used by checkFCD, loading the FCD check data
 * first if that has not happened yet.
 * @param error status, passed through to loadFCDCheckData()
 * @return FCHK_STAGE_2_, or NULL if the data cannot be loaded
 */
U_CAPI const uint16_t * getFCHK_STAGE_2_(UErrorCode *error)
{
    return loadFCDCheckData(error) ? FCHK_STAGE_2_ : NULL;
}
/**
 * Returns the stage 3 table used by checkFCD, loading the FCD check data
 * first if that has not happened yet.
 * @param error status, passed through to loadFCDCheckData()
 * @return FCHK_STAGE_3_, or NULL if the data cannot be loaded
 */
U_CAPI const uint16_t * getFCHK_STAGE_3_(UErrorCode *error)
{
    return loadFCDCheckData(error) ? FCHK_STAGE_3_ : NULL;
}
/**
* Private method which performs a quick FCD check on a string, to quickly
* determine if a string is in a required FCD format.
* FCD is the set of strings such that for each character in the string,
* decomposition without any canonical reordering will produce a NFD.
* @param source string for determining if it is in a normalized format
* @param sourcelength length of source to test
* @paran mode normalization format from the enum UNormalizationMode
* @param status A pointer to an UErrorCode to receive any errors
* @return TRUE if source is in FCD format, FALSE otherwise
*/
U_CAPI UBool
checkFCD(const UChar* source, int32_t sourcelength, UErrorCode* status)
{
if(useNewImplementation) {
return UNORM_YES==unorm_quickCheck(source, sourcelength, UNORM_FCD, status);
}
UChar32 codepoint;
const UChar *psource;
const UChar *pend = 0;
uint8_t oldfcdtrail = 0;
uint16_t fcd = 0;
if (!loadFCDCheckData(status) || U_FAILURE(*status)) {
return FALSE;
}
if (sourcelength >= 0) {
psource = source;
pend = source + sourcelength;
for (;;) {
if (psource >= pend) {
return TRUE;
}
/* fast route : since codepoints < NFC_ZER_CC_BLOCK_LIMIT_ has
combining class 0.
looking at the minimum values, surrogates are not a problem */
if (*psource >= NFC_ZERO_CC_BLOCK_LIMIT_) {
break;
}
psource ++;
}
}
else {
psource = source;
for (;;) {
if (*psource == 0) {
return TRUE;
}
/* fast route : since codepoints < min has combining class 0 and YES
looking at the minimum values, surrogates are not a problem */
if (*psource >= NFC_ZERO_CC_BLOCK_LIMIT_) {
break;
}
psource ++;
}
}
/* not end of string and yet failed simple compare
safe to shift back one char because the previous char has to be < 0x300 or the
start of a string */
if (psource == source) {
oldfcdtrail = 0;
}
else {
codepoint = *(psource - 1);
oldfcdtrail = (uint8_t)(FCHK_STAGE_3_[
FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] +
((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)]
+ (codepoint & STAGE_3_MASK_)] & LAST_BYTE_MASK_);
}
if (sourcelength >= 0) {
for (;;) {
int count = 0;
uint8_t lead;
if (psource >= pend) {
return TRUE;
}
UTF_NEXT_CHAR(psource, count, pend - psource, codepoint);
/* trie access */
fcd = FCHK_STAGE_3_[
FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] +
((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] +
(codepoint & STAGE_3_MASK_)];
lead = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
if (lead != 0 && oldfcdtrail > lead) {
return FALSE;
}
oldfcdtrail = (uint8_t)(fcd & LAST_BYTE_MASK_);
psource += count;
}
}
else {
for (;;) {
int count = 0;
uint8_t lead;
UTF_NEXT_CHAR(psource, count, pend - psource, codepoint);
if (codepoint == 0) {
return TRUE;
}
/* trie access */
fcd = FCHK_STAGE_3_[
FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] +
((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] +
(codepoint & STAGE_3_MASK_)];
lead = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
if (lead != 0 && oldfcdtrail > lead) {
return FALSE;
}
oldfcdtrail = (uint8_t)(fcd & LAST_BYTE_MASK_);
psource += count;
}
}
return TRUE;
return unorm_internalNormalize(dest, destCapacity,
src, srcLength,
mode, (UBool)((option&UNORM_IGNORE_HANGUL)!=0),
NULL, NULL,
pErrorCode);
}

View file

@ -146,7 +146,7 @@ U_CFUNC int32_t
unorm_internalNormalize(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UNormalizationMode mode, UBool ignoreHangul,
GrowBuffer *growBuffer, void *context,
UGrowBuffer *growBuffer, void *context,
UErrorCode *pErrorCode);
/**
@ -157,7 +157,7 @@ U_CFUNC int32_t
unorm_decompose(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBool compat, UBool ignoreHangul,
GrowBuffer *growBuffer, void *context,
UGrowBuffer *growBuffer, void *context,
UErrorCode *pErrorCode);
/**
@ -168,21 +168,72 @@ U_CFUNC int32_t
unorm_compose(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBool compat, UBool ignoreHangul,
GrowBuffer *growBuffer, void *context,
UGrowBuffer *growBuffer, void *context,
UErrorCode *pErrorCode);
/**
* internal API, but used by tests
* internal API, used by collation code
* Get access to the internal FCD trie table to be able to perform
* incremental, per-code unit, FCD checks in collation.
* One pointer is sufficient because the trie index values are offset
* by the index size, so that the same pointer is used to access the trie data.
* @internal
*/
U_CAPI void U_EXPORT2
unorm_setNewImplementation(UBool useNew);
U_CAPI const uint16_t * U_EXPORT2
unorm_getFCDTrie(UErrorCode *pErrorCode);
#ifdef XP_CPLUSPLUS
/**
* internal API, but used by tests
* internal API, used by collation code
* Get the FCD value for a code unit, with
* bits 15..8 lead combining class
* bits 7..0 trail combining class
*
* If c is a lead surrogate and the value is not 0,
* then instead of combining classes the value
* is used in unorm_getFCD16FromSurrogatePair() to get the real value
* of the supplementary code point.
*
* @internal
*/
U_CAPI UBool U_EXPORT2
unorm_usesNewImplementation();
inline uint16_t
unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
    /*
     * Two-step trie lookup through a single pointer:
     * the upper bits of the code unit select a stage-1 entry, whose value
     * is the offset (relative to the same pointer) of the stage-2 block;
     * the lower bits of the code unit select the entry inside that block.
     */
    const uint16_t blockOffset = fcdTrieIndex[c >> _NORM_TRIE_SHIFT];

    /* The entry found holds the 16-bit FCD value for this code unit. */
    return fcdTrieIndex[blockOffset + (c & _NORM_STAGE_2_MASK)];
}
/**
* internal API, used by collation code
* Get the FCD value for a supplementary code point, with
* bits 15..8 lead combining class
* bits 7..0 trail combining class
*
* @param fcd16 The FCD value for the lead surrogate, not 0.
* @param c2 The trail surrogate code unit.
*
* @internal
*/
inline uint16_t
unorm_getFCD16FromSurrogatePair(const uint16_t *fcdTrieIndex, uint16_t fcd16, UChar c2) {
    /*
     * fcd16 (the non-zero value looked up for the lead surrogate) carries
     * the surrogate index, an absolute offset over the start of stage 1.
     * Shift it up by 10 bits and merge in the low 10 bits of the trail
     * surrogate to form the value used for the trie lookup.
     */
    const uint32_t trailBits = c2 & 0x3ff;
    const uint32_t c = ((uint32_t)fcd16 << 10) | trailBits;

    /* Same two-step lookup as for a single code unit, using the combined value. */
    return fcdTrieIndex[fcdTrieIndex[c >> _NORM_TRIE_SHIFT] + (c & _NORM_STAGE_2_MASK)];
}
#endif
#endif

View file

@ -29,6 +29,7 @@
#include "unicode/unorm.h"
#include "unicode/udata.h"
#include "unormimp.h"
#include "cpputils.h"
#include "cstring.h"
#include "ucmp32.h"
@ -51,8 +52,6 @@
static UCollator* UCA = NULL;
extern "C" UBool checkFCD(const UChar*, int32_t, UErrorCode*);
U_CDECL_BEGIN
static UBool U_CALLCONV
isAcceptableUCA(void * /*context*/,
@ -672,14 +671,7 @@ void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode
opts->alternateHandling = result->alternateHandling;
}
U_CAPI const uint16_t * getFCHK_STAGE_1_(UErrorCode *);
U_CAPI const uint16_t * getFCHK_STAGE_2_(UErrorCode *);
U_CAPI const uint16_t * getFCHK_STAGE_3_(UErrorCode *);
static const uint16_t *FCD_STAGE_1_;
static const uint16_t *FCD_STAGE_2_;
static const uint16_t *FCD_STAGE_3_;
static const uint16_t *fcdTrieIndex=NULL;
/**
@ -807,14 +799,8 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UEr
result->expansionCESize = (uint8_t*)result->image +
result->image->expansionCESize;
if (FCD_STAGE_1_ == NULL) {
FCD_STAGE_1_ = getFCHK_STAGE_1_(status);
}
if (FCD_STAGE_2_ == NULL) {
FCD_STAGE_2_ = getFCHK_STAGE_2_(status);
}
if (FCD_STAGE_3_ == NULL) {
FCD_STAGE_3_ = getFCHK_STAGE_3_(status);
if (fcdTrieIndex == NULL) {
fcdTrieIndex = unorm_getFCDTrie(status);
}
result->errorCode = *status;
@ -929,10 +915,8 @@ void collIterNormalize(collIterate *collationSource)
/* True because the previous call to this function will have always exited */
/* that way, and we get called for every char where cc might be non-zero. */
inline UBool collIterFCD(collIterate *collationSource) {
UChar32 codepoint;
UChar *srcP;
int32_t length;
int32_t count = 0;
UChar c, c2;
const UChar *srcP, *endP;
uint8_t leadingCC;
uint8_t prevTrailingCC = 0;
uint16_t fcd;
@ -940,52 +924,64 @@ inline UBool collIterFCD(collIterate *collationSource) {
srcP = collationSource->pos-1;
// If the source string is null terminated, use a fake too-long string length
// (needed for UTF_NEXT_CHAR). null will stop everything OK.)
length = (collationSource->flags & UCOL_ITER_HASLEN) ? collationSource->endp - srcP : INT32_MAX;
if (collationSource->flags & UCOL_ITER_HASLEN) {
endP = collationSource->endp;
} else {
endP = NULL;
}
// Get the trailing combining class of the current character. If it's zero,
// we are OK.
UTF_NEXT_CHAR(srcP, count, length, codepoint);
c = *srcP++;
/* trie access */
fcd = FCD_STAGE_3_[
FCD_STAGE_2_[FCD_STAGE_1_[codepoint >> STAGE_1_SHIFT_] +
((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] +
(codepoint & STAGE_3_MASK_)];
prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
if (prevTrailingCC != 0) {
// The current char has a non-zero trailing CC. Scan forward until we find
// a char with a leading cc of zero.
for (;;)
{
if (count >= length) {
break;
fcd = unorm_getFCD16(fcdTrieIndex, c);
if (fcd != 0) {
if (UTF_IS_FIRST_SURROGATE(c)) {
if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
++srcP;
fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
} else {
fcd = 0;
}
int32_t savedCount = count;
UTF_NEXT_CHAR(srcP, count, length, codepoint);
}
/* trie access */
fcd = FCD_STAGE_3_[
FCD_STAGE_2_[FCD_STAGE_1_[codepoint >> STAGE_1_SHIFT_] +
((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] +
(codepoint & STAGE_3_MASK_)];
leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
if (leadingCC == 0) {
count = savedCount; // Hit char that is not part of combining sequence.
// back up over it. (Could be surrogate pair!)
break;
prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
if (prevTrailingCC != 0) {
// The current char has a non-zero trailing CC. Scan forward until we find
// a char with a leading cc of zero.
while (endP == NULL || srcP != endP)
{
const UChar *savedSrcP = srcP;
c = *srcP++;
/* trie access */
fcd = unorm_getFCD16(fcdTrieIndex, c);
if (fcd != 0 && UTF_IS_FIRST_SURROGATE(c)) {
if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
++srcP;
fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
} else {
fcd = 0;
}
}
leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
if (leadingCC == 0) {
srcP = savedSrcP; // Hit char that is not part of combining sequence.
// back up over it. (Could be surrogate pair!)
break;
}
if (leadingCC < prevTrailingCC) {
needNormalize = TRUE;
}
prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
}
if (leadingCC < prevTrailingCC) {
needNormalize = TRUE;
}
prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
}
}
collationSource->fcdPosition = srcP + count;
collationSource->fcdPosition = (UChar *)srcP;
return needNormalize;
}
@ -1208,23 +1204,29 @@ void collPrevIterNormalize(collIterate *data)
*/
inline UBool collPrevIterFCD(collIterate *data)
{
UChar32 codepoint;
const UChar *src, *start;
UChar c, c2;
uint8_t leadingCC;
uint8_t trailingCC = 0;
uint16_t fcd;
UBool result = FALSE;
int32_t length;
length = (data->pos + 1) - data->string;
start = data->string;
src = data->pos + 1;
/* Get the trailing combining class of the current character. */
UTF_PREV_CHAR(data->string, 0, length, codepoint);
/* trie access */
fcd = FCD_STAGE_3_[
FCD_STAGE_2_[FCD_STAGE_1_[codepoint >> STAGE_1_SHIFT_] +
((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] +
(codepoint & STAGE_3_MASK_)];
c = *--src;
if (!UTF_IS_SURROGATE(c)) {
fcd = unorm_getFCD16(fcdTrieIndex, c);
} else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) {
--src;
fcd = unorm_getFCD16(fcdTrieIndex, c2);
if (fcd != 0) {
fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
}
} else /* unpaired surrogate */ {
fcd = 0;
}
leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
@ -1235,18 +1237,23 @@ inline UBool collPrevIterFCD(collIterate *data)
*/
for (;;)
{
if (length <= 0) {
length = -1;
break;
if (start == src) {
data->fcdPosition = NULL;
return result;
}
UTF_PREV_CHAR(data->string, 0, length, codepoint);
/* trie access */
fcd = FCD_STAGE_3_[
FCD_STAGE_2_[FCD_STAGE_1_[codepoint >> STAGE_1_SHIFT_] +
((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] +
(codepoint & STAGE_3_MASK_)];
c = *--src;
if (!UTF_IS_SURROGATE(c)) {
fcd = unorm_getFCD16(fcdTrieIndex, c);
} else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) {
--src;
fcd = unorm_getFCD16(fcdTrieIndex, c2);
if (fcd != 0) {
fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
}
} else /* unpaired surrogate */ {
fcd = 0;
}
trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
@ -1262,12 +1269,7 @@ inline UBool collPrevIterFCD(collIterate *data)
}
}
if (length < 0) {
data->fcdPosition = NULL;
}
else {
data->fcdPosition = data->string + length;
}
data->fcdPosition = (UChar *)src;
return result;
}
@ -3103,7 +3105,7 @@ ucol_calcSortKey(const UCollator *coll,
}
} else if((normMode != UCOL_OFF)
/* changed by synwee */
&& !checkFCD(source, len, status))
&& UNORM_YES!=unorm_quickCheck(source, len, UNORM_FCD, status))
{
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status);
if(U_FAILURE(*status)) {
@ -3595,7 +3597,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
/* If we need to normalize, we'll do it all at once at the beginning! */
UColAttributeValue normMode = coll->normalizationMode;
if(normMode != UCOL_OFF) {
if (!checkFCD(source, len, status))
if (UNORM_YES!=unorm_quickCheck(source, len, UNORM_FCD, status))
{
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status);
if(U_FAILURE(*status)) {

View file

@ -26,8 +26,6 @@
#define ARRAY_LENGTH(array) (sizeof (array) / sizeof (*array))
extern UBool checkFCD(const UChar *, int32_t, UErrorCode *);
static UCollator *myCollation;
static void
@ -566,7 +564,7 @@ void TestCheckFCD()
{0x0061, 0x030A, 0x00E2, 0x0323, 0},
{0x0061, 0x0323, 0x00E2, 0x0323, 0},
{0x0061, 0x0323, 0x1E05, 0x0302, 0} };
const UBool result[] = {TRUE, FALSE, FALSE, TRUE};
const UBool result[] = {UNORM_YES, UNORM_NO, UNORM_NO, UNORM_YES};
const UChar datachar[] = {0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
0x6a,
@ -581,26 +579,26 @@ void TestCheckFCD()
int count = 0;
if (checkFCD(FAST_, 10, &status) != TRUE)
log_err("checkFCD failed: expected value for fast checkFCD is TRUE\n");
if (checkFCD(FALSE_, 10, &status) != FALSE)
log_err("checkFCD failed: expected value for error checkFCD is FALSE\n");
if (checkFCD(TRUE_, 10, &status) != TRUE)
log_err("checkFCD failed: expected value for correct checkFCD is TRUE\n");
if (unorm_quickCheck(FAST_, 10, UNORM_FCD, &status) != UNORM_YES)
log_err("unorm_quickCheck(FCD) failed: expected value for fast unorm_quickCheck is UNORM_YES\n");
if (unorm_quickCheck(FALSE_, 10, UNORM_FCD, &status) != UNORM_NO)
log_err("unorm_quickCheck(FCD) failed: expected value for error unorm_quickCheck is UNORM_NO\n");
if (unorm_quickCheck(TRUE_, 10, UNORM_FCD, &status) != UNORM_YES)
log_err("unorm_quickCheck(FCD) failed: expected value for correct unorm_quickCheck is UNORM_YES\n");
if (U_FAILURE(status))
log_err("checkFCD failed: %s\n", u_errorName(status));
log_err("unorm_quickCheck(FCD) failed: %s\n", u_errorName(status));
while (count < 4)
{
UBool fcdresult = checkFCD(datastr[count], 4, &status);
UBool fcdresult = unorm_quickCheck(datastr[count], 4, UNORM_FCD, &status);
if (U_FAILURE(status)) {
log_err("checkFCD failed: exception occured at data set %d\n", count);
log_err("unorm_quickCheck(FCD) failed: exception occured at data set %d\n", count);
break;
}
else {
if (result[count] != fcdresult) {
log_err("checkFCD failed: Data set %d expected value %d\n", count,
log_err("unorm_quickCheck(FCD) failed: Data set %d expected value %d\n", count,
result[count]);
}
}
@ -614,7 +612,7 @@ void TestCheckFCD()
for (count = 0; count < 50; count ++)
{
int size = 0;
UBool testresult = TRUE;
UBool testresult = UNORM_YES;
UChar data[20];
UChar norm[100];
UChar nfd[100];
@ -627,7 +625,7 @@ void TestCheckFCD()
normsize += unorm_normalize(data + size, 1, UCOL_DECOMP_CAN, UCOL_IGNORE_HANGUL,
norm + normsize, 100 - normsize, &status);
if (U_FAILURE(status)) {
log_err("checkFCD failed: exception occured at data generation\n");
log_err("unorm_quickCheck(FCD) failed: exception occured at data generation\n");
break;
}
size ++;
@ -637,21 +635,21 @@ void TestCheckFCD()
nfdsize = unorm_normalize(data, size, UCOL_DECOMP_CAN, UCOL_IGNORE_HANGUL,
nfd, 100, &status);
if (U_FAILURE(status)) {
log_err("checkFCD failed: exception occured at normalized data generation\n");
log_err("unorm_quickCheck(FCD) failed: exception occured at normalized data generation\n");
}
if (nfdsize != normsize || u_memcmp(nfd, norm, nfdsize) != 0) {
testresult = FALSE;
testresult = UNORM_NO;
}
if (testresult == TRUE) {
log_verbose("result TRUE\n");
if (testresult == UNORM_YES) {
log_verbose("result UNORM_YES\n");
}
else {
log_verbose("result FALSE\n");
log_verbose("result UNORM_NO\n");
}
if (checkFCD(data, size, &status) != testresult || U_FAILURE(status)) {
log_err("checkFCD failed: expected %d for random data\n", testresult);
if (unorm_quickCheck(data, size, UNORM_FCD, &status) != testresult || U_FAILURE(status)) {
log_err("unorm_quickCheck(FCD) failed: expected %d for random data\n", testresult);
}
}
}