From 6eb5998fc12e58f4f0768ae8edf0ddc878b1600b Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 17 Aug 2001 00:21:18 +0000 Subject: [PATCH] ICU-1007 replace old normalization implementation by new one, use unorm_quickCheck(UNORM_FCD) instead of the temporary checkFCD() X-SVN-Rev: 5489 --- icu4c/source/common/normlzr.cpp | 303 ++--------- icu4c/source/common/unorm.cpp | 702 ++------------------------ icu4c/source/common/unormimp.h | 69 ++- icu4c/source/i18n/ucol.cpp | 174 +++---- icu4c/source/test/cintltst/cnormtst.c | 42 +- 5 files changed, 258 insertions(+), 1032 deletions(-) diff --git a/icu4c/source/common/normlzr.cpp b/icu4c/source/common/normlzr.cpp index 9de063ee37c..8e2ef843890 100644 --- a/icu4c/source/common/normlzr.cpp +++ b/icu4c/source/common/normlzr.cpp @@ -151,59 +151,33 @@ Normalizer::normalize(const UnicodeString& source, EMode mode, int32_t options, UnicodeString& result, - UErrorCode &status) -{ - if (quickCheck(source, mode, status) == UNORM_YES) - { - result = source; - return; - } - - /* ### TODO: begin new implementation */ - if(unorm_usesNewImplementation()) { - if(source.isBogus()) { + UErrorCode &status) { + if(source.isBogus()) { + result.setToBogus(); + } else { + /* make sure that we do not operate on the same buffer in source and result */ + result.cloneArrayIfNeeded(-1, source.length()+20, FALSE); + result.fLength=unorm_internalNormalize(result.fArray, result.fCapacity, + source.fArray, source.fLength, + getUNormalizationMode(mode, status), (options&IGNORE_HANGUL)!=0, + UnicodeString::growBuffer, &result, + &status); + if(U_FAILURE(status)) { result.setToBogus(); - } else { - /* make sure that we do not operate on the same buffer in source and result */ - result.cloneArrayIfNeeded(-1, source.length()+20, FALSE); - result.fLength=unorm_internalNormalize(result.fArray, result.fCapacity, - source.fArray, source.fLength, - getUNormalizationMode(mode, status), (options&IGNORE_HANGUL)!=0, - UnicodeString::growBuffer, &result, - &status); - if(U_FAILURE(status)) { - result.setToBogus(); - } } - return; } - /* ### end new implementation */ - - switch (mode) { - case NO_OP: - result = source; - break; - case COMPOSE: - case COMPOSE_COMPAT: - compose(source, (mode & COMPAT_BIT) != 0, options, result, status); - break; - case DECOMP: - case DECOMP_COMPAT: - decompose(source, (mode & COMPAT_BIT) != 0, options, result, status); - break; - } } UNormalizationCheckResult Normalizer::quickCheck(const UnicodeString& source, Normalizer::EMode mode, - UErrorCode &status) -{ - if (U_FAILURE(status)) - return UNORM_MAYBE; + UErrorCode &status) { + if(U_FAILURE(status)) { + return UNORM_MAYBE; + } - return unorm_quickCheck(source.fArray, source.length(), - getUNormalizationMode(mode, status), &status); + return unorm_quickCheck(source.fArray, source.length(), + getUNormalizationMode(mode, status), &status); } //------------------------------------------------------------------------- @@ -239,165 +213,19 @@ Normalizer::compose(const UnicodeString& source, UBool compat, int32_t options, UnicodeString& result, - UErrorCode &status) -{ - /* ### TODO: begin new implementation */ - if(unorm_usesNewImplementation()) { - if(source.isBogus()) { + UErrorCode &status) { + if(source.isBogus()) { + result.setToBogus(); + } else { + /* make sure that we do not operate on the same buffer in source and result */ + result.cloneArrayIfNeeded(-1, source.length()+20, FALSE); + result.fLength=unorm_compose(result.fArray, result.fCapacity, + source.fArray, source.fLength, + compat, (options&IGNORE_HANGUL)!=0, + UnicodeString::growBuffer, &result, + &status); + if(U_FAILURE(status)) { result.setToBogus(); - } else { - /* make sure that we do not operate on the same buffer in source and result */ - result.cloneArrayIfNeeded(-1, source.length()+20, FALSE); - result.fLength=unorm_compose(result.fArray, result.fCapacity, - source.fArray, source.fLength, - compat, (options&IGNORE_HANGUL)!=0, - UnicodeString::growBuffer, &result, - &status); - if(U_FAILURE(status)) { - result.setToBogus(); - } - } - return; - } - /* ### end new implementation */ - if (U_FAILURE(status)) { - return; - } - result.truncate(0); - UnicodeString explodeBuf; - - UTextOffset explodePos = EMPTY; // Position in input buffer - UTextOffset basePos = 0; // Position of last base in output string - uint16_t baseIndex = 0; // Index of last base in "actions" array - uint32_t classesSeen[2]; // Combining classes seen since last base - uint16_t action; - - // Compatibility explosions have lower indices; skip them if necessary - uint16_t minExplode = (uint16_t)(compat ? 0 : ComposeData::MAX_COMPAT); - uint16_t minDecompLocal = (uint16_t)(compat ? 0 : DecompData::MAX_COMPAT); - - UTextOffset i = 0; - - emptyBitmask64(classesSeen); - while (i < source.length() || explodePos != EMPTY) { - // Get the next char from either the buffer or the source - UChar ch; - if (explodePos == EMPTY) { - ch = source[i++]; - } else { - ch = explodeBuf[explodePos++]; - if (explodePos >= explodeBuf.length()) { - explodePos = EMPTY; - explodeBuf.truncate(0); - } - } - - // Get the basic info for the character - uint16_t charInfo = composeLookup(ch); - uint16_t type = (uint16_t)(charInfo & ComposeData::TYPE_MASK); - uint16_t index = (uint16_t)(charInfo >> ComposeData::INDEX_SHIFT); - - if (type == ComposeData::BASE || - (type == ComposeData::NON_COMPOSING_COMBINING && index < minExplode)) { - emptyBitmask64(classesSeen); - baseIndex = index; - basePos = result.length(); - result += ch; - } - else if (type == ComposeData::COMBINING) - { - uint32_t cclass = ComposeData::typeBit[index]; // 0..63 - - // We can only combine a character with the base if we haven't - // already seen a combining character with the same canonical class. - // We also only combine characters with an index from - // 1..COMBINING_COUNT-1. Indices >= COMBINING_COUNT are - // non-combining; these formerly had an index of zero. - if (index < ComposeData::COMBINING_COUNT - && !isSetBitmask64(classesSeen, cclass) - && (action = composeAction(baseIndex, index)) > 0) - { - if (action > ComposeData::MAX_COMPOSED) { - // Pairwise explosion. Actions above this value are really - // indices into an array that in turn contains indices - // into the exploding string table - // TODO: What if there are unprocessed chars in the explode buffer? - UChar newBase = pairExplode(explodeBuf, action); - explodePos = 0; - result[basePos] = newBase; - - baseIndex = (uint16_t)(composeLookup(newBase) >> ComposeData::INDEX_SHIFT); - } else { - // Normal pairwise combination. Replace the base char - UChar newBase = (UChar) action; - result[basePos] = newBase; - - baseIndex = (uint16_t)(composeLookup(newBase) >> ComposeData::INDEX_SHIFT); - } - // - // Since there are Unicode characters that cannot be combined in arbitrary - // order, we have to re-process any combining marks that go with this - // base character. There are only four characters in Unicode that have - // this problem. If they are fixed in Unicode 3.0, this code can go away. - // - UTextOffset len = result.length(); - if (len - basePos > 1) { - for (UTextOffset j = basePos+1; j < len; j++) { - explodeBuf += result[j]; - } - result.truncate(basePos+1); - emptyBitmask64(classesSeen); - if (explodePos == EMPTY) explodePos = 0; - } - } else { - // No combination with this character - bubbleAppend(result, ch, cclass); - setBitmask64(classesSeen, cclass); - } - } - else if (index > minExplode) { - // Single exploding character - explode(explodeBuf, index); - explodePos = 0; - } - else if (type == ComposeData::HANGUL && minExplode == 0) { - // If we're in compatibility mode we need to decompose Hangul to Jamo, - // because some of the Jamo might have compatibility decompositions. - hangulToJamo(ch, explodeBuf, minDecompLocal); - explodePos = 0; - } - else if (type == ComposeData::INITIAL_JAMO) { - emptyBitmask64(classesSeen); - baseIndex = ComposeData::INITIAL_JAMO_INDEX; - basePos = result.length(); - result += ch; - } - else if (type == ComposeData::MEDIAL_JAMO - && isEmptyBitmask64(classesSeen) - && baseIndex == ComposeData::INITIAL_JAMO_INDEX) { - // If the last character was an initial jamo, we can combine it with this - // one to create a Hangul character. - uint16_t l = (uint16_t)(result[basePos] - (UChar)JAMO_LBASE); - uint16_t v = (uint16_t)(ch - JAMO_VBASE); - result[basePos] = (UChar)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT); - - baseIndex = ComposeData::MEDIAL_JAMO_INDEX; - } - else if (type == ComposeData::FINAL_JAMO - && isEmptyBitmask64(classesSeen) - && baseIndex == ComposeData::MEDIAL_JAMO_INDEX) { - // If the last character was a medial jamo that we turned into Hangul, - // we can add this character too. - result[basePos] = (UChar)(result[basePos] + (ch - JAMO_TBASE)); - - baseIndex = 0; - basePos = -1; - emptyBitmask64(classesSeen); - } else { - baseIndex = 0; - basePos = -1; - emptyBitmask64(classesSeen); - result += ch; } } } @@ -707,68 +535,21 @@ Normalizer::decompose(const UnicodeString& source, UBool compat, int32_t options, UnicodeString& result, - UErrorCode &status) -{ - /* ### TODO: begin new implementation */ - if(unorm_usesNewImplementation()) { - if(source.isBogus()) { + UErrorCode &status) { + if(source.isBogus()) { + result.setToBogus(); + } else { + /* make sure that we do not operate on the same buffer in source and result */ + result.cloneArrayIfNeeded(-1, source.length()+20, FALSE); + result.fLength=unorm_decompose(result.fArray, result.fCapacity, + source.fArray, source.fLength, + compat, (options&IGNORE_HANGUL)!=0, + UnicodeString::growBuffer, &result, + &status); + if(U_FAILURE(status)) { result.setToBogus(); - } else { - /* make sure that we do not operate on the same buffer in source and result */ - result.cloneArrayIfNeeded(-1, source.length()+20, FALSE); - result.fLength=unorm_decompose(result.fArray, result.fCapacity, - source.fArray, source.fLength, - compat, (options&IGNORE_HANGUL)!=0, - UnicodeString::growBuffer, &result, - &status); - if(U_FAILURE(status)) { - result.setToBogus(); - } - } - return; - } - /* ### end new implementation */ - if (U_FAILURE(status)) { - return; - } - UBool hangul = (options & IGNORE_HANGUL) == 0; - uint16_t minDecompLocal = (uint16_t)(compat ? 0 : DecompData::MAX_COMPAT); - UnicodeString buffer; - int32_t i = 0, bufPtr = -1; - - result.truncate(0); - - // Rewritten - Liu - while (i < source.length() || bufPtr >= 0) { - UChar ch; - - if (bufPtr >= 0) { - ch = buffer.charAt(bufPtr++); - if (bufPtr == buffer.length()) { - bufPtr = -1; - } - } else { - ch = source[i++]; - } - - uint16_t offset = ucmp16_getu(DecompData::offsets, ch); - uint16_t index = (uint16_t)(offset & DecompData::DECOMP_MASK); - - if (index > minDecompLocal) { - if ((offset & DecompData::DECOMP_RECURSE) != 0) { - buffer.truncate(0); - doAppend((const UChar*)DecompData::contents, index, buffer); - bufPtr = 0; - } else { - doAppend((const UChar*)DecompData::contents, index, result); - } - } else if (ch >= HANGUL_BASE && ch < HANGUL_LIMIT && hangul) { - hangulToJamo(ch, result, minDecompLocal); - } else { - result += ch; } } - fixCanonical(result); } /** diff --git a/icu4c/source/common/unorm.cpp b/icu4c/source/common/unorm.cpp index bf847e97e27..c7110897707 100644 --- a/icu4c/source/common/unorm.cpp +++ b/icu4c/source/common/unorm.cpp @@ -16,6 +16,10 @@ * 02/23/01 synwee Modified quickcheck and checkFCE to run through * string for codepoints < 0x300 for the normalization * mode NFC. +* 06/20/01+ Markus Scherer total rewrite, implement all normalization here +* instead of just wrappers around normlzr.cpp, +* load unorm.dat, support Unicode 3.1 with +* supplementary code points, etc. */ #include "unicode/utypes.h" @@ -28,24 +32,7 @@ #include "umutex.h" #include "unormimp.h" -/* added by synwee ### TODO: remove once the new implementation is finished */ -#include "unicode/uchar.h" -#include "unicode/utf16.h" - -/* ### TODO: remove this once the new implementation is finished */ -static UBool useNewImplementation=FALSE; - -U_CAPI void U_EXPORT2 -unorm_setNewImplementation(UBool useNew) { - useNewImplementation=useNew; -} - -U_CAPI UBool U_EXPORT2 -unorm_usesNewImplementation() { - return useNewImplementation; -} - -/* new implementation ------------------------------------------------------- */ +/* -------------------------------------------------------------------------- */ /* Korean Hangul and Jamo constants */ enum { @@ -181,6 +168,15 @@ unorm_haveData(UErrorCode *pErrorCode) { return _haveData(*pErrorCode); } +U_CAPI const uint16_t * U_EXPORT2 +unorm_getFCDTrie(UErrorCode *pErrorCode) { + if(_haveData(*pErrorCode)) { + return fcdTrieIndex; + } else { + return NULL; + } +} + /* data access primitives --------------------------------------------------- */ inline uint32_t @@ -625,8 +621,8 @@ unorm_checkFCD(const UChar *src, int32_t srcLength) { } } -static UNormalizationCheckResult -_unorm_quickCheck(const UChar *src, +U_CAPI UNormalizationCheckResult U_EXPORT2 +unorm_quickCheck(const UChar *src, int32_t srcLength, UNormalizationMode mode, UErrorCode *pErrorCode) { @@ -751,7 +747,7 @@ U_CFUNC int32_t unorm_decompose(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UBool compat, UBool ignoreHangul, - GrowBuffer *growBuffer, void *context, + UGrowBuffer *growBuffer, void *context, UErrorCode *pErrorCode) { UChar buffer[3]; const UChar *limit, *prevSrc, *p; @@ -1046,7 +1042,7 @@ _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) { static uint8_t _decomposeFCD(const UChar *src, const UChar *decompLimit, const UChar *limit, UChar *dest, int32_t &destIndex, int32_t &destCapacity, - UBool canGrow, GrowBuffer *growBuffer, void *context) { + UBool canGrow, UGrowBuffer *growBuffer, void *context) { UChar *reorderStart; const UChar *p; uint32_t norm32; @@ -1167,7 +1163,7 @@ _decomposeFCD(const UChar *src, const UChar *decompLimit, const UChar *limit, static int32_t unorm_makeFCD(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, - GrowBuffer *growBuffer, void *context, + UGrowBuffer *growBuffer, void *context, UErrorCode *pErrorCode) { const UChar *limit, *prevSrc, *decompStart; int32_t destIndex, length; @@ -1989,7 +1985,7 @@ U_CFUNC int32_t unorm_compose(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UBool compat, UBool /* ### TODO: need to do this? -- ignoreHangul -- ### */, - GrowBuffer *growBuffer, void *context, + UGrowBuffer *growBuffer, void *context, UErrorCode *pErrorCode) { UChar stackBuffer[_STACK_BUFFER_CAPACITY]; UChar *buffer; @@ -2271,7 +2267,7 @@ U_CFUNC int32_t unorm_internalNormalize(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UNormalizationMode mode, UBool ignoreHangul, - GrowBuffer *growBuffer, void *context, + UGrowBuffer *growBuffer, void *context, UErrorCode *pErrorCode) { switch(mode) { case UNORM_NFD: @@ -2329,638 +2325,36 @@ unorm_internalNormalize(UChar *dest, int32_t destCapacity, } } - - - - - - - - - - - - - - - - - -/* old implementation ------------------------------------------------------- */ - -/* added by synwee for trie manipulation*/ -#define STAGE_1_SHIFT_ 10 -#define STAGE_2_SHIFT_ 4 -#define STAGE_2_MASK_AFTER_SHIFT_ 0x3F -#define STAGE_3_MASK_ 0xF -#define LAST_BYTE_MASK_ 0xFF -#define SECOND_LAST_BYTE_SHIFT_ 8 - -/* added by synwee for fast route in quickcheck and fcd */ -#define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300 - -/* - * for a description of the file format, - * see icu/source/tools/genqchk/genqchk.c - */ -#define QCHK_DATA_NAME "qchk" -#define FCHK_DATA_NAME "fchk" -#define DATA_TYPE "dat" - -static UDataMemory *quickcheckData = NULL; -static UDataMemory *fcdcheckData = NULL; - -/** -* Authentication values -*/ -static const uint8_t QCHK_DATA_FORMAT_[] = {0x71, 0x63, 0x68, 0x6b}; -static const uint8_t FCHK_DATA_FORMAT_[] = {0x66, 0x63, 0x68, 0x6b}; -static const uint8_t QCHK_FORMAT_VERSION_[] = {1, 0, 0, 0}; -static const uint8_t FCHK_FORMAT_VERSION_[] = {1, 0, 0, 0}; - -/** -* index values loaded from qchk.dat. -* static uint16_t indexes[8]; -*/ -enum { - QCHK_INDEX_STAGE_2_BITS, - QCHK_INDEX_STAGE_3_BITS, - QCHK_INDEX_MIN_VALUES_SIZE, - QCHK_INDEX_STAGE_1_INDEX, - QCHK_INDEX_STAGE_2_INDEX, - QCHK_INDEX_STAGE_3_INDEX -}; - -/** -* index values loaded from qchk.dat. -* static uint16_t indexes[8]; -*/ -enum { - FCHK_INDEX_STAGE_2_BITS, - FCHK_INDEX_STAGE_3_BITS, - FCHK_INDEX_STAGE_1_INDEX, - FCHK_INDEX_STAGE_2_INDEX, - FCHK_INDEX_STAGE_3_INDEX -}; - -/** -* Array of mask for determining normalization quick check values. -* Indexes follows the values in UNormalizationMode -*/ -static const uint8_t QCHK_MASK_[] = {0, 0, 0x11, 0x22, 0x44, 0x88}; -/** -* Array of minimum codepoints that has UNORM_MAYBE or UNORM_NO quick check -* values. Indexes follows the values in UNormalizationMode. -* Generated values! Edit at your own risk. -*/ -static const UChar32 *QCHK_MIN_VALUES_; - -/** -* Flag to indicate if data has been loaded -*/ -static UBool isQuickCheckLoaded = FALSE; -static UBool isFCDCheckLoaded = FALSE; - -/** -* Minimum value to determine if quickcheck value contains a MAYBE -*/ -static const uint8_t MIN_UNORM_MAYBE_ = 0x10; - -/** -* Array of normalization form corresponding to the index code point. -* Hence codepoint 0xABCD will have normalization form QUICK_CHECK_DATA[0xABCD]. -* UQUICK_CHECK_DATA[0xABCD] is a byte containing 2 sets of 4 bits information -* representing UNORM_MAYBE and UNORM_YES.
-* bits 1 2 3 4 5678
-* NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES
-* ie if UQUICK_CHECK_DATA[0xABCD] = 10000001, this means that 0xABCD is in -* NFD form and maybe in NFKC form -*/ -static const uint16_t *QCHK_STAGE_1_; -static const uint16_t *QCHK_STAGE_2_; -static const uint8_t *QCHK_STAGE_3_; - -/** -* Trie data for FCD. -* Each index corresponds to each code point. -* Trie value is the combining class of the first and the last character of the -* NFD of the codepoint. -* size uint16_t for the first 2 stages instead of uint32_t to reduce size. -*/ -static const uint16_t *FCHK_STAGE_1_; -static const uint16_t *FCHK_STAGE_2_; -static const uint16_t *FCHK_STAGE_3_; - +/** Public API for normalizing. */ U_CAPI int32_t -unorm_normalize(const UChar* src, - int32_t srcLength, - UNormalizationMode mode, - int32_t option, - UChar* dest, - int32_t destCapacity, - UErrorCode* pErrorCode) -{ - if(useNewImplementation) { - /* check argument values */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return 0; - } - - if( destCapacity<0 || (dest==NULL && destCapacity>0) || - src==NULL || srcLength<-1 - ) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - /* check for overlapping src and destination */ - /* ### TODO: real API may provide a temp buffer */ - if( (src>=dest && src<(dest+destCapacity)) || - (srcLength>0 && dest>=src && dest<(src+srcLength)) - ) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - return unorm_internalNormalize(dest, destCapacity, - src, srcLength, - mode, (UBool)((option&UNORM_IGNORE_HANGUL)!=0), - NULL, NULL, - pErrorCode); +unorm_normalize(const UChar *src, int32_t srcLength, + UNormalizationMode mode, int32_t option, + UChar *dest, int32_t destCapacity, + UErrorCode *pErrorCode) { + /* check argument values */ + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; } - if(U_FAILURE(*pErrorCode)) return -1; + if( destCapacity<0 || (dest==NULL && destCapacity>0) || + src==NULL || srcLength<-1 + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } - /* synwee : removed hard coded conversion */ - Normalizer::EMode normMode = Normalizer::getNormalizerEMode(mode, *pErrorCode); - if (U_FAILURE(*pErrorCode)) - return -1; + /* check for overlapping src and destination */ + /* ### TODO: real API may provide a temp buffer */ + if( (src>=dest && src<(dest+destCapacity)) || + (srcLength>0 && dest>=src && dest<(src+srcLength)) + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } - int32_t len = (srcLength == -1 ? u_strlen(src) : srcLength); - const UnicodeString source(srcLength == -1, src, len); - UnicodeString dst(dest, 0, destCapacity); - /* synwee : note quickcheck is added in C ++ normalize method */ - if ((option & UNORM_IGNORE_HANGUL) != 0) - option = Normalizer::IGNORE_HANGUL; - Normalizer::normalize(source, normMode, option, dst, *pErrorCode); - return uprv_fillOutputString(dst, dest, destCapacity, pErrorCode); -} - -static UBool U_CALLCONV -isQuickCheckAcceptable(void *context, - const char *type, const char *name, - const UDataInfo *pInfo) { - if (pInfo->size >= 20 && - pInfo->isBigEndian == U_IS_BIG_ENDIAN && - pInfo->charsetFamily == U_CHARSET_FAMILY && - (uprv_memcmp(pInfo->dataFormat, QCHK_DATA_FORMAT_, - sizeof(QCHK_DATA_FORMAT_)) == 0) && - /* - pInfo->dataFormat[0] == 0x71 && - pInfo->dataFormat[1] == 0x63 && - pInfo->dataFormat[2] == 0x68 && - pInfo->dataFormat[3] == 0x6b && - pInfo->formatVersion[0] == 1 - */ - (uprv_memcmp(pInfo->formatVersion, QCHK_FORMAT_VERSION_, - sizeof(QCHK_FORMAT_VERSION_)) == 0)) { - return TRUE; - } else { - context = NULL; - type = NULL; - name = NULL; - return FALSE; - } -} - -static UBool -loadQuickCheckData(UErrorCode *error) { - /* load quickcheck data from file if necessary */ - if (!isQuickCheckLoaded && U_SUCCESS(*error)) { - UDataMemory *data; - - /* open the data outside the mutex block */ - data = udata_openChoice(NULL, DATA_TYPE, QCHK_DATA_NAME, - isQuickCheckAcceptable, NULL, error); - if (U_FAILURE(*error)) { - return isQuickCheckLoaded = FALSE; - } - - /* in the mutex block, set the data for this process */ - umtx_lock(NULL); - if (quickcheckData == NULL) { - const uint16_t *temp = (const uint16_t *)udata_getMemory(data); - const uint16_t *indexes = temp; - - quickcheckData = data; - - temp += 8; - QCHK_MIN_VALUES_ = (const UChar32 *)temp; - QCHK_STAGE_1_ = temp + indexes[QCHK_INDEX_STAGE_1_INDEX]; - QCHK_STAGE_2_ = temp + indexes[QCHK_INDEX_STAGE_2_INDEX]; - QCHK_STAGE_3_ = (const uint8_t *)(temp + - indexes[QCHK_INDEX_STAGE_3_INDEX]); - data = NULL; - } - umtx_unlock(NULL); - - isQuickCheckLoaded = TRUE; - - /* if a different thread set it first, then close the extra data */ - if (data != NULL) { - udata_close(data); /* NULL if it was set correctly */ - } - } - - return isQuickCheckLoaded; -} - -/** - * Performing quick check on a string, to quickly determine if the string is - * in a particular normalization format. - * Three types of result can be returned UNORM_YES, UNORM_NO or - * UNORM_MAYBE. Result UNORM_YES indicates that the argument - * string is in the desired normalized format, UNORM_NO determines that - * argument string is not in the desired normalized format. A - * UNORM_MAYBE result indicates that a more thorough check is required, - * the user may have to put the string in its normalized form and compare the - * results. - * @param source string for determining if it is in a normalized format - * @param sourcelength length of source to test - * @param mode normalization format from the enum UNormalizationMode - * @param status A pointer to an UErrorCode to receive any errors - * @return UNORM_YES, UNORM_NO or UNORM_MAYBE - */ -U_CAPI UNormalizationCheckResult -unorm_quickCheck(const UChar *source, - int32_t sourcelength, - UNormalizationMode mode, - UErrorCode* status) -{ - uint8_t oldcombiningclass = 0; - uint8_t combiningclass; - uint8_t quickcheckvalue; - uint8_t mask = QCHK_MASK_[mode]; - UChar32 min; - UChar32 codepoint; - UNormalizationCheckResult result = UNORM_YES; - const UChar *psource; - const UChar *pend = 0; - - if(useNewImplementation) { - return _unorm_quickCheck(source, sourcelength, mode, status); - } - - if (!loadQuickCheckData(status) || U_FAILURE(*status)) { - return UNORM_MAYBE; - } - - min = QCHK_MIN_VALUES_[mode]; - - /* checking argument*/ - if (mode >= UNORM_MODE_COUNT || mode < UNORM_NONE) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return UNORM_MAYBE; - } - - if (sourcelength >= 0) { - psource = source; - pend = source + sourcelength; - for (;;) { - if (psource >= pend) { - return UNORM_YES; - } - /* fast route : since codepoints < min has combining class 0 and YES - looking at the minimum values, surrogates are not a problem */ - if (*psource >= min) { - break; - } - psource ++; - } - } - else { - psource = source; - for (;;) { - if (*psource == 0) { - return UNORM_YES; - } - /* fast route : since codepoints < min has combining class 0 and YES - looking at the minimum values, surrogates are not a problem */ - if (*psource >= min) { - break; - } - psource ++; - } - } - - if (sourcelength >= 0) { - for (;;) { - int count = 0; - - if (psource >= pend) { - break; - } - UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); - combiningclass = u_getCombiningClass(codepoint); - /* not in canonical order */ - - if (oldcombiningclass > combiningclass && combiningclass != 0) { - return UNORM_NO; - } - - oldcombiningclass = combiningclass; - - /* trie access */ - quickcheckvalue = (uint8_t)(QCHK_STAGE_3_[ - QCHK_STAGE_2_[QCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + - ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + - (codepoint & STAGE_3_MASK_)] & mask); - /* value is a byte containing 2 sets of 4 bits information. - bits 1 2 3 4 5678
- NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES
- ie if quick[0xABCD] = 10000001, this means that 0xABCD is in NFD form - and maybe in NFKC form. */ - if (quickcheckvalue == 0) { - return UNORM_NO; - } - if (quickcheckvalue >= MIN_UNORM_MAYBE_) { - result = UNORM_MAYBE; - } - psource += count; - } - } - else { - for (;;) { - int count = 0; - UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); - if (codepoint == 0) { - break; - } - - combiningclass = u_getCombiningClass(codepoint); - /* not in canonical order */ - - if (oldcombiningclass > combiningclass && combiningclass != 0) { - return UNORM_NO; - } - - oldcombiningclass = combiningclass; - - /* trie access */ - quickcheckvalue = (uint8_t)(QCHK_STAGE_3_[ - QCHK_STAGE_2_[QCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + - ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + - (codepoint & STAGE_3_MASK_)] & mask); - /* value is a byte containing 2 sets of 4 bits information. - bits 1 2 3 4 5678
- NFKC NFC NFKD NFD MAYBES NFKC NFC NFKD NFD YES
- ie if quick[0xABCD] = 10000001, this means that 0xABCD is in NFD form - and maybe in NFKC form. */ - if (quickcheckvalue == 0) { - return UNORM_NO; - } - if (quickcheckvalue >= MIN_UNORM_MAYBE_) { - result = UNORM_MAYBE; - } - psource += count; - } - } - - return result; -} - -/* private methods ---------------------------------------------------------- */ - -static UBool U_CALLCONV -isFCDCheckAcceptable(void *context, - const char *type, const char *name, - const UDataInfo *pInfo) { - if( - pInfo->size >= 20 && - pInfo->isBigEndian == U_IS_BIG_ENDIAN && - pInfo->charsetFamily == U_CHARSET_FAMILY && - (uprv_memcmp(pInfo->dataFormat, FCHK_DATA_FORMAT_, - sizeof(FCHK_DATA_FORMAT_)) == 0) && - /* - pInfo->dataFormat[0] == 0x71 && - pInfo->dataFormat[1] == 0x63 && - pInfo->dataFormat[2] == 0x68 && - pInfo->dataFormat[3] == 0x6b && - pInfo->formatVersion[0] == 1 - */ - (uprv_memcmp(pInfo->formatVersion, FCHK_FORMAT_VERSION_, - sizeof(FCHK_FORMAT_VERSION_)) == 0)) { - return TRUE; - } else { - context = NULL; - type = NULL; - name = NULL; - return FALSE; - } -} - -static UBool -loadFCDCheckData(UErrorCode *error) { - /* load fcdcheck data from file if necessary */ - if (!isFCDCheckLoaded && U_SUCCESS(*error)) { - UDataMemory *data; - - /* open the data outside the mutex block */ - data = udata_openChoice(NULL, DATA_TYPE, FCHK_DATA_NAME, - isFCDCheckAcceptable, NULL, error); - if (U_FAILURE(*error)) { - return isFCDCheckLoaded = FALSE; - } - - /* in the mutex block, set the data for this process */ - umtx_lock(NULL); - if (fcdcheckData == NULL) { - const uint16_t *temp = (const uint16_t *)udata_getMemory(data); - const uint16_t *indexes = temp; - - fcdcheckData = data; - - temp += 8; - FCHK_STAGE_1_ = temp + indexes[FCHK_INDEX_STAGE_1_INDEX]; - FCHK_STAGE_2_ = temp + indexes[FCHK_INDEX_STAGE_2_INDEX]; - FCHK_STAGE_3_ = (const uint16_t *)(temp + - indexes[FCHK_INDEX_STAGE_3_INDEX]); - data = NULL; - } - umtx_unlock(NULL); - - isFCDCheckLoaded = TRUE; - - /* if a different thread set it first, then close the extra data */ - if (data != NULL) { - udata_close(data); /* NULL if it was set correctly */ - } - } - - return isFCDCheckLoaded; -} - -/** -* Gets the stage 1 data for checkFCD. -* @param error status -* @return checkFCD data stage 1, null if data can not be loaded -*/ -U_CAPI const uint16_t * getFCHK_STAGE_1_(UErrorCode *error) -{ - if (loadFCDCheckData(error)) { - return FCHK_STAGE_1_; - } - return NULL; -} - -/** -* Gets the stage 2 data for checkFCD. -* @param error status -* @return checkFCD data stage 2, null if data can not be loaded -*/ -U_CAPI const uint16_t * getFCHK_STAGE_2_(UErrorCode *error) -{ - if (loadFCDCheckData(error)) { - return FCHK_STAGE_2_; - } - return NULL; -} - -/** -* Gets the stage 3 data for checkFCD. -* @param error status -* @return checkFCD data stage 3, null if data can not be loaded -*/ -U_CAPI const uint16_t * getFCHK_STAGE_3_(UErrorCode *error) -{ - if (loadFCDCheckData(error)) { - return FCHK_STAGE_3_; - } - return NULL; -} - -/** -* Private method which performs a quick FCD check on a string, to quickly -* determine if a string is in a required FCD format. -* FCD is the set of strings such that for each character in the string, -* decomposition without any canonical reordering will produce a NFD. -* @param source string for determining if it is in a normalized format -* @param sourcelength length of source to test -* @paran mode normalization format from the enum UNormalizationMode -* @param status A pointer to an UErrorCode to receive any errors -* @return TRUE if source is in FCD format, FALSE otherwise -*/ -U_CAPI UBool -checkFCD(const UChar* source, int32_t sourcelength, UErrorCode* status) -{ - if(useNewImplementation) { - return UNORM_YES==unorm_quickCheck(source, sourcelength, UNORM_FCD, status); - } - - UChar32 codepoint; - const UChar *psource; - const UChar *pend = 0; - uint8_t oldfcdtrail = 0; - uint16_t fcd = 0; - - if (!loadFCDCheckData(status) || U_FAILURE(*status)) { - return FALSE; - } - - if (sourcelength >= 0) { - psource = source; - pend = source + sourcelength; - for (;;) { - if (psource >= pend) { - return TRUE; - } - /* fast route : since codepoints < NFC_ZER_CC_BLOCK_LIMIT_ has - combining class 0. - looking at the minimum values, surrogates are not a problem */ - if (*psource >= NFC_ZERO_CC_BLOCK_LIMIT_) { - break; - } - psource ++; - } - } - else { - psource = source; - for (;;) { - if (*psource == 0) { - return TRUE; - } - /* fast route : since codepoints < min has combining class 0 and YES - looking at the minimum values, surrogates are not a problem */ - if (*psource >= NFC_ZERO_CC_BLOCK_LIMIT_) { - break; - } - psource ++; - } - } - - /* not end of string and yet failed simple compare - safe to shift back one char because the previous char has to be < 0x300 or the - start of a string */ - if (psource == source) { - oldfcdtrail = 0; - } - else { - codepoint = *(psource - 1); - oldfcdtrail = (uint8_t)(FCHK_STAGE_3_[ - FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + - ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] - + (codepoint & STAGE_3_MASK_)] & LAST_BYTE_MASK_); - } - - if (sourcelength >= 0) { - for (;;) { - int count = 0; - uint8_t lead; - - if (psource >= pend) { - return TRUE; - } - - UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); - - /* trie access */ - fcd = FCHK_STAGE_3_[ - FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + - ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + - (codepoint & STAGE_3_MASK_)]; - lead = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); - - if (lead != 0 && oldfcdtrail > lead) { - return FALSE; - } - oldfcdtrail = (uint8_t)(fcd & LAST_BYTE_MASK_); - - psource += count; - } - } - else { - for (;;) { - int count = 0; - uint8_t lead; - - UTF_NEXT_CHAR(psource, count, pend - psource, codepoint); - if (codepoint == 0) { - return TRUE; - } - /* trie access */ - fcd = FCHK_STAGE_3_[ - FCHK_STAGE_2_[FCHK_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + - ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + - (codepoint & STAGE_3_MASK_)]; - - lead = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); - - if (lead != 0 && oldfcdtrail > lead) { - return FALSE; - } - oldfcdtrail = (uint8_t)(fcd & LAST_BYTE_MASK_); - psource += count; - } - } - return TRUE; + return unorm_internalNormalize(dest, destCapacity, + src, srcLength, + mode, (UBool)((option&UNORM_IGNORE_HANGUL)!=0), + NULL, NULL, + pErrorCode); } diff --git a/icu4c/source/common/unormimp.h b/icu4c/source/common/unormimp.h index 63a556dfb40..a08cfeae2da 100644 --- a/icu4c/source/common/unormimp.h +++ b/icu4c/source/common/unormimp.h @@ -146,7 +146,7 @@ U_CFUNC int32_t unorm_internalNormalize(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UNormalizationMode mode, UBool ignoreHangul, - GrowBuffer *growBuffer, void *context, + UGrowBuffer *growBuffer, void *context, UErrorCode *pErrorCode); /** @@ -157,7 +157,7 @@ U_CFUNC int32_t unorm_decompose(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UBool compat, UBool ignoreHangul, - GrowBuffer *growBuffer, void *context, + UGrowBuffer *growBuffer, void *context, UErrorCode *pErrorCode); /** @@ -168,21 +168,72 @@ U_CFUNC int32_t unorm_compose(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UBool compat, UBool ignoreHangul, - GrowBuffer *growBuffer, void *context, + UGrowBuffer *growBuffer, void *context, UErrorCode *pErrorCode); /** - * internal API, but used by tests + * internal API, used by collation code + * Get access to the internal FCD trie table to be able to perform + * incremental, per-code unit, FCD checks in collation. + * One pointer is sufficient because the trie index values are offset + * by the index size, so that the same pointer is used to access the trie data. * @internal */ -U_CAPI void U_EXPORT2 -unorm_setNewImplementation(UBool useNew); +U_CAPI const uint16_t * U_EXPORT2 +unorm_getFCDTrie(UErrorCode *pErrorCode); + +#ifdef XP_CPLUSPLUS /** - * internal API, but used by tests + * internal API, used by collation code + * Get the FCD value for a code unit, with + * bits 15..8 lead combining class + * bits 7..0 trail combining class + * + * If c is a lead surrogate and the value is not 0, + * then instead of combining classes the value + * is used in unorm_getFCD16FromSurrogatePair() to get the real value + * of the supplementary code point. + * * @internal */ -U_CAPI UBool U_EXPORT2 -unorm_usesNewImplementation(); +inline uint16_t +unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) { + return + fcdTrieIndex[ + fcdTrieIndex[ + c>>_NORM_TRIE_SHIFT + ]+ + (c&_NORM_STAGE_2_MASK) + ]; +} + +/** + * internal API, used by collation code + * Get the FCD value for a supplementary code point, with + * bits 15..8 lead combining class + * bits 7..0 trail combining class + * + * @param fcd16 The FCD value for the lead surrogate, not 0. + * @param c2 The trail surrogate code unit. + * + * @internal + */ +inline uint16_t +unorm_getFCD16FromSurrogatePair(const uint16_t *fcdTrieIndex, uint16_t fcd16, UChar c2) { + /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */ + uint32_t c= + ((uint32_t)fcd16<<10)| + (c2&0x3ff); + return + fcdTrieIndex[ + fcdTrieIndex[ + c>>_NORM_TRIE_SHIFT + ]+ + (c&_NORM_STAGE_2_MASK) + ]; +} + +#endif #endif diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index 60854612883..ff604bbc194 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -29,6 +29,7 @@ #include "unicode/unorm.h" #include "unicode/udata.h" +#include "unormimp.h" #include "cpputils.h" #include "cstring.h" #include "ucmp32.h" @@ -51,8 +52,6 @@ static UCollator* UCA = NULL; -extern "C" UBool checkFCD(const UChar*, int32_t, UErrorCode*); - U_CDECL_BEGIN static UBool U_CALLCONV isAcceptableUCA(void * /*context*/, @@ -672,14 +671,7 @@ void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode opts->alternateHandling = result->alternateHandling; } - -U_CAPI const uint16_t * getFCHK_STAGE_1_(UErrorCode *); -U_CAPI const uint16_t * getFCHK_STAGE_2_(UErrorCode *); -U_CAPI const uint16_t * getFCHK_STAGE_3_(UErrorCode *); - -static const uint16_t *FCD_STAGE_1_; -static const uint16_t *FCD_STAGE_2_; -static const uint16_t *FCD_STAGE_3_; +static const uint16_t *fcdTrieIndex=NULL; /** @@ -807,14 +799,8 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UEr result->expansionCESize = (uint8_t*)result->image + result->image->expansionCESize; - if (FCD_STAGE_1_ == NULL) { - FCD_STAGE_1_ = getFCHK_STAGE_1_(status); - } - if (FCD_STAGE_2_ == NULL) { - FCD_STAGE_2_ = getFCHK_STAGE_2_(status); - } - if (FCD_STAGE_3_ == NULL) { - FCD_STAGE_3_ = getFCHK_STAGE_3_(status); + if (fcdTrieIndex == NULL) { + fcdTrieIndex = unorm_getFCDTrie(status); } result->errorCode = *status; @@ -929,10 +915,8 @@ void collIterNormalize(collIterate *collationSource) /* True because the previous call to this function will have always exited */ /* that way, and we get called for every char where cc might be non-zero. */ inline UBool collIterFCD(collIterate *collationSource) { - UChar32 codepoint; - UChar *srcP; - int32_t length; - int32_t count = 0; + UChar c, c2; + const UChar *srcP, *endP; uint8_t leadingCC; uint8_t prevTrailingCC = 0; uint16_t fcd; @@ -940,52 +924,64 @@ inline UBool collIterFCD(collIterate *collationSource) { srcP = collationSource->pos-1; - // If the source string is null terminated, use a fake too-long string length - // (needed for UTF_NEXT_CHAR). null will stop everything OK.) - length = (collationSource->flags & UCOL_ITER_HASLEN) ? collationSource->endp - srcP : INT32_MAX; + if (collationSource->flags & UCOL_ITER_HASLEN) { + endP = collationSource->endp; + } else { + endP = NULL; + } // Get the trailing combining class of the current character. If it's zero, // we are OK. - UTF_NEXT_CHAR(srcP, count, length, codepoint); + c = *srcP++; /* trie access */ - fcd = FCD_STAGE_3_[ - FCD_STAGE_2_[FCD_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + - ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + - (codepoint & STAGE_3_MASK_)]; - prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); - - if (prevTrailingCC != 0) { - // The current char has a non-zero trailing CC. Scan forward until we find - // a char with a leading cc of zero. - for (;;) - { - if (count >= length) { - break; + fcd = unorm_getFCD16(fcdTrieIndex, c); + if (fcd != 0) { + if (UTF_IS_FIRST_SURROGATE(c)) { + if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) { + ++srcP; + fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2); + } else { + fcd = 0; } - int32_t savedCount = count; - UTF_NEXT_CHAR(srcP, count, length, codepoint); + } - /* trie access */ - fcd = FCD_STAGE_3_[ - FCD_STAGE_2_[FCD_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + - ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + - (codepoint & STAGE_3_MASK_)]; - leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); - if (leadingCC == 0) { - count = savedCount; // Hit char that is not part of combining sequence. - // back up over it. (Could be surrogate pair!) - break; + prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); + + if (prevTrailingCC != 0) { + // The current char has a non-zero trailing CC. Scan forward until we find + // a char with a leading cc of zero. + while (endP == NULL || srcP != endP) + { + const UChar *savedSrcP = srcP; + + c = *srcP++; + /* trie access */ + fcd = unorm_getFCD16(fcdTrieIndex, c); + if (fcd != 0 && UTF_IS_FIRST_SURROGATE(c)) { + if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) { + ++srcP; + fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2); + } else { + fcd = 0; + } + } + leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); + if (leadingCC == 0) { + srcP = savedSrcP; // Hit char that is not part of combining sequence. + // back up over it. (Could be surrogate pair!) + break; + } + + if (leadingCC < prevTrailingCC) { + needNormalize = TRUE; + } + + prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); } - - if (leadingCC < prevTrailingCC) { - needNormalize = TRUE; - } - - prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); } } - collationSource->fcdPosition = srcP + count; + collationSource->fcdPosition = (UChar *)srcP; return needNormalize; } @@ -1208,23 +1204,29 @@ void collPrevIterNormalize(collIterate *data) */ inline UBool collPrevIterFCD(collIterate *data) { - UChar32 codepoint; + const UChar *src, *start; + UChar c, c2; uint8_t leadingCC; uint8_t trailingCC = 0; uint16_t fcd; UBool result = FALSE; - int32_t length; - length = (data->pos + 1) - data->string; + start = data->string; + src = data->pos + 1; /* Get the trailing combining class of the current character. */ - UTF_PREV_CHAR(data->string, 0, length, codepoint); - - /* trie access */ - fcd = FCD_STAGE_3_[ - FCD_STAGE_2_[FCD_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + - ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + - (codepoint & STAGE_3_MASK_)]; + c = *--src; + if (!UTF_IS_SURROGATE(c)) { + fcd = unorm_getFCD16(fcdTrieIndex, c); + } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) { + --src; + fcd = unorm_getFCD16(fcdTrieIndex, c2); + if (fcd != 0) { + fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c); + } + } else /* unpaired surrogate */ { + fcd = 0; + } leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); @@ -1235,18 +1237,23 @@ inline UBool collPrevIterFCD(collIterate *data) */ for (;;) { - if (length <= 0) { - length = -1; - break; + if (start == src) { + data->fcdPosition = NULL; + return result; } - UTF_PREV_CHAR(data->string, 0, length, codepoint); - - /* trie access */ - fcd = FCD_STAGE_3_[ - FCD_STAGE_2_[FCD_STAGE_1_[codepoint >> STAGE_1_SHIFT_] + - ((codepoint >> STAGE_2_SHIFT_) & STAGE_2_MASK_AFTER_SHIFT_)] + - (codepoint & STAGE_3_MASK_)]; + c = *--src; + if (!UTF_IS_SURROGATE(c)) { + fcd = unorm_getFCD16(fcdTrieIndex, c); + } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) { + --src; + fcd = unorm_getFCD16(fcdTrieIndex, c2); + if (fcd != 0) { + fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c); + } + } else /* unpaired surrogate */ { + fcd = 0; + } trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); @@ -1262,12 +1269,7 @@ inline UBool collPrevIterFCD(collIterate *data) } } - if (length < 0) { - data->fcdPosition = NULL; - } - else { - data->fcdPosition = data->string + length; - } + data->fcdPosition = (UChar *)src; return result; } @@ -3103,7 +3105,7 @@ ucol_calcSortKey(const UCollator *coll, } } else if((normMode != UCOL_OFF) /* changed by synwee */ - && !checkFCD(source, len, status)) + && UNORM_YES!=unorm_quickCheck(source, len, UNORM_FCD, status)) { normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status); if(U_FAILURE(*status)) { @@ -3595,7 +3597,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll, /* If we need to normalize, we'll do it all at once at the beggining! */ UColAttributeValue normMode = coll->normalizationMode; if(normMode != UCOL_OFF) { - if (!checkFCD(source, len, status)) + if (UNORM_YES!=unorm_quickCheck(source, len, UNORM_FCD, status)) { normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status); if(U_FAILURE(*status)) { diff --git a/icu4c/source/test/cintltst/cnormtst.c b/icu4c/source/test/cintltst/cnormtst.c index 0aff002659a..2535ba61121 100644 --- a/icu4c/source/test/cintltst/cnormtst.c +++ b/icu4c/source/test/cintltst/cnormtst.c @@ -26,8 +26,6 @@ #define ARRAY_LENGTH(array) (sizeof (array) / sizeof (*array)) -extern UBool checkFCD(const UChar *, int32_t, UErrorCode *); - static UCollator *myCollation; static void @@ -566,7 +564,7 @@ void TestCheckFCD() {0x0061, 0x030A, 0x00E2, 0x0323, 0}, {0x0061, 0x0323, 0x00E2, 0x0323, 0}, {0x0061, 0x0323, 0x1E05, 0x0302, 0} }; - const UBool result[] = {TRUE, FALSE, FALSE, TRUE}; + const UBool result[] = {UNORM_YES, UNORM_NO, UNORM_NO, UNORM_YES}; const UChar datachar[] = {0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, @@ -581,26 +579,26 @@ void TestCheckFCD() int count = 0; - if (checkFCD(FAST_, 10, &status) != TRUE) - log_err("checkFCD failed: expected value for fast checkFCD is TRUE\n"); - if (checkFCD(FALSE_, 10, &status) != FALSE) - log_err("checkFCD failed: expected value for error checkFCD is FALSE\n"); - if (checkFCD(TRUE_, 10, &status) != TRUE) - log_err("checkFCD failed: expected value for correct checkFCD is TRUE\n"); + if (unorm_quickCheck(FAST_, 10, UNORM_FCD, &status) != UNORM_YES) + log_err("unorm_quickCheck(FCD) failed: expected value for fast unorm_quickCheck is UNORM_YES\n"); + if (unorm_quickCheck(FALSE_, 10, UNORM_FCD, &status) != UNORM_NO) + log_err("unorm_quickCheck(FCD) failed: expected value for error unorm_quickCheck is UNORM_NO\n"); + if (unorm_quickCheck(TRUE_, 10, UNORM_FCD, &status) != UNORM_YES) + log_err("unorm_quickCheck(FCD) failed: expected value for correct unorm_quickCheck is UNORM_YES\n"); if (U_FAILURE(status)) - log_err("checkFCD failed: %s\n", u_errorName(status)); + log_err("unorm_quickCheck(FCD) failed: %s\n", u_errorName(status)); while (count < 4) { - UBool fcdresult = checkFCD(datastr[count], 4, &status); + UBool fcdresult = unorm_quickCheck(datastr[count], 4, UNORM_FCD, &status); if (U_FAILURE(status)) { - log_err("checkFCD failed: exception occured at data set %d\n", count); + log_err("unorm_quickCheck(FCD) failed: exception occured at data set %d\n", count); break; } else { if (result[count] != fcdresult) { - log_err("checkFCD failed: Data set %d expected value %d\n", count, + log_err("unorm_quickCheck(FCD) failed: Data set %d expected value %d\n", count, result[count]); } } @@ -614,7 +612,7 @@ void TestCheckFCD() for (count = 0; count < 50; count ++) { int size = 0; - UBool testresult = TRUE; + UBool testresult = UNORM_YES; UChar data[20]; UChar norm[100]; UChar nfd[100]; @@ -627,7 +625,7 @@ void TestCheckFCD() normsize += unorm_normalize(data + size, 1, UCOL_DECOMP_CAN, UCOL_IGNORE_HANGUL, norm + normsize, 100 - normsize, &status); if (U_FAILURE(status)) { - log_err("checkFCD failed: exception occured at data generation\n"); + log_err("unorm_quickCheck(FCD) failed: exception occured at data generation\n"); break; } size ++; @@ -637,21 +635,21 @@ void TestCheckFCD() nfdsize = unorm_normalize(data, size, UCOL_DECOMP_CAN, UCOL_IGNORE_HANGUL, nfd, 100, &status); if (U_FAILURE(status)) { - log_err("checkFCD failed: exception occured at normalized data generation\n"); + log_err("unorm_quickCheck(FCD) failed: exception occured at normalized data generation\n"); } if (nfdsize != normsize || u_memcmp(nfd, norm, nfdsize) != 0) { - testresult = FALSE; + testresult = UNORM_NO; } - if (testresult == TRUE) { - log_verbose("result TRUE\n"); + if (testresult == UNORM_YES) { + log_verbose("result UNORM_YES\n"); } else { - log_verbose("result FALSE\n"); + log_verbose("result UNORM_NO\n"); } - if (checkFCD(data, size, &status) != testresult || U_FAILURE(status)) { - log_err("checkFCD failed: expected %d for random data\n", testresult); + if (unorm_quickCheck(data, size, UNORM_FCD, &status) != testresult || U_FAILURE(status)) { + log_err("unorm_quickCheck(FCD) failed: expected %d for random data\n", testresult); } } }