diff --git a/icu4c/source/i18n/coll.cpp b/icu4c/source/i18n/coll.cpp index 83c7565af00..d35a7d1c47c 100644 --- a/icu4c/source/i18n/coll.cpp +++ b/icu4c/source/i18n/coll.cpp @@ -1,6 +1,6 @@ /* ****************************************************************************** - * Copyright (C) 1996-2010, International Business Machines Corporation and + * Copyright (C) 1996-2011, International Business Machines Corporation and * others. All Rights Reserved. ****************************************************************************** */ @@ -833,7 +833,8 @@ Collator::getFunctionalEquivalent(const char* keyword, const Locale& locale, return Locale::createFromName(loc); } -int32_t Collator::getReorderCodes(int32_t *dest, +int32_t U_EXPORT2 +Collator::getReorderCodes(int32_t *dest, int32_t destCapacity, UErrorCode& status) const { @@ -843,7 +844,8 @@ int32_t Collator::getReorderCodes(int32_t *dest, return 0; } -void Collator::setReorderCodes(const int32_t *reorderCodes, +void U_EXPORT2 +Collator::setReorderCodes(const int32_t *reorderCodes, int32_t reorderCodesLength, UErrorCode& status) { @@ -852,6 +854,18 @@ void Collator::setReorderCodes(const int32_t *reorderCodes, } } +int32_t U_EXPORT2 +Collator::getEquivalentReorderCodes(int32_t reorderCode, + int32_t *dest, + int32_t destCapacity, + UErrorCode& status) +{ + if (U_SUCCESS(status)) { + status = U_UNSUPPORTED_ERROR; + } + return 0; +} + // UCollator private data members ---------------------------------------- /* This is useless information */ diff --git a/icu4c/source/i18n/tblcoll.cpp b/icu4c/source/i18n/tblcoll.cpp index 15f5c490b63..f751574a316 100644 --- a/icu4c/source/i18n/tblcoll.cpp +++ b/icu4c/source/i18n/tblcoll.cpp @@ -1,6 +1,6 @@ /* ****************************************************************************** - * Copyright (C) 1996-2010, International Business Machines Corporation and + * Copyright (C) 1996-2011, International Business Machines Corporation and * others. All Rights Reserved. 
******************************************************************************
  */
@@ -601,6 +601,13 @@ void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes,
     ucol_setReorderCodes(ucollator, reorderCodes, reorderCodesLength, &status);
 }
 
+int32_t RuleBasedCollator::getEquivalentReorderCodes(int32_t reorderCode,
+                                int32_t* dest,
+                                int32_t destCapacity,
+                                UErrorCode& status)
+{
+    return ucol_getEquivalentReorderCodes(reorderCode, dest, destCapacity, &status);
+}
 
 /**
  * Create a hash code for this collation. Just hash the main rule table -- that
diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp
index 65b157cf38f..80b26a218e2 100644
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@@ -676,6 +676,9 @@ ucol_close(UCollator *coll)
         if(coll->leadBytePermutationTable != NULL) {
             uprv_free(coll->leadBytePermutationTable);
         }
+        if(coll->defaultReorderCodes != NULL) {
+            uprv_free(coll->defaultReorderCodes);
+        }
         if(coll->reorderCodes != NULL) {
             uprv_free(coll->reorderCodes);
         }
@@ -866,6 +869,8 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, con
     result->rules = NULL;
     result->rulesLength = 0;
     result->freeRulesOnClose = FALSE;
+    result->defaultReorderCodes = NULL;
+    result->defaultReorderCodesLength = 0;
     result->reorderCodes = NULL;
     result->reorderCodesLength = 0;
     result->leadBytePermutationTable = NULL;
@@ -7140,7 +7145,7 @@ ucol_getStrength(const UCollator *coll)
     return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
 }
 
-U_INTERNAL int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
 ucol_getReorderCodes(const UCollator *coll,
                     int32_t *dest,
                     int32_t destCapacity,
@@ -7164,9 +7169,9 @@ ucol_getReorderCodes(const UCollator *coll,
     return coll->reorderCodesLength;
 }
 
-U_INTERNAL void U_EXPORT2
-ucol_setReorderCodes(UCollator *coll,
-                    const int32_t *reorderCodes,
+U_CAPI void U_EXPORT2
+ucol_setReorderCodes(UCollator* coll,
+                    const int32_t* reorderCodes,
                     int32_t reorderCodesLength,
                     UErrorCode *pErrorCode) {
     if (U_FAILURE(*pErrorCode)) {
@@ -7203,6 +7208,72 @@ ucol_setReorderCodes(UCollator *coll,
     }
 }
 
+U_CAPI int32_t U_EXPORT2
+ucol_getEquivalentReorderCodes(int32_t reorderCode,
+                    int32_t* dest,
+                    int32_t destCapacity,
+                    UErrorCode *pErrorCode) {
+    // membership flags, one per script code; every entry must start out false
+    bool equivalentCodesSet[USCRIPT_CODE_LIMIT] = { false };
+    uint16_t leadBytes[256];
+    int leadBytesCount;
+    int leadByteIndex;
+    int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
+    int reorderCodesForLeadByteCount;
+    int reorderCodeIndex;
+
+    int32_t equivalentCodesCount = 0;
+    int setIndex;
+
+    if (U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
+    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    const UCollator* uca = ucol_initUCA(pErrorCode);
+    // the UCA image is dereferenced below; stop here if it could not be loaded
+    if (U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
+    for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
+        reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
+            uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
+        for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
+            int16_t equivalentCode = reorderCodesForLeadByte[reorderCodeIndex];
+            // codes at or above USCRIPT_CODE_LIMIT (e.g. 0x1000 groups) do not fit in the set
+            if (equivalentCode >= 0 && equivalentCode < USCRIPT_CODE_LIMIT) {
+                equivalentCodesSet[equivalentCode] = true;
+            }
+        }
+    }
+
+    for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
+        if (equivalentCodesSet[setIndex] == true) {
+            equivalentCodesCount++;
+        }
+    }
+
+    if (destCapacity == 0) {
+        return equivalentCodesCount;
+    }
+
+    equivalentCodesCount = 0;
+    for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
+        if (equivalentCodesSet[setIndex] == true) {
+            dest[equivalentCodesCount++] = setIndex;
+            if (equivalentCodesCount >= destCapacity) {
+                break;
+            }
+        }
+    }
+    return equivalentCodesCount;
+}
+
 /****************************************************************************/
 /* Following are misc functions */
 
diff --git a/icu4c/source/i18n/ucol_imp.h b/icu4c/source/i18n/ucol_imp.h
index 6604bab6648..56dbff862c4 100644
--- a/icu4c/source/i18n/ucol_imp.h
+++ b/icu4c/source/i18n/ucol_imp.h
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-* Copyright (C) 1998-2010, International Business Machines
+* Copyright (C) 1998-2011, International Business Machines
 * Corporation and others. All Rights Reserved.
 *
 *******************************************************************************
@@ -1025,6 +1025,8 @@ struct UCollator {
     uint8_t tertiaryBottomCount;
 
     UVersionInfo dataVersion; /* Data info of UCA table */
+    int32_t* defaultReorderCodes;
+    int32_t defaultReorderCodesLength;
     int32_t* reorderCodes;
     int32_t reorderCodesLength;
     uint8_t* leadBytePermutationTable;
@@ -1091,8 +1093,14 @@ ucol_openRulesForImport(
     const UChar *rules,
     UErrorCode *status);
 
-U_CAPI void U_EXPORT2 ucol_buildPermutationTable(UCollator *coll, UErrorCode *status);
+U_CAPI void U_EXPORT2
+ucol_buildPermutationTable(UCollator *coll, UErrorCode *status);
 
+U_CAPI int U_EXPORT2
+ucol_getLeadBytesForReorderCode(const UCollator *uca, int reorderCode, uint16_t* returnLeadBytes, int returnCapacity);
+
+U_CAPI int U_EXPORT2
+ucol_getReorderCodesForLeadByte(const UCollator *uca, int leadByte, int16_t* returnReorderCodes, int returnCapacity);
 
 #ifdef XP_CPLUSPLUS
 /*
diff --git a/icu4c/source/i18n/ucol_res.cpp b/icu4c/source/i18n/ucol_res.cpp
index a1a1e132365..846add6851d 100644
--- a/icu4c/source/i18n/ucol_res.cpp
+++ b/icu4c/source/i18n/ucol_res.cpp
@@ -1027,7 +1027,7 @@ ucol_getTailoredSet(const UCollator *coll, UErrorCode *status)
  * Collation Reordering
  */
 
-static void ucol_setReorderCodesFromParser(UCollator *coll, UColTokenParser *parser, UErrorCode
*status) { +void ucol_setReorderCodesFromParser(UCollator *coll, UColTokenParser *parser, UErrorCode *status) { if (U_FAILURE(*status)) { return; } @@ -1041,16 +1041,32 @@ static void ucol_setReorderCodesFromParser(UCollator *coll, UColTokenParser *par return; } + coll->defaultReorderCodesLength = parser->reorderCodesLength; + coll->defaultReorderCodes = (int32_t*) uprv_malloc(coll->defaultReorderCodesLength * sizeof(int32_t)); + uprv_memcpy(coll->defaultReorderCodes, parser->reorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t)); + coll->reorderCodesLength = parser->reorderCodesLength; coll->reorderCodes = (int32_t*) uprv_malloc(coll->reorderCodesLength * sizeof(int32_t)); uprv_memcpy(coll->reorderCodes, parser->reorderCodes, coll->reorderCodesLength * sizeof(int32_t)); } -static int ucol_getLeadBytesForReorderCode(UCollator *coll, int reorderCode, uint16_t* returnLeadBytes, int returnCapacity) { - uint16_t reorderCodeIndexLength = *((uint16_t*) ((uint8_t *)coll->UCA->image + coll->UCA->image->scriptToLeadByte)); - uint16_t* reorderCodeIndex = (uint16_t*) ((uint8_t *)coll->UCA->image + coll->UCA->image->scriptToLeadByte + 2 *sizeof(uint16_t)); +/* + * Data is stored in the reorder code to lead byte table as: + * index count - unsigned short (2 bytes) - number of index entries + * data size - unsigned short (2 bytes) - number of unsigned short data elements + * index[index count] - array of 2 unsigned shorts (4 bytes each entry) + * - reorder code, offset + * - index is sorted by reorder code + * - if an offset has the high bit set then it is not an offset but a single data entry + * once the high bit is stripped off + * data[data size] - array of unsigned short (2 bytes each entry) + * - the data is an usigned short count followed by count number + * of lead bytes stored in an unsigned short + */ +int ucol_getLeadBytesForReorderCode(const UCollator *uca, int reorderCode, uint16_t* returnLeadBytes, int returnCapacity) { + uint16_t reorderCodeIndexLength = 
*((uint16_t*) ((uint8_t *)uca->image + uca->image->scriptToLeadByte)); + uint16_t* reorderCodeIndex = (uint16_t*) ((uint8_t *)uca->image + uca->image->scriptToLeadByte + 2 *sizeof(uint16_t)); - // TODO - replace with a binary search // reorder code index is 2 uint16_t's - reorder code + offset for (int i = 0; i < reorderCodeIndexLength; i++) { if (reorderCode == reorderCodeIndex[i*2]) { @@ -1073,25 +1089,37 @@ static int ucol_getLeadBytesForReorderCode(UCollator *coll, int reorderCode, uin return 0; } -static int ucol_getReorderCodesForLeadByte(UCollator *coll, int leadByte, int16_t* returnReorderCodes, int returnCapacity) { - int leadByteIndexLength = *((uint16_t*) ((uint8_t *)coll->UCA->image + coll->UCA->image->leadByteToScript)); - uint16_t* leadByteIndex = (uint16_t*) ((uint8_t *)coll->UCA->image + coll->UCA->image->leadByteToScript + 2 *sizeof(uint16_t)); +/* + * Data is stored in the lead byte to reorder code table as: + * index count - unsigned short (2 bytes) - number of index entries + * data size - unsigned short (2 bytes) - number of unsigned short data elements + * index[index count] - array of unsigned short (2 bytes each entry) + * - index is sorted by lead byte + * - if an index has the high bit set then it is not an index but a single data entry + * once the high bit is stripped off + * data[data size] - array of unsigned short (2 bytes each entry) + * - the data is an usigned short count followed by count number of reorder codes + */ +int ucol_getReorderCodesForLeadByte(const UCollator *uca, int leadByte, int16_t* returnReorderCodes, int returnCapacity) { + uint16_t* leadByteTable = ((uint16_t*) ((uint8_t *)uca->image + uca->image->leadByteToScript)); + uint16_t leadByteIndexLength = *leadByteTable; if (leadByte >= leadByteIndexLength) { return 0; } - - if ((leadByteIndex[leadByte] & 0x8000) == 0x8000) { + uint16_t leadByteIndex = *(leadByteTable + (2 + leadByte)); + + if ((leadByteIndex & 0x8000) == 0x8000) { // offset isn't offset but instead is 
a single data element
         if (returnCapacity >= 1) {
-            returnReorderCodes[0] = leadByteIndex[leadByte] & ~0x8000;
+            returnReorderCodes[0] = leadByteIndex & ~0x8000;
             return 1;
         }
         return 0;
     }
-    uint16_t* dataOffsetBase = (uint16_t*) ((uint8_t *)leadByteIndex + leadByteIndexLength * (2 * sizeof(uint16_t)));
-    uint16_t reorderCodeCount = *(dataOffsetBase + leadByteIndex[leadByte]);
-    reorderCodeCount = reorderCodeCount > returnCapacity ? returnCapacity : reorderCodeCount;
-    uprv_memcpy(returnReorderCodes, dataOffsetBase + leadByteIndex[leadByte] + 1, reorderCodeCount * sizeof(uint16_t));
+    //uint16_t* dataOffsetBase = leadByteTable + (2 + leadByteIndexLength);
+    uint16_t* reorderCodeData = leadByteTable + (2 + leadByteIndexLength) + leadByteIndex;
+    uint16_t reorderCodeCount = *reorderCodeData > returnCapacity ? returnCapacity : *reorderCodeData;
+    uprv_memcpy(returnReorderCodes, reorderCodeData + 1, reorderCodeCount * sizeof(uint16_t));
     return reorderCodeCount;
 }
 
@@ -1119,17 +1147,43 @@ void ucol_buildPermutationTable(UCollator *coll, UErrorCode *status) {
     bool permutationSlotFilled[256];
 
     // nothing to do
-    if(U_FAILURE(*status) || coll == NULL || coll->reorderCodesLength == 0) {
-        if (coll != NULL) {
-            if (coll->leadBytePermutationTable != NULL) {
-                uprv_free(coll->leadBytePermutationTable);
-                coll->leadBytePermutationTable = NULL;
-            }
+    if(U_FAILURE(*status) || coll == NULL) {
+        return;
+    }
+
+    // clear the reordering
+    if (coll->reorderCodes == NULL || coll->reorderCodesLength == 0
+            || (coll->reorderCodesLength == 1 && coll->reorderCodes[0] == UCOL_REORDER_CODE_NONE)) {
+        if (coll->leadBytePermutationTable != NULL) {
+            uprv_free(coll->leadBytePermutationTable);
+            coll->leadBytePermutationTable = NULL;
             coll->reorderCodesLength = 0;
         }
         return;
     }
 
+    // set reordering to the default reordering
+    if (coll->reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
+        uprv_free(coll->reorderCodes);
+        coll->reorderCodes = NULL;
+        // keep the length in step with the freed array so every early return leaves a consistent state
+        coll->reorderCodesLength = 0;
+        uprv_free(coll->leadBytePermutationTable);
+        coll->leadBytePermutationTable = NULL;
+
+        // no default codes recorded for this collator: leave it without any reordering
+        if (coll->defaultReorderCodes == NULL || coll->defaultReorderCodesLength == 0) {
+            return;
+        }
+
+        coll->reorderCodes = (int32_t*)uprv_malloc(coll->defaultReorderCodesLength * sizeof(int32_t));
+        if (coll->reorderCodes == NULL) {
+            *status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+        coll->reorderCodesLength = coll->defaultReorderCodesLength;
+        // copy defaults -> active codes (destination first)
+        uprv_memcpy(coll->reorderCodes, coll->defaultReorderCodes, coll->reorderCodesLength * sizeof(int32_t));
+    }
+
     if (coll->leadBytePermutationTable == NULL) {
         coll->leadBytePermutationTable = (uint8_t*)uprv_malloc(256*sizeof(uint8_t));
         if (coll->leadBytePermutationTable == NULL) {
@@ -1203,7 +1257,7 @@ void ucol_buildPermutationTable(UCollator *coll, UErrorCode *status) {
             continue;
         }
 
-        uint16_t leadByteCount = ucol_getLeadBytesForReorderCode(coll, next, leadBytes, leadBytesSize);
+        uint16_t leadByteCount = ucol_getLeadBytesForReorderCode(coll->UCA, next, leadBytes, leadBytesSize);
         if (fromTheBottom) {
             for (int leadByteIndex = 0; leadByteIndex < leadByteCount; leadByteIndex++) {
                 // don't place a lead byte twice in the permutation table
diff --git a/icu4c/source/i18n/unicode/coll.h b/icu4c/source/i18n/unicode/coll.h
index 357af13d9b8..e59a43d5f5d 100644
--- a/icu4c/source/i18n/unicode/coll.h
+++ b/icu4c/source/i18n/unicode/coll.h
@@ -1,6 +1,6 @@
 /*
 ******************************************************************************
-* Copyright (C) 1996-2010, International Business Machines *
+* Copyright (C) 1996-2011, International Business Machines *
 * Corporation and others. All Rights Reserved. *
 ******************************************************************************
 */
@@ -598,29 +598,56 @@ public:
     virtual void setStrength(ECollationStrength newStrength) = 0;
 
     /**
-     * Get the current reordering of scripts (if one has been set).
+     * Retrieves the reordering codes for this collator.
      * @param dest The array to fill with the script ordering.
-     * @param destCapacity The length of dest.
If it is 0, then dest may be NULL and the function will only return the length of the result without writing any of the result string (pre-flighting). - * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a failure before the function call. - * @return The length of the array of the script ordering. - * @see ucol_getReorderCodes - * @internal + * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function + * will only return the length of the result without writing any of the result string (pre-flighting). + * @param status A reference to an error code value, which must not indicate + * a failure before the function call. + * @return The length oof the script ordering array. + * @see ucol_setReorderCodes + * @see Collator#getEquivalentReorderCodes + * @see Collator#setReorderCodes + * @draft ICU 4.8 */ - virtual int32_t getReorderCodes(int32_t *dest, + virtual int32_t U_EXPORT2 getReorderCodes(int32_t *dest, int32_t destCapacity, UErrorCode& status) const; /** - * Set the ordering of scripts for this collator. - * @param reorderCodes An array of reorder codes in the new order. + * Sets the ordering of scripts for this collator. + * @param reorderCodes An array of script codes in the new order. This can be NULL if the + * length is also set to 0. An empty array will clear any reordering codes on the collator. * @param reorderCodesLength The length of reorderCodes. - * @see ucol_setReorderCodes - * @internal + * @see Collator#getReorderCodes + * @see Collator#getEquivalentReorderCodes + * @draft ICU 4.8 */ - virtual void setReorderCodes(const int32_t* reorderCodes, + virtual void U_EXPORT2 setReorderCodes(const int32_t* reorderCodes, int32_t reorderCodesLength, UErrorCode& status) ; + /** + * Retrieves the reorder codes that are grouped with the given reorder code. Some reorder + * codes will be grouped and must reorder together. 
+ * @param reorderCode The reorder code to determine equivalence for. + * @param dest The array to fill with the script equivalene reordering codes. + * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the + * function will only return the length of the result without writing any of the result + * string (pre-flighting). + * @param status A reference to an error code value, which must not indicate + * a failure before the function call. + * @return The length of the of the reordering code equivalence array. + * @see ucol_setReorderCodes + * @see Collator#getReorderCodes + * @see Collator#setReorderCodes + * @draft ICU 4.8 + */ + static int32_t U_EXPORT2 getEquivalentReorderCodes(int32_t reorderCode, + int32_t* dest, + int32_t destCapacity, + UErrorCode& status); + /** * Get name of the object for the desired Locale, in the desired langauge * @param objectLocale must be from getAvailableLocales diff --git a/icu4c/source/i18n/unicode/tblcoll.h b/icu4c/source/i18n/unicode/tblcoll.h index 8e2225192f2..c725c50f8f1 100644 --- a/icu4c/source/i18n/unicode/tblcoll.h +++ b/icu4c/source/i18n/unicode/tblcoll.h @@ -1,6 +1,6 @@ /* ****************************************************************************** -* Copyright (C) 1996-2010, International Business Machines Corporation and +* Copyright (C) 1996-2011, International Business Machines Corporation and * others. All Rights Reserved. ****************************************************************************** */ @@ -667,28 +667,55 @@ public: virtual void setStrength(ECollationStrength newStrength); /** - * Get the current reordering of scripts (if one has been set). + * Retrieves the reordering codes for this collator. * @param dest The array to fill with the script ordering. - * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function will only return the length of the result without writing any of the result string (pre-flighting). 
- * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a failure before the function call. - * @return The length of the array of the script ordering. - * @see ucol_getReorderCodes - * @internal + * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function + * will only return the length of the result without writing any of the result string (pre-flighting). + * @param status A reference to an error code value, which must not indicate + * a failure before the function call. + * @return The length oof the script ordering array. + * @see ucol_setReorderCodes + * @see Collator#getEquivalentReorderCodes + * @see Collator#setReorderCodes + * @draft ICU 4.8 */ - virtual int32_t getReorderCodes(int32_t* dest, + virtual int32_t U_EXPORT2 getReorderCodes(int32_t *dest, int32_t destCapacity, UErrorCode& status) const; /** - * Set the ordering of scripts for this collator. - * @param reorderCodes An array of script codes in the new order. + * Sets the ordering of scripts for this collator. + * @param reorderCodes An array of script codes in the new order. This can be NULL if the + * length is also set to 0. An empty array will clear any reordering codes on the collator. * @param reorderCodesLength The length of reorderCodes. - * @see ucol_setReorderCodes - * @internal + * @see Collator#getReorderCodes + * @see Collator#getEquivalentReorderCodes + * @draft ICU 4.8 */ - virtual void setReorderCodes(const int32_t* reorderCodes, - int32_t reorderCodesLength, - UErrorCode& status); + virtual void U_EXPORT2 setReorderCodes(const int32_t* reorderCodes, + int32_t reorderCodesLength, + UErrorCode& status) ; + + /** + * Retrieves the reorder codes that are grouped with the given reorder code. Some reorder + * codes will be grouped and must reorder together. + * @param reorderCode The reorder code to determine equivalence for. + * @param dest The array to fill with the script equivalene reordering codes. 
+ * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the + * function will only return the length of the result without writing any of the result + * string (pre-flighting). + * @param status A reference to an error code value, which must not indicate + * a failure before the function call. + * @return The length of the of the reordering code equivalence array. + * @see ucol_setReorderCodes + * @see Collator#getReorderCodes + * @see Collator#setReorderCodes + * @draft ICU 4.8 + */ + static int32_t U_EXPORT2 getEquivalentReorderCodes(int32_t reorderCode, + int32_t* dest, + int32_t destCapacity, + UErrorCode& status); private: diff --git a/icu4c/source/i18n/unicode/ucol.h b/icu4c/source/i18n/unicode/ucol.h index 4a4cd606e8a..4990efc4722 100644 --- a/icu4c/source/i18n/unicode/ucol.h +++ b/icu4c/source/i18n/unicode/ucol.h @@ -1,6 +1,6 @@ /* ******************************************************************************* -* Copyright (c) 1996-2010, International Business Machines Corporation and others. +* Copyright (c) 1996-2011, International Business Machines Corporation and others. * All Rights Reserved. ******************************************************************************* */ @@ -17,6 +17,7 @@ #include "unicode/parseerr.h" #include "unicode/uloc.h" #include "unicode/uset.h" +#include "unicode/uscript.h" /** * \file @@ -132,18 +133,68 @@ typedef enum { } UColAttributeValue; -/** Enum containing the codes for reordering segments of the collation table that are not script - * codes. These reordering codes are to be used in conjunction with the script codes. - * @internal +/** + * Enum containing the codes for reordering segments of the collation table that are not script + * codes. These reordering codes are to be used in conjunction with the script codes. 
+ * @see ucol_getReorderCodes + * @see ucol_setReorderCodes + * @see ucol_getEquivalentReorderCodes + * @draft ICU 4.8 */ -typedef enum { - UCOL_REORDER_CODE_SPACE = 0x1000, - UCOL_REORDER_CODE_FIRST = UCOL_REORDER_CODE_SPACE, - UCOL_REORDER_CODE_PUNCTUATION = 0x1001, - UCOL_REORDER_CODE_SYMBOL = 0x1002, - UCOL_REORDER_CODE_CURRENCY = 0x1003, - UCOL_REORDER_CODE_DIGIT = 0x1004, - UCOL_REORDER_CODE_LIMIT = 0x1005 + typedef enum { + /** + * A special reordering code that is used to specify the default + * reordering codes for a locale. + * @draft ICU 4.8 + */ + UCOL_REORDER_CODE_DEFAULT = -1, + /** + * A special reordering code that is used to specify no reordering codes. + * @draft ICU 4.8 + */ + UCOL_REORDER_CODE_NONE = USCRIPT_UNKNOWN, + /** + * A special reordering code that is used to specify all other codes used for + * reordering except for the codes lised as UColReorderCode values and those + * listed explicitly in a reordering. + * @draft ICU 4.8 + */ + UCOL_REORDER_CODE_OTHERS = USCRIPT_UNKNOWN, + /** + * Characters with the space property. + * @draft ICU 4.8 + */ + UCOL_REORDER_CODE_SPACE = 0x1000, + /** + * The first entry in the enumeration of reordering groups. + * @draft ICU 4.8 + */ + UCOL_REORDER_CODE_FIRST = UCOL_REORDER_CODE_SPACE, + /** + * Characters with the punctuation property. + * @draft ICU 4.8 + */ + UCOL_REORDER_CODE_PUNCTUATION = 0x1001, + /** + * Characters with the symbol property. + * @draft ICU 4.8 + */ + UCOL_REORDER_CODE_SYMBOL = 0x1002, + /** + * Characters with the currency property. + * @draft ICU 4.8 + */ + UCOL_REORDER_CODE_CURRENCY = 0x1003, + /** + * Characters with the digit property. + * @draft ICU 4.8 + */ + UCOL_REORDER_CODE_DIGIT = 0x1004, + /** + * The limit of the reorder codes. + * @draft ICU 4.8 + */ + UCOL_REORDER_CODE_LIMIT = 0x1005 } UColReorderCode; /** @@ -536,36 +587,66 @@ ucol_setStrength(UCollator *coll, UCollationStrength strength); /** - * Get the current reordering of scripts (if one has been set). 
+ * Retrieves the reordering codes for this collator. + * These reordering codes are a combination of UScript codes and UColReorderCode entries. * @param coll The UCollator to query. * @param dest The array to fill with the script ordering. - * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function will only return the length of the result without writing any of the result string (pre-flighting). - * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a failure before the function call. - * @return The length of the array of the script ordering. + * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function + * will only return the length of the result without writing any of the result string (pre-flighting). + * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a + * failure before the function call. + * @return The number of reordering codes written to the dest array. * @see ucol_setReorderCodes - * @internal + * @see ucol_getEquivalentReorderCodes + * @draft ICU 4.8 */ -U_INTERNAL int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 ucol_getReorderCodes(const UCollator* coll, int32_t* dest, int32_t destCapacity, UErrorCode *pErrorCode); /** - * Set the ordering of scripts for this collator. + * Sets the reordering codes for this collator. + * Reordering codes allow the collation ordering for groups of characters to be changed. + * The reordering codes are a combination of UScript codes and UColReorderCode entries. + * These allow the ordering of characters belonging to these groups to be changed as a group. * @param coll The UCollator to set. - * @param reorderCodes An array of script codes in the new order. + * @param reorderCodes An array of script codes in the new order. This can be NULL if the + * length is also set to 0. An empty array will clear any reordering codes on the collator. 
* @param reorderCodesLength The length of reorderCodes.
- * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a failure before the function call.
+ * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate a
+ * failure before the function call.
  * @see ucol_getReorderCodes
- * @internal
+ * @see ucol_getEquivalentReorderCodes
+ * @draft ICU 4.8
  */
-U_INTERNAL void U_EXPORT2
+U_CAPI void U_EXPORT2
 ucol_setReorderCodes(UCollator* coll,
                     const int32_t* reorderCodes,
                     int32_t reorderCodesLength,
                     UErrorCode *pErrorCode);
 
+/**
+ * Retrieves the reorder codes that are grouped with the given reorder code. Some reorder
+ * codes will be grouped and must reorder together.
+ * @param reorderCode The reorder code to determine equivalence for.
+ * @param dest The array to fill with the equivalent reordering codes.
+ * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
+ * will only return the length of the result without writing any of the result string (pre-flighting).
+ * @param pErrorCode Must be a valid pointer to an error code value, which must not indicate
+ * a failure before the function call.
+ * @return The number of reordering codes written to the dest array.
+ * @see ucol_setReorderCodes
+ * @see ucol_getReorderCodes
+ * @draft ICU 4.8
+ */
+U_CAPI int32_t U_EXPORT2
+ucol_getEquivalentReorderCodes(int32_t reorderCode,
+                    int32_t* dest,
+                    int32_t destCapacity,
+                    UErrorCode *pErrorCode);
+
 /**
  * Get the display name for a UCollator.
  * The display name is suitable for presentation to a user.
diff --git a/icu4c/source/test/cintltst/cmsccoll.c b/icu4c/source/test/cintltst/cmsccoll.c index 05a034186db..2233dda0934 100644 --- a/icu4c/source/test/cintltst/cmsccoll.c +++ b/icu4c/source/test/cintltst/cmsccoll.c @@ -5995,8 +5995,10 @@ static void TestReorderingAPI(void) int32_t reorderCodes[3] = {USCRIPT_GREEK, USCRIPT_HAN, UCOL_REORDER_CODE_PUNCTUATION}; UCollationResult collResult; int32_t retrievedReorderCodesLength; + int32_t retrievedReorderCodes[10]; UChar greekString[] = { 0x03b1 }; UChar punctuationString[] = { 0x203e }; + int loopIndex; log_verbose("Testing non-lead bytes in a sort key with and without reordering\n"); @@ -6015,6 +6017,7 @@ static void TestReorderingAPI(void) return; } + /* get the reordering */ retrievedReorderCodesLength = ucol_getReorderCodes(myCollation, NULL, 0, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { log_err_status(status, "ERROR: getting error codes should have returned U_BUFFER_OVERFLOW_ERROR : %s\n", myErrorName(status)); @@ -6025,6 +6028,22 @@ static void TestReorderingAPI(void) log_err_status(status, "ERROR: retrieved reorder codes length was %d but should have been %d\n", retrievedReorderCodesLength, LEN(reorderCodes)); return; } + /* now let's really get it */ + retrievedReorderCodesLength = ucol_getReorderCodes(myCollation, retrievedReorderCodes, LEN(retrievedReorderCodes), &status); + if (U_FAILURE(status)) { + log_err_status(status, "ERROR: getting reorder codes: %s\n", myErrorName(status)); + return; + } + if (retrievedReorderCodesLength != LEN(reorderCodes)) { + log_err_status(status, "ERROR: retrieved reorder codes length was %d but should have been %d\n", retrievedReorderCodesLength, LEN(reorderCodes)); + return; + } + for (loopIndex = 0; loopIndex < retrievedReorderCodesLength; loopIndex++) { + if (retrievedReorderCodes[loopIndex] != reorderCodes[loopIndex]) { + log_err_status(status, "ERROR: retrieved reorder code doesn't match set reorder code at index %d\n", loopIndex); + return; + } + } collResult = 
ucol_strcoll(myCollation, greekString, LEN(greekString), punctuationString, LEN(punctuationString));
     if (collResult != UCOL_LESS) {
         log_err_status(status, "ERROR: collation result should have been UCOL_LESS\n");
@@ -6038,6 +6057,7 @@ static void TestReorderingAPI(void)
         return;
     }
 
+    /* get the reordering again */
     retrievedReorderCodesLength = ucol_getReorderCodes(myCollation, NULL, 0, &status);
     if (retrievedReorderCodesLength != 0) {
         log_err_status(status, "ERROR: retrieved reorder codes length was %d but should have been %d\n", retrievedReorderCodesLength, 0);
@@ -6053,6 +6073,197 @@ static void TestReorderingAPI(void)
     ucol_close(myCollation);
 }
 
+/*
+ * Test reordering API.
+ */
+static void TestReorderingAPIWithRuleCreatedCollator(void)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    UCollator *myCollation;
+    UChar rules[90];
+    int32_t rulesReorderCodes[2] = {USCRIPT_HAN, USCRIPT_GREEK};
+    int32_t reorderCodes[3] = {USCRIPT_GREEK, USCRIPT_HAN, UCOL_REORDER_CODE_PUNCTUATION};
+    UCollationResult collResult;
+    int32_t retrievedReorderCodesLength;
+    int32_t retrievedReorderCodes[10];
+    UChar greekString[] = { 0x03b1 };
+    UChar punctuationString[] = { 0x203e };
+    UChar hanString[] = { 0x65E5, 0x672C };
+    int loopIndex;
+
+    log_verbose("Testing non-lead bytes in a sort key with and without reordering\n");
+
+    /* build collator from rules */
+    u_uastrcpy(rules, "[reorder Hani Grek]");
+    myCollation = ucol_openRules(rules, u_strlen(rules), UCOL_DEFAULT, UCOL_TERTIARY, NULL, &status);
+    if(U_FAILURE(status)) {
+        log_err_status(status, "ERROR: in creation of collator: %s\n", myErrorName(status));
+        return;
+    }
+
+    /* get the reordering */
+    retrievedReorderCodesLength = ucol_getReorderCodes(myCollation, retrievedReorderCodes, LEN(retrievedReorderCodes), &status);
+    log_verbose("retrievedReorderCodesLength = %d\n", retrievedReorderCodesLength);
+    if (U_FAILURE(status)) {
+        log_err_status(status, "ERROR: getting reorder codes: %s\n", myErrorName(status));
+        return;
+    }
+    if (retrievedReorderCodesLength != LEN(rulesReorderCodes)) {
+        log_err_status(status, "ERROR: retrieved reorder codes length was %d but should have been %d\n", retrievedReorderCodesLength, LEN(rulesReorderCodes));
+        return;
+    }
+    for (loopIndex = 0; loopIndex < retrievedReorderCodesLength; loopIndex++) {
+        if (retrievedReorderCodes[loopIndex] != rulesReorderCodes[loopIndex]) {
+            log_err_status(status, "ERROR: retrieved reorder code doesn't match set reorder code at index %d\n", loopIndex);
+            return;
+        }
+    }
+    collResult = ucol_strcoll(myCollation, greekString, LEN(greekString), hanString, LEN(hanString));
+    if (collResult != UCOL_GREATER) {
+        log_err_status(status, "ERROR: collation result should have been UCOL_GREATER\n");
+        return;
+    }
+
+
+    /* set the reordering */
+    ucol_setReorderCodes(myCollation, reorderCodes, LEN(reorderCodes), &status);
+    if (U_FAILURE(status)) {
+        log_err_status(status, "ERROR: setting reorder codes: %s\n", myErrorName(status));
+        return;
+    }
+
+    /* get the reordering */
+    retrievedReorderCodesLength = ucol_getReorderCodes(myCollation, NULL, 0, &status);
+    if (status != U_BUFFER_OVERFLOW_ERROR) {
+        log_err_status(status, "ERROR: getting error codes should have returned U_BUFFER_OVERFLOW_ERROR : %s\n", myErrorName(status));
+        return;
+    }
+    status = U_ZERO_ERROR;
+    if (retrievedReorderCodesLength != LEN(reorderCodes)) {
+        log_err_status(status, "ERROR: retrieved reorder codes length was %d but should have been %d\n", retrievedReorderCodesLength, LEN(reorderCodes));
+        return;
+    }
+    /* now let's really get it */
+    retrievedReorderCodesLength = ucol_getReorderCodes(myCollation, retrievedReorderCodes, LEN(retrievedReorderCodes), &status);
+    if (U_FAILURE(status)) {
+        log_err_status(status, "ERROR: getting reorder codes: %s\n", myErrorName(status));
+        return;
+    }
+    if (retrievedReorderCodesLength != LEN(reorderCodes)) {
+        log_err_status(status, "ERROR: retrieved reorder codes length was %d but should have been %d\n", retrievedReorderCodesLength,
LEN(reorderCodes)); + return; + } + for (loopIndex = 0; loopIndex < retrievedReorderCodesLength; loopIndex++) { + if (retrievedReorderCodes[loopIndex] != reorderCodes[loopIndex]) { + log_err_status(status, "ERROR: retrieved reorder code doesn't match set reorder code at index %d\n", loopIndex); + return; + } + } + collResult = ucol_strcoll(myCollation, greekString, LEN(greekString), punctuationString, LEN(punctuationString)); + if (collResult != UCOL_LESS) { + log_err_status(status, "ERROR: collation result should have been UCOL_LESS\n"); + return; + } + + /* clear the reordering */ + ucol_setReorderCodes(myCollation, NULL, 0, &status); + if (U_FAILURE(status)) { + log_err_status(status, "ERROR: setting reorder codes to NULL: %s\n", myErrorName(status)); + return; + } + + /* get the reordering again */ + retrievedReorderCodesLength = ucol_getReorderCodes(myCollation, NULL, 0, &status); + if (retrievedReorderCodesLength != 0) { + log_err_status(status, "ERROR: retrieved reorder codes length was %d but should have been %d\n", retrievedReorderCodesLength, 0); + return; + } + + collResult = ucol_strcoll(myCollation, greekString, LEN(greekString), punctuationString, LEN(punctuationString)); + if (collResult != UCOL_GREATER) { + log_err_status(status, "ERROR: collation result should have been UCOL_GREATER\n"); + return; + } + + ucol_close(myCollation); +} + +static int compareUScriptCodes(const void * a, const void * b) +{ + return ( *(int32_t*)a - *(int32_t*)b ); +} + +static void TestEquivalentReorderingScripts() { + UErrorCode status = U_ZERO_ERROR; + int32_t equivalentScripts[50]; + int32_t equivalentScriptsLength; + int loopIndex; + int32_t equivalentScriptsResult[] = { + USCRIPT_BOPOMOFO, + USCRIPT_LISU, + USCRIPT_LYCIAN, + USCRIPT_CARIAN, + USCRIPT_LYDIAN, + USCRIPT_YI, + USCRIPT_OLD_ITALIC, + USCRIPT_GOTHIC, + USCRIPT_DESERET, + USCRIPT_SHAVIAN, + USCRIPT_OSMANYA, + USCRIPT_LINEAR_B, + USCRIPT_CYPRIOT, + USCRIPT_OLD_SOUTH_ARABIAN, + USCRIPT_AVESTAN, + 
USCRIPT_IMPERIAL_ARAMAIC, + USCRIPT_INSCRIPTIONAL_PARTHIAN, + USCRIPT_INSCRIPTIONAL_PAHLAVI, + USCRIPT_UGARITIC, + USCRIPT_OLD_PERSIAN, + USCRIPT_CUNEIFORM, + USCRIPT_EGYPTIAN_HIEROGLYPHS + }; + + qsort(equivalentScriptsResult, LEN(equivalentScriptsResult), sizeof(int32_t), compareUScriptCodes); + + /* UScript.GOTHIC */ + equivalentScriptsLength = ucol_getEquivalentReorderCodes(USCRIPT_GOTHIC, equivalentScripts, LEN(equivalentScripts), &status); + if (U_FAILURE(status)) { + log_err_status(status, "ERROR: retrieving equivalent reorder codes: %s\n", myErrorName(status)); + return; + } + /*qsort(equivalentScripts, equivalentScriptsLength, sizeof(int32_t), compareUScriptCodes);*/ + if (equivalentScriptsLength != LEN(equivalentScriptsResult)) { + log_err_status(status, "ERROR: retrieved equivalent script length wrong: expected = %d, was = %d\n", LEN(equivalentScriptsResult), equivalentScriptsLength); + return; + } + for (loopIndex = 0; loopIndex < equivalentScriptsLength; loopIndex++) { + if (equivalentScriptsResult[loopIndex] != equivalentScripts[loopIndex]) { + log_err_status(status, "ERROR: equivalent scripts results don't match: expected = %d, was = %d\n", equivalentScriptsResult[loopIndex], equivalentScripts[loopIndex]); + return; + } + } + + /* UScript.SHAVIAN */ + equivalentScriptsLength = ucol_getEquivalentReorderCodes(USCRIPT_SHAVIAN, equivalentScripts, LEN(equivalentScripts), &status); + if (U_FAILURE(status)) { + log_err_status(status, "ERROR: retrieving equivalent reorder codes: %s\n", myErrorName(status)); + return; + } + /*qsort(equivalentScripts, equivalentScriptsLength, sizeof(int32_t), compareUScriptCodes);*/ + if (equivalentScriptsLength != LEN(equivalentScriptsResult)) { + log_err_status(status, "ERROR: retrieved equivalent script length wrong: expected = %d, was = %d\n", LEN(equivalentScriptsResult), equivalentScriptsLength); + return; + } + for (loopIndex = 0; loopIndex < equivalentScriptsLength; loopIndex++) { + if 
(equivalentScriptsResult[loopIndex] != equivalentScripts[loopIndex]) { + log_err_status(status, "ERROR: equivalent scripts results don't match: expected = %d, was = %d\n", equivalentScriptsResult[loopIndex], equivalentScripts[loopIndex]); + return; + } + } +} + + + /* * Utility function to test one collation reordering test case set. * @param testcases Array of test cases. @@ -6526,6 +6737,8 @@ void addMiscCollTest(TestNode** root) TEST(TestBeforeRuleWithScriptReordering); TEST(TestNonLeadBytesDuringCollationReordering); TEST(TestReorderingAPI); + TEST(TestReorderingAPIWithRuleCreatedCollator); + TEST(TestEquivalentReorderingScripts); TEST(TestGreekFirstReorder); TEST(TestGreekLastReorder); TEST(TestNonScriptReorder);