diff --git a/icu4c/source/common/uvector.cpp b/icu4c/source/common/uvector.cpp
index d0ecb449c9e..2a566ed21f6 100644
--- a/icu4c/source/common/uvector.cpp
+++ b/icu4c/source/common/uvector.cpp
@@ -1,6 +1,6 @@
 /*
 ******************************************************************************
-* Copyright (C) 1999-2010, International Business Machines Corporation and *
+* Copyright (C) 1999-2011, International Business Machines Corporation and *
 * others. All Rights Reserved. *
 ******************************************************************************
 * Date Name Description
@@ -549,5 +549,24 @@ void UVector::sort(USortComparator *compare, UErrorCode &ec) {
     }
 }
 
+
+/**
+ * Sort the contents of this vector with a caller-supplied comparator of
+ * type UComparator.
+ *
+ * @param compare  comparison function; it is handed pointers to the
+ *                 UHashTok slots of the vector, not the stored pointers.
+ * @param context  opaque pointer passed through unchanged to every call
+ *                 of compare.
+ * @param ec       in/out error code; nothing is done when it already
+ *                 indicates a failure on entry.
+ */
+void UVector::sortWithUComparator(UComparator *compare, const void *context, UErrorCode &ec) {
+    if (U_SUCCESS(ec)) {
+        uprv_sortArray(elements, count, sizeof(UHashTok),
+                       compare, context, FALSE, &ec);
+    }
+}
+
 U_NAMESPACE_END
 
diff --git a/icu4c/source/common/uvector.h b/icu4c/source/common/uvector.h
index 2724cb1c83b..c28ddd317c6 100644
--- a/icu4c/source/common/uvector.h
+++ b/icu4c/source/common/uvector.h
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-* Copyright (C) 1999-2009, International Business Machines
+* Copyright (C) 1999-2011, International Business Machines
 * Corporation and others. All Rights Reserved.
 **********************************************************************
 * Date Name Description
@@ -14,6 +14,7 @@
 #include "unicode/utypes.h"
 #include "unicode/uobject.h"
+#include "uarrsort.h"
 #include "uhash.h"
 
 U_NAMESPACE_BEGIN
 
@@ -259,6 +260,14 @@ public:
      */
     void sort(USortComparator *compare, UErrorCode &ec);
 
+    /**
+      * Sort the contents of this vector using a caller-supplied function
+      * of type UComparator to do the comparison. Provides more flexibility
+      * than UVector::sort() because an additional user-parameter can be passed to
+      * the comparison function.
+      */
+    void sortWithUComparator(UComparator *compare, const void *context, UErrorCode &ec);
+
     /**
      * ICU "poor man's RTTI", returns a UClassID for this class.
      */
diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in
index a23e5d9df39..a6ac3725576 100644
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@@ -1,6 +1,6 @@
 #******************************************************************************
 #
-# Copyright (C) 1998-2010, International Business Machines
+# Copyright (C) 1998-2011, International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 #******************************************************************************
@@ -84,7 +84,7 @@ zonemeta.o zstrfmt.o plurrule.o plurfmt.o selfmt.o dtitvfmt.o dtitvinf.o \
 tmunit.o tmutamt.o tmutfmt.o colldata.o bmsearch.o bms.o currpinf.o \
 uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o uspoof_wsconf.o \
 ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o locdspnm.o \
-decNumber.o decContext.o
+decNumber.o decContext.o alphaindex.o
 
 ## Header files to install
 HEADERS = $(srcdir)/unicode/*.h
diff --git a/icu4c/source/i18n/alphaindex.cpp b/icu4c/source/i18n/alphaindex.cpp
new file mode 100644
index 00000000000..c95531d0413
--- /dev/null
+++ b/icu4c/source/i18n/alphaindex.cpp
@@ -0,0 +1,1319 @@
+/*
+*******************************************************************************
+* Copyright (C) 2009-2011, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*/
+
+/**
+ * \file
+ * \brief C API: AlphabeticIndex class
+ */
+
+#include "unicode/utypes.h"
+
+#include "unicode/alphaindex.h"
+#include "unicode/coll.h"
+#include "unicode/normalizer2.h"
+#include "unicode/strenum.h"
+#include "unicode/tblcoll.h"
+#include "unicode/ulocdata.h"
+#include "unicode/uniset.h"
+#include "unicode/uobject.h"
+#include "unicode/uscript.h"
+#include "unicode/usetiter.h"
+#include "unicode/ustring.h"
+
+#include "cstring.h"
+#include "mutex.h"
+#include "uassert.h"
+#include "ucln_in.h"
+#include "uhash.h"
+#include "uvector.h"
+
+#include <string>
+#include <iostream>
+U_NAMESPACE_BEGIN
+
+UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex)
+
+// Forward Declarations
+static int32_t U_CALLCONV
+PreferenceComparator(const void *context, const void *left, const void *right);
+
+static int32_t U_CALLCONV
+sortCollateComparator(const void *context, const void *left, const void *right);
+
+static int32_t U_CALLCONV
+recordCompareFn(const void *context, const void *left, const void *right);
+
+//
+// UHash support function, delete a UnicodeSet
+// TODO: move this function into uhash.
+//
+static void U_CALLCONV
+uhash_deleteUnicodeSet(void *obj) {
+    delete static_cast<UnicodeSet *>(obj);
+}
+
+// UVector support function, delete a Bucket.
+static void U_CALLCONV
+alphaIndex_deleteBucket(void *obj) {
+    delete static_cast<AlphabeticIndex::Bucket *>(obj);
+}
+
+// UVector support function, delete a Record.
+static void U_CALLCONV
+alphaIndex_deleteRecord(void *obj) {
+    delete static_cast<AlphabeticIndex::Record *>(obj);
+}
+
+
+
+static const Normalizer2 *nfkdNormalizer;
+
+//
+// Append the contents of a UnicodeSet to a UVector of UnicodeStrings.
+// Append everything - individual characters are handled as strings of length 1.
+// The destination vector owns the appended strings.
+// Nothing is appended once status indicates a failure.
+
+static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) {
+    UnicodeSetIterator setIter(source);
+    while (U_SUCCESS(status) && setIter.next()) {
+        const UnicodeString &str = setIter.getString();
+        void *copy = str.clone();
+        if (copy == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
+        dest.addElement(copy, status);
+        if (U_FAILURE(status)) {
+            // The vector did not take the string; free it here so it is not leaked.
+            uhash_deleteUnicodeString(copy);
+        }
+    }
+}
+
+
+//
+// Construct an index for a locale: create the full-strength and primary-only
+// collators, seed the initial labels from the locale's index exemplars, and
+// load the sorted list of first characters of each script.
+// Any failure is reported through status.
+//
+AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) {
+    init(status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+    if (initialLabels_ == NULL) {
+        // init() could not allocate the label set; it must not be dereferenced below.
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    locale_ = locale;
+    langType_ = langTypeFromLocale(locale_);
+
+    collator_ = Collator::createInstance(locale, status);
+    if (collator_ != NULL) {
+        collatorPrimaryOnly_ = collator_->clone();
+    }
+    if (collatorPrimaryOnly_ != NULL) {
+        collatorPrimaryOnly_->setStrength(Collator::PRIMARY);
+    }
+    getIndexExemplars(*initialLabels_, locale, status);
+    indexBuildRequired_ = TRUE;
+    if ((collator_ == NULL || collatorPrimaryOnly_ == NULL) && U_SUCCESS(status)) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    firstScriptCharacters_ = firstStringsInScript(status);
+}
+
+
+// Release everything owned by the index. All members are either NULL or valid
+// after init(), so each delete is safe even after a failed construction.
+AlphabeticIndex::~AlphabeticIndex() {
+    uhash_close(alreadyIn_);
+    delete bucketList_;
+    delete collator_;
+    delete collatorPrimaryOnly_;
+    delete firstScriptCharacters_;
+    delete labels_;
+    delete inputRecords_;
+    delete noDistinctSorting_;
+    delete notAlphabetic_;
+    delete initialLabels_;
+}
+
+
+// Add the members of a UnicodeSet to the candidate labels.
+// The index is marked stale so that the new labels take effect on the next build.
+AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    initialLabels_->addAll(additions);
+    indexBuildRequired_ = TRUE;
+    return *this;
+}
+
+
+// Add the index exemplars of another locale to the candidate labels.
+// The index is marked stale so that the new labels take effect on the next build.
+AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    UnicodeSet additions;
+    getIndexExemplars(additions, locale, status);
+    if (U_SUCCESS(status)) {
+        initialLabels_->addAll(additions);
+        indexBuildRequired_ = TRUE;
+    }
+    return *this;
+}
+
+
+// Number of buckets in the index; builds the index first if it is stale.
+// Returns 0 on failure.
+int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
+    buildIndex(status);
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+    return bucketList_->size();
+}
+
+
+// Number of input records held by the index. Returns 0 on failure.
+int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+    return inputRecords_->size();
+}
+
+
+//
+// buildIndex()  (Re)build the label list and the buckets, and bin the records.
+//               Does nothing if the index is already up to date.
+//               On failure the index stays marked as requiring a build.
+//
+void AlphabeticIndex::buildIndex(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    if (!indexBuildRequired_) {
+        return;
+    }
+
+    // Discard any already-built data.
+    // This is important when the user builds and uses an index, then subsequently modifies it,
+    // necessitating a rebuild.
+
+    bucketList_->removeAllElements();
+    labels_->removeAllElements();
+    uhash_removeAll(alreadyIn_);
+    noDistinctSorting_->clear();
+    notAlphabetic_->clear();
+
+    // first sort the incoming Labels, with a "best" ordering among items
+    // that are the same according to the collator
+
+    UVector preferenceSorting(status);   // Vector of UnicodeStrings; owned by the vector.
+    preferenceSorting.setDeleter(uhash_deleteUnicodeString);
+    appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
+    preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    // We now make a set of Labels.
+    // Some of the input may, however, be redundant.
+    // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
+    // So we make a pass through, filtering out those cases.
+    // TODO: filtering these out would seem to be at odds with the eventual goal
+    //       of being able to split buckets that contain too many items.
+
+    UnicodeSet labelSet;
+    for (int32_t psIndex=0; psIndex<preferenceSorting.size() && U_SUCCESS(status); psIndex++) {
+        UnicodeString item = *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
+        // TODO: Since preferenceSorting was originally populated from the contents of a UnicodeSet,
+        //       is it even possible for duplicates to show up in this check?
+        if (labelSet.contains(item)) {
+            UnicodeSetIterator itemAlreadyInIter(labelSet);
+            while (itemAlreadyInIter.next()) {
+                const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
+                if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
+                    UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
+                    if (targets == NULL) {
+                        // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
+                        targets = new UnicodeSet();
+                        if (targets == NULL) {
+                            status = U_MEMORY_ALLOCATION_ERROR;
+                            break;
+                        }
+                        uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
+                        if (U_FAILURE(status)) {
+                            // targets was handed to the table; do not touch it after a failed put.
+                            break;
+                        }
+                    }
+                    targets->add(item);
+                    break;
+                }
+            }
+        } else if (item.moveIndex32(0, 1) < item.length() &&  // Label contains more than one code point.
+                   collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
+            noDistinctSorting_->add(item);
+        } else if (!ALPHABETIC->containsSome(item)) {
+            notAlphabetic_->add(item);
+        } else {
+            labelSet.add(item);
+        }
+    }
+
+    // Move the set of Labels from the set into a vector, and sort
+    // according to the collator.
+
+    appendUnicodeSetToUVector(*labels_, labelSet, status);
+    labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element.
+    // Done in place: removeElementAt() disposes of the removed string through the vector's
+    // deleter, and labels_ keeps the deleter and comparer that init() installed.
+
+    const int32_t size = labelSet.size() - 1;
+    if (size > maxLabelCount_) {
+        int32_t count = 0;       // 1-based position in the untrimmed list
+        int32_t old = -1;
+        int32_t keepIndex = 0;   // slot currently holding the label being examined
+        while (keepIndex < labels_->size()) {
+            ++count;
+            const int32_t bump = count * maxLabelCount_ / size;
+            if (bump == old) {
+                labels_->removeElementAt(keepIndex);
+            } else {
+                old = bump;
+                ++keepIndex;
+            }
+        }
+    }
+
+    // We now know the list of labels.
+    // Create a corresponding list of buckets, one per label.
+
+    buildBucketList(status);    // Corresponds to Java BucketList constructor.
+
+    // Bin the Records into the Buckets.
+    bucketRecords(status);
+    if (U_FAILURE(status)) {
+        // Leave indexBuildRequired_ set so that a later call rebuilds from scratch.
+        return;
+    }
+
+    indexBuildRequired_ = FALSE;
+    resetBucketIterator(status);
+}
+
+//
+// adoptBucket()  Append a freshly allocated bucket to a bucket vector.
+//                The bucket is deleted here if it cannot be stored, so the
+//                caller never has to clean up. A NULL bucket is reported as
+//                an allocation failure.
+//
+static void adoptBucket(UVector &list, void *bucket, UErrorCode &status) {
+    if (bucket == NULL) {
+        if (U_SUCCESS(status)) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+        return;
+    }
+    if (U_SUCCESS(status)) {
+        list.addElement(bucket, status);
+    }
+    if (U_FAILURE(status)) {
+        alphaIndex_deleteBucket(bucket);
+    }
+}
+
+//
+// buildBucketList()  Corresponds to the BucketList constructor in the Java version.
+//                    Creates the underflow bucket, one bucket per label with an
+//                    inflow bucket wherever adjacent labels are in different,
+//                    non-adjacent scripts, and finally the overflow bucket.
+//                    With no labels at all, only the underflow bucket is created
+//                    and every record ends up in it.
+
+void AlphabeticIndex::buildBucketList(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    UnicodeString labelStr = getUnderflowLabel();
+    adoptBucket(*bucketList_, new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status), status);
+    if (U_FAILURE(status) || labels_->size() == 0) {
+        return;
+    }
+
+    // Build up the list, adding underflow, additions, overflow
+    // insert infix labels as needed, using \uFFFF.
+    const UnicodeString *last = static_cast<const UnicodeString *>(labels_->elementAt(0));
+    adoptBucket(*bucketList_, new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status), status);
+
+    UnicodeSet lastSet;
+    UnicodeSet set;
+    AlphabeticIndex::getScriptSet(lastSet, *last, status);
+    lastSet.removeAll(*IGNORE_SCRIPTS);
+
+    for (int32_t i = 1; i < labels_->size() && U_SUCCESS(status); ++i) {
+        const UnicodeString *current = static_cast<const UnicodeString *>(labels_->elementAt(i));
+        getScriptSet(set, *current, status);
+        set.removeAll(*IGNORE_SCRIPTS);
+        if (lastSet.containsNone(set)) {
+            // check for adjacent
+            const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status);
+            if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) {
+                labelStr = getInflowLabel();
+                adoptBucket(*bucketList_,
+                            new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status),
+                            status);
+                // Note: the loop index must not be advanced here; doing so would
+                //       drop the label that follows every inflow bucket.
+            }
+        }
+        adoptBucket(*bucketList_, new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status), status);
+        last = current;
+        lastSet = set;
+    }
+    if (U_FAILURE(status)) {
+        return;
+    }
+    // final overflow bucket
+    const UnicodeString &limitString = getOverflowComparisonString(*last, status);
+    adoptBucket(*bucketList_, new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status), status);
+}
+
+
+//
+// Place all of the raw input records into the correct bucket.
+//
+// Begin by sorting the input records; this lets us bin them in a single pass.
+//
+// Note on storage management: The input records are owned by the
+// inputRecords_ vector, and will (eventually) be auto-deleted by it.
+// The Bucket objects have pointers to the Record objects, but do not own them.
+//
+void AlphabeticIndex::bucketRecords(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    inputRecords_->sortWithUComparator(recordCompareFn, collator_, status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+    U_ASSERT(bucketList_->size() > 0);   // Should always have at least an overflow
+                                         // bucket, even if no user labels.
+    int32_t bucketIndex = 0;
+    Bucket *destBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex));
+    if (destBucket == NULL) {
+        // No bucket to put anything into; only an error if there is something to place.
+        if (inputRecords_->size() > 0) {
+            status = U_INTERNAL_PROGRAM_ERROR;
+        }
+        return;
+    }
+    Bucket *nextBucket = NULL;
+    if (bucketIndex+1 < bucketList_->size()) {
+        nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
+    }
+    int32_t recordIndex = 0;
+    while (recordIndex < inputRecords_->size()) {
+        Record *r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
+        if (nextBucket == NULL ||
+                collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) {
+            // Record goes in current bucket.  Advance to next record,
+            // stay on current bucket.
+            destBucket->records_->addElement(r, status);
+            if (U_FAILURE(status)) {
+                return;
+            }
+            ++recordIndex;
+        } else {
+            // Advance to the next bucket, stay on current record.
+            bucketIndex++;
+            destBucket = nextBucket;
+            if (bucketIndex+1 < bucketList_->size()) {
+                nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
+            } else {
+                nextBucket = NULL;
+            }
+            U_ASSERT(destBucket != NULL);
+        }
+    }
+
+}
+
+
+//
+// getIndexExemplars()  Add the index characters for a locale to dest.
+//                      Uses the locale's explicit index exemplars when the locale
+//                      data has them; otherwise synthesizes a set from the
+//                      standard exemplars.
+//
+void AlphabeticIndex::getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
+    if (U_FAILURE(status)) {
+        return;
+    }
+    UnicodeSet exemplars;
+    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
+    if (U_SUCCESS(status)) {
+        dest.addAll(exemplars);
+        return;
+    }
+    if (status != U_MISSING_RESOURCE_ERROR) {
+        // A real failure, not merely "no index exemplars for this locale".
+        return;
+    }
+    status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR
+
+    // Locale data did not include explicit Index characters.
+    // Synthesize a set of them from the locale's standard exemplar characters.
+
+    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    // get the exemplars, and handle special cases
+    // These checks run before upper-casing: CORE_LATIN holds the lower case
+    // letters a-z, which no longer occur in the set once it has been upper-cased.
+
+    // question: should we add auxiliary exemplars?
+    if (exemplars.containsSome(*CORE_LATIN)) {
+        exemplars.addAll(*CORE_LATIN);
+    }
+    if (exemplars.containsSome(*HANGUL)) {
+        // cut down to small list
+        UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status);
+        exemplars.removeAll(BLOCK_HANGUL_SYLLABLES);
+        exemplars.addAll(*HANGUL);
+    }
+    if (exemplars.containsSome(*ETHIOPIC)) {
+        // cut down to small list
+        // make use of the fact that Ethiopic is allocated in 8's, where
+        // the base is 0 mod 8.
+        UnicodeSetIterator ethiopicIter(*ETHIOPIC);
+        while (ethiopicIter.next() && !ethiopicIter.isString()) {
+            if ((ethiopicIter.getCodepoint() & 0x7) != 0) {
+                exemplars.remove(ethiopicIter.getCodepoint());
+            }
+        }
+    }
+
+    // Upper-case any that aren't already so.
+    //   (We only do this for synthesized index characters.)
+    // The changes are collected first because the set must not be modified
+    // while it is being iterated.
+
+    UnicodeSetIterator it(exemplars);
+    UnicodeString upperC;
+    UnicodeSet lowersToRemove;
+    UnicodeSet uppersToAdd;
+    while (it.next()) {
+        const UnicodeString &exemplarC = it.getString();
+        upperC = exemplarC;
+        upperC.toUpper(locale);
+        if (exemplarC != upperC) {
+            lowersToRemove.add(exemplarC);
+            uppersToAdd.add(upperC);
+        }
+    }
+    exemplars.removeAll(lowersToRemove);
+    exemplars.addAll(uppersToAdd);
+
+    dest.addAll(exemplars);
+}
+
+
+/*
+ * Return the string with interspersed CGJs: one U+034F between each pair of
+ * adjacent code points. An empty input yields an empty result.
+ */
+static const UChar32 CGJ = (UChar)0x034F;
+UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
+    UnicodeString result;
+    if (item.length() == 0) {
+        return result;
+    }
+    int32_t i = 0;
+    for (;;) {
+        UChar32 cp = item.char32At(i);
+        result.append(cp);
+        i = item.moveIndex32(i, 1);
+        if (i >= item.length()) {
+            break;
+        }
+        result.append(CGJ);
+    }
+    return result;
+}
+
+
+// Comparison is not implemented: no index ever compares equal to another.
+UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
+    return FALSE;
+}
+
+
+// Logical negation of operator==, which always reports "not equal".
+UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
+    return TRUE;
+}
+
+
+const RuleBasedCollator &AlphabeticIndex::getCollator() const {
+    // There are no known non-RuleBasedCollator collators, and none ever expected.
+    // But, in case that changes, better a null pointer than a wrong type.
+    return *dynamic_cast<const RuleBasedCollator *>(collator_);
+}
+
+
+const UnicodeString &AlphabeticIndex::getInflowLabel() const {
+    return inflowLabel_;
+}
+
+const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
+    return overflowLabel_;
+}
+
+
+const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
+    return underflowLabel_;
+}
+
+
+// The three label setters below each mark the index as requiring a rebuild.
+AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
+    inflowLabel_ = label;
+    indexBuildRequired_ = TRUE;
+    return *this;
+}
+
+
+AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
+    overflowLabel_ = label;
+    indexBuildRequired_ = TRUE;
+    return *this;
+}
+
+
+AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
+    underflowLabel_ = label;
+    indexBuildRequired_ = TRUE;
+    return *this;
+}
+
+
+int32_t AlphabeticIndex::getMaxLabelCount() const {
+    return maxLabelCount_;
+}
+
+
+// Set the limit on the number of labels. Values <= 0 are rejected with
+// U_ILLEGAL_ARGUMENT_ERROR. Any change of the limit marks the index stale:
+// a larger limit can bring back labels that an earlier build trimmed away.
+AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    if (maxLabelCount <= 0) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    if (maxLabelCount != maxLabelCount_) {
+        maxLabelCount_ = maxLabelCount;
+        indexBuildRequired_ = TRUE;
+    }
+    return *this;
+}
+
+
+// Return the first entry of firstScriptCharacters_ that sorts after lowerLimit,
+// or the empty string if there is none.
+const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/*status*/) {
+    if (firstScriptCharacters_ != NULL) {
+        for (int32_t i=0; i<firstScriptCharacters_->size(); i++) {
+            const UnicodeString *s =
+                    static_cast<const UnicodeString *>(firstScriptCharacters_->elementAt(i));
+            if (collator_->compare(*s, lowerLimit) > 0) {
+                return *s;
+            }
+        }
+    }
+    return *EMPTY_STRING;
+}
+
+// Set dest to all characters having the same script as the first code point
+// of codePoint. dest is left untouched on failure.
+UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return &dest;
+    }
+    UChar32 cp = codePoint.char32At(0);
+    UScriptCode scriptCode = uscript_getScript(cp, &status);
+    if (U_FAILURE(status)) {
+        return &dest;
+    }
+    dest.applyIntPropertyValue(UCHAR_SCRIPT, scriptCode, status);
+    return &dest;
+}
+
+//
+// init() - Common code for constructors.
+//          Every failure, including a failed allocation, is reported through
+//          status; the object is always left in a state the destructor can handle.
+//
+
+void AlphabeticIndex::init(UErrorCode &status) {
+    // Initialize statics if needed.
+    AlphabeticIndex::staticInit(status);
+
+    // Put the object into a known state so that the destructor will function.
+
+    alreadyIn_ = NULL;
+    bucketList_ = NULL;
+    collator_ = NULL;
+    collatorPrimaryOnly_ = NULL;
+    currentBucket_ = NULL;
+    firstScriptCharacters_ = NULL;
+    initialLabels_ = NULL;
+    indexBuildRequired_ = TRUE;
+    inputRecords_ = NULL;
+    itemsIterIndex_ = 0;
+    labels_ = NULL;
+    labelsIterIndex_ = 0;
+    maxLabelCount_ = 99;
+    noDistinctSorting_ = NULL;
+    notAlphabetic_ = NULL;
+    recordCounter_ = 0;
+
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    inflowLabel_.remove();
+    inflowLabel_.append((UChar)0x2026);    // Ellipsis
+    overflowLabel_ = inflowLabel_;
+    underflowLabel_ = inflowLabel_;
+
+    alreadyIn_ = uhash_open(uhash_hashUnicodeString,    // Key Hash,
+                            uhash_compareUnicodeString, // key Comparator,
+                            NULL,                       // value Comparator
+                            &status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+    uhash_setKeyDeleter(alreadyIn_, uhash_deleteUnicodeString);
+    uhash_setValueDeleter(alreadyIn_, uhash_deleteUnicodeSet);
+
+    bucketList_ = new UVector(status);
+    labels_ = new UVector(status);
+    inputRecords_ = new UVector(status);
+    noDistinctSorting_ = new UnicodeSet();
+    notAlphabetic_ = new UnicodeSet();
+    initialLabels_ = new UnicodeSet();
+    if (bucketList_ == NULL || labels_ == NULL || inputRecords_ == NULL ||
+            noDistinctSorting_ == NULL || notAlphabetic_ == NULL || initialLabels_ == NULL) {
+        if (U_SUCCESS(status)) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+        return;
+    }
+
+    bucketList_->setDeleter(alphaIndex_deleteBucket);
+    labels_->setDeleter(uhash_deleteUnicodeString);
+    labels_->setComparer(uhash_compareUnicodeString);
+    inputRecords_->setDeleter(alphaIndex_deleteRecord);
+}
+
+
+static UBool indexCharactersAreInitialized = FALSE;
+
+// Index Characters Clean up function. Delete statically allocated constant stuff.
+// Calls AlphabeticIndex::staticCleanup(); passed to ucln_i18n_registerCleanup() by staticInit().
+U_CDECL_BEGIN
+static UBool U_CALLCONV indexCharacters_cleanup(void) {
+    AlphabeticIndex::staticCleanup();
+    return TRUE;
+}
+U_CDECL_END
+
+// Delete all of the static constants and mark them as uninitialized, so that
+// a later staticInit() creates them again.
+void AlphabeticIndex::staticCleanup() {
+    delete ALPHABETIC;
+    ALPHABETIC = NULL;
+    delete HANGUL;
+    HANGUL = NULL;
+    delete ETHIOPIC;
+    ETHIOPIC = NULL;
+    delete CORE_LATIN;
+    CORE_LATIN = NULL;
+    delete IGNORE_SCRIPTS;
+    IGNORE_SCRIPTS = NULL;
+    delete TO_TRY;
+    TO_TRY = NULL;
+    delete UNIHAN;
+    UNIHAN = NULL;
+    delete EMPTY_STRING;
+    EMPTY_STRING = NULL;
+    nfkdNormalizer = NULL;  // ref to a singleton. Do not delete.
+    indexCharactersAreInitialized = FALSE;
+}
+
+
+UnicodeSet *AlphabeticIndex::ALPHABETIC;
+UnicodeSet *AlphabeticIndex::HANGUL;
+UnicodeSet *AlphabeticIndex::ETHIOPIC;
+UnicodeSet *AlphabeticIndex::CORE_LATIN;
+UnicodeSet *AlphabeticIndex::IGNORE_SCRIPTS;
+UnicodeSet *AlphabeticIndex::TO_TRY;
+UnicodeSet *AlphabeticIndex::UNIHAN;
+const UnicodeString *AlphabeticIndex::EMPTY_STRING;
+
+//
+// staticInit()  One-time initialization of constants.
+//               Thread safe. Called from constructors.
+//               Mutex overhead is not a concern. AlphabeticIndex constructors are
+//               sufficiently heavy that the cost of the mutex check is not significant.
+//
+//               Every allocation is checked before the object is used. If anything
+//               fails, whatever was created is deleted again and status is set.
+
+void AlphabeticIndex::staticInit(UErrorCode &status) {
+    static UMTX IndexCharsInitMutex;
+
+    Mutex mutex(&IndexCharsInitMutex);
+    if (indexCharactersAreInitialized || U_FAILURE(status)) {
+        return;
+    }
+    UBool finishedInit = FALSE;
+
+    {
+        UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]");
+        ALPHABETIC = new UnicodeSet(alphaString, status);
+        if (ALPHABETIC == NULL) {
+            goto err;
+        }
+
+        HANGUL = new UnicodeSet();
+        if (HANGUL == NULL) {
+            goto err;
+        }
+        HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC).
+                add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558);
+
+
+        UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
+        ETHIOPIC = new UnicodeSet(EthiopicStr, status);
+        if (ETHIOPIC == NULL) {
+            goto err;
+        }
+
+        CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a);  // ('a', 'z');
+        if (CORE_LATIN == NULL) {
+            goto err;
+        }
+
+        UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE(
+                "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]");
+        IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status);
+        if (IGNORE_SCRIPTS == NULL) {
+            goto err;
+        }
+        IGNORE_SCRIPTS->freeze();
+
+        UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]");
+        TO_TRY = new UnicodeSet(nfcqcStr, status);
+        if (TO_TRY == NULL) {
+            goto err;
+        }
+
+        UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]");
+        UNIHAN = new UnicodeSet(unihanStr, status);
+        if (UNIHAN == NULL) {
+            goto err;
+        }
+
+        EMPTY_STRING = new UnicodeString();
+        if (EMPTY_STRING == NULL) {
+            goto err;
+        }
+
+        nfkdNormalizer = Normalizer2::getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, status);
+        if (nfkdNormalizer == NULL) {
+            goto err;
+        }
+    }
+    finishedInit = TRUE;
+
+  err:
+    // Reached on success as well. An early exit with status still OK means an
+    // allocation returned NULL.
+    if (!finishedInit && U_SUCCESS(status)) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    if (U_FAILURE(status)) {
+        indexCharacters_cleanup();
+        return;
+    }
+    ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup);
+    indexCharactersAreInitialized = TRUE;
+}
+
+
+//
+// Comparison function for UVector sorting with a collator.
+//
+//   context:     the Collator to compare with.
+//   left, right: pointers to UHashTok vector slots, each holding a pointer
+//                to a UnicodeString.
+//   NULL strings sort after all non-NULL strings.
+//
+static int32_t U_CALLCONV
+sortCollateComparator(const void *context, const void *left, const void *right) {
+    const UHashTok *leftTok = static_cast<const UHashTok *>(left);
+    const UHashTok *rightTok = static_cast<const UHashTok *>(right);
+    const UnicodeString *leftString  = static_cast<const UnicodeString *>(leftTok->pointer);
+    const UnicodeString *rightString = static_cast<const UnicodeString *>(rightTok->pointer);
+    const Collator *col = static_cast<const Collator *>(context);
+
+    if (leftString == rightString) {
+        // Catches case where both are NULL
+        return 0;
+    }
+    if (leftString == NULL) {
+        return 1;
+    }
+    if (rightString == NULL) {
+        return -1;
+    }
+    Collator::EComparisonResult r = col->compare(*leftString, *rightString);
+    return (int32_t) r;
+}
+
+//
+// Comparison function for UVector sorting of Records with a collator.
+//
+//   context:     the Collator to compare with.
+//   left, right: pointers to UHashTok vector slots, each holding a pointer
+//                to an AlphabeticIndex::Record.
+//   Records whose sorting names compare equal are ordered by serial number.
+//
+static int32_t U_CALLCONV
+recordCompareFn(const void *context, const void *left, const void *right) {
+    const UHashTok *leftTok = static_cast<const UHashTok *>(left);
+    const UHashTok *rightTok = static_cast<const UHashTok *>(right);
+    const AlphabeticIndex::Record *leftRec  = static_cast<const AlphabeticIndex::Record *>(leftTok->pointer);
+    const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightTok->pointer);
+    const Collator *col = static_cast<const Collator *>(context);
+
+    Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_);
+    if (r == Collator::EQUAL) {
+        if (leftRec->serialNumber_ < rightRec->serialNumber_) {
+            r = Collator::LESS;
+        } else if (leftRec->serialNumber_ > rightRec->serialNumber_) {
+            r = Collator::GREATER;
+        }
+    }
+    return (int32_t) r;
+}
+
+
+#if 0
+//
+// First characters in scripts.
+// Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
+// The vector is sorted according to this index's collation.
+//
+// This code is too slow to use, so for now hard code the data.
+// Hard coded implementation is follows.
+//
+UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) {
+
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+
+    UnicodeString results[USCRIPT_CODE_LIMIT];
+    UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a");
+
+    UnicodeSetIterator siter(*TO_TRY);
+    while (siter.next()) {
+        const UnicodeString &current = siter.getString();
+        Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A);
+        if (r < 0) {  // TODO fix; we only want "real" script characters, not
+                      // symbols.
+            continue;
+        }
+
+        int script = uscript_getScript(current.char32At(0), &status);
+        if (results[script].length() == 0) {
+            results[script] = current;
+        }
+        else if (ruleBasedCollator->compare(current, results[script]) < 0) {
+            results[script] = current;
+        }
+    }
+
+    UnicodeSet extras;
+    UnicodeSet expansions;
+    RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator);
+    const UCollator *uRuleBasedCollator = rbc->getUCollator();
+    ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status);
+    extras.addAll(expansions).removeAll(*TO_TRY);
+    if (extras.size() != 0) {
+        const Normalizer2 *normalizer = Normalizer2::getInstance(NULL, "nfkc", UNORM2_COMPOSE, status);
+        UnicodeSetIterator extrasIter(extras);
+        while (extrasIter.next()) {
+            const UnicodeString &current = extrasIter.getString();
+            if (!TO_TRY->containsAll(current))
+                continue;
+            if (!normalizer->isNormalized(current, status) ||
+                ruleBasedCollator->compare(current, LOWER_A) < 0) {
+                continue;
+            }
+            int script = uscript_getScript(current.char32At(0), &status);
+            if (results[script].length() == 0) {
+                results[script] = current;
+            } else if (ruleBasedCollator->compare(current, results[script]) < 0) {
+                results[script] = current;
+            }
+        }
+    }
+
+    UVector *dest = new UVector(status);
+    dest->setDeleter(uhash_deleteUnicodeString);
+    for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) {
+        if (results[i].length() > 0) {
+            dest->addElement(results[i].clone(), status);
+        }
+    }
+    dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status);
+    return dest;
+}
+#endif
+
+
+//
+// First characters in scripts.
+// Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
+// The vector is sorted according to this index's collation.
+//
+// It takes too much time to compute this from character properties, so hard code it for now.
+// Character constants copied from corresponding declaration in ICU4J.
+//
+// The table is a sequence of NUL-terminated UTF-16 strings, one per script.
+
+static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 0x61, 0, 0x03B1, 0,
+            0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
+            0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
+            0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, 0xABC0, 0, 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, 0x1B83, 0,
+            0xD802, 0xDE00, 0, 0x0E01, 0, 0x0E81, 0, 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
+            0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
+            0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
+            0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, 0xD800, 0xDE80, 0,
+            0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
+            0xD801, 0xDC80, 0, 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
+            0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, 0x4E00, 0 };
+
+//
+// Build the vector of first-in-script strings from the hard coded table and
+// sort it with collator_.
+// Returns NULL if status is a failure on entry or the vector cannot be allocated.
+// Otherwise the caller owns the returned vector, which may be only partially
+// filled when status reports a failure.
+//
+UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+    UVector *dest = new UVector(status);
+    if (dest == NULL) {
+        if (U_SUCCESS(status)) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+        return NULL;
+    }
+    dest->setDeleter(uhash_deleteUnicodeString);
+    const UChar *src  = HACK_FIRST_CHARS_IN_SCRIPTS;
+    const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]);
+    do {
+        if (U_FAILURE(status)) {
+            return dest;
+        }
+        UnicodeString *str = new UnicodeString(src, -1);
+        if (str == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return dest;
+        }
+        // Step over this string and its terminating NUL.
+        const int32_t step = str->length() + 1;
+        dest->addElement(str, status);
+        if (U_FAILURE(status)) {
+            // The vector did not take the string; free it here so it is not leaked.
+            delete str;
+            return dest;
+        }
+        src += step;
+    } while (src < limit);
+    dest->sortWithUComparator(sortCollateComparator, collator_, status);
+    return dest;
+}
+
+
+// Classify a locale for the Pinyin hack: anything other than Chinese is
+// kNormal; Chinese is kTraditional for script "Hant" or country "TW", and
+// kSimplified otherwise.
+AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) {
+    const char *lang = loc.getLanguage();
+    if (uprv_strcmp(lang, "zh") != 0) {
+        return kNormal;
+    }
+    const char *script = loc.getScript();
+    if (uprv_strcmp(script, "Hant") == 0) {
+        return kTraditional;
+    }
+    const char *country = loc.getCountry();
+    if (uprv_strcmp(country, "TW") == 0) {
+        return kTraditional;
+    }
+    return kSimplified;
+}
+
+
+//
+// Pinyin Hacks.  Direct port from Java.
+//
+
+static const UChar32 probeCharInLong = 0x28EAD;
+
+
+static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = {      // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"
+            0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
+            /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
+
+
+//    Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code.
+
+// Table used with the "short" lower-bounds array. One row per index letter, in
+// the same order as the letters of the lower-bounds array, followed by a row
+// starting with 0xFFFF that marks the end. Each row is a NUL-terminated UTF-16
+// string that hackName() compares against the name; the first row is empty.
+AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_SHORT = {
+        {(UChar)0,      (UChar)0, (UChar)0}, // A
+        {(UChar)0x516B, (UChar)0, (UChar)0}, // B
+        {(UChar)0x5693, (UChar)0, (UChar)0}, // C
+        {(UChar)0x5491, (UChar)0, (UChar)0}, // D
+        {(UChar)0x59B8, (UChar)0, (UChar)0}, // E
+        {(UChar)0x53D1, (UChar)0, (UChar)0}, // F
+        {(UChar)0x65EE, (UChar)0, (UChar)0}, // G
+        {(UChar)0x54C8, (UChar)0, (UChar)0}, // H
+        {(UChar)0x4E0C, (UChar)0, (UChar)0}, // J
+        {(UChar)0x5494, (UChar)0, (UChar)0}, // K
+        {(UChar)0x5783, (UChar)0, (UChar)0}, // L
+        {(UChar)0x5452, (UChar)0, (UChar)0}, // M
+        {(UChar)0x5514, (UChar)0, (UChar)0}, // N
+        {(UChar)0x5594, (UChar)0, (UChar)0}, // O
+        {(UChar)0x5991, (UChar)0, (UChar)0}, // P
+        {(UChar)0x4E03, (UChar)0, (UChar)0}, // Q
+        {(UChar)0x513F, (UChar)0, (UChar)0}, // R
+        {(UChar)0x4EE8, (UChar)0, (UChar)0}, // S
+        {(UChar)0x4ED6, (UChar)0, (UChar)0}, // T
+        {(UChar)0x7A75, (UChar)0, (UChar)0}, // W
+        {(UChar)0x5915, (UChar)0, (UChar)0}, // X
+        {(UChar)0x4E2B, (UChar)0, (UChar)0}, // Y
+        {(UChar)0x5E00, (UChar)0, (UChar)0}, // Z
+        {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array
+    };
+
+// Letters prepended by hackName() when the "long" table is in use; entry i
+// belongs to row i of HACK_PINYIN_LOOKUP_LONG.
+static const UChar PINYIN_LOWER_BOUNDS_LONG[] = {   // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz";
+            0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
+            /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
+
+// Table used with the "long" lower-bounds array. Same layout as the short
+// table; some rows hold a surrogate pair. The 'c' row (0xD863 0xDEAD) encodes
+// U+28EAD, the character used as probeCharInLong.
+AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_LONG = {
+        {(UChar)0,      (UChar)0,      (UChar)0}, // A
+        {(UChar)0x516B, (UChar)0,      (UChar)0}, // b
+        {(UChar)0xD863, (UChar)0xDEAD, (UChar)0}, // c
+        {(UChar)0xD844, (UChar)0xDE51, (UChar)0}, // d
+        {(UChar)0x59B8, (UChar)0,      (UChar)0}, // e
+        {(UChar)0x53D1, (UChar)0,      (UChar)0}, // f
+        {(UChar)0xD844, (UChar)0xDE45, (UChar)0}, // g
+        {(UChar)0x54C8, (UChar)0,      (UChar)0}, // h
+        {(UChar)0x4E0C, (UChar)0,      (UChar)0}, // j
+        {(UChar)0x5494, (UChar)0,      (UChar)0}, // k
+        {(UChar)0x3547, (UChar)0,      (UChar)0}, // l
+        {(UChar)0x5452, (UChar)0,      (UChar)0}, // m
+        {(UChar)0x5514, (UChar)0,      (UChar)0}, // n
+        {(UChar)0x5594, (UChar)0,      (UChar)0}, // o
+        {(UChar)0xD84F, (UChar)0xDC7A, (UChar)0}, // p
+        {(UChar)0x4E03, (UChar)0,      (UChar)0}, // q
+        {(UChar)0x513F, (UChar)0,      (UChar)0}, // r
+        {(UChar)0x4EE8, (UChar)0,      (UChar)0}, // s
+        {(UChar)0x4ED6, (UChar)0,      (UChar)0}, // t
+        {(UChar)0x7A75, (UChar)0,      (UChar)0}, // w
+        {(UChar)0x5915, (UChar)0,      (UChar)0}, // x
+        {(UChar)0x4E2B, (UChar)0,      (UChar)0}, // y
+        {(UChar)0x5E00, (UChar)0,      (UChar)0}, // z
+        {(UChar)0xFFFF, (UChar)0,      (UChar)0}, // mark end of array
+    };
+
+
+//
+// Probe the collation data, and decide which Pinyin tables should be used
+//
+// ICU can be built with a choice between two Chinese collations.
+// The hack Pinyin tables to use depend on which one is in use.
+// We can assume that any given copy of ICU will have only one of the collations available,
+// and that there is no way, in a given process, to create two alphabetic indexes using
+// different Chinese collations.  Which means the probe can be done once
+// and the results cached.
+//
+// This whole arrangement is temporary.
+//
+// Both pointers start out NULL and are set together, under the global mutex,
+// by initPinyinBounds().
+AlphabeticIndex::PinyinLookup *AlphabeticIndex::HACK_PINYIN_LOOKUP = NULL;
+const UChar *AlphabeticIndex::PINYIN_LOWER_BOUNDS = NULL;
+
+// Select the short or long Pinyin tables, once per process.
+//
+// @param col     Collator whose tailored set is probed for probeCharInLong.
+// @param status  Set to U_MEMORY_ALLOCATION_ERROR if the tailored set could not
+//                be obtained and no other error was reported.
+void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) {
+    {
+        Mutex m;
+        if (PINYIN_LOWER_BOUNDS != NULL) {
+            // Tables were already chosen by an earlier call.
+            return;
+        }
+    }
+    // The probe itself runs outside the mutex; a second thread may repeat it,
+    // but both threads compute the same answer from the same collation.
+    UnicodeSet *colSet = col->getTailoredSet(status);
+    if (U_FAILURE(status) || colSet == NULL) {
+        delete colSet;
+        if (U_SUCCESS(status)) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+        return;
+    }
+    UBool useLongTables = colSet->contains(probeCharInLong);
+    delete colSet;
+    {
+        Mutex m;
+        if (useLongTables) {
+            PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG;
+            HACK_PINYIN_LOOKUP  = &HACK_PINYIN_LOOKUP_LONG;
+        } else {
+            PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT;
+            HACK_PINYIN_LOOKUP  = &HACK_PINYIN_LOOKUP_SHORT;
+        }
+    }
+}
+
+// Pinyin Hack:
+// Modify a Chinese name by prepending a Latin letter. The modified name is used
+// when putting records (names) into buckets, to put the name under a Latin index heading.
+//
+// @param dest  Receives the (possibly prefixed) name.
+// @param name  The original name.
+// @param col   Collator used to place the name among the Pinyin table rows.
+//
+// The name is copied unchanged when the index is not for simplified Chinese,
+// when the name does not start with a character in UNIHAN, or when the Pinyin
+// tables cannot be initialized.
+
+void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) {
+
+    if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) {
+        dest = name;
+        return;
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    initPinyinBounds(col, status);
+    if (U_FAILURE(status)) {
+        dest = name;
+        return;
+    }
+    // Find the last table row that does not sort after the name.
+    // TODO: use binary search
+    int index;
+    for (index=0; ; index++) {
+        if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) {
+            // Ran off the end of the table: the name belongs to the last row.
+            index--;
+            break;
+        }
+        int32_t compareResult = col->compare(name, (*HACK_PINYIN_LOOKUP)[index]);
+        if (compareResult < 0) {
+            // The name sorts before this row, so it belongs to the previous one.
+            index--;
+        }
+        if (compareResult <= 0) {
+            break;
+        }
+    }
+    if (index < 0) {
+        // The name sorted before the very first row. There is no prefix for that
+        // case, and PINYIN_LOWER_BOUNDS[-1] must not be read; leave the name as is.
+        dest = name;
+        return;
+    }
+    UChar c = PINYIN_LOWER_BOUNDS[index];
+    dest.setTo(c);
+    dest.append(name);
+    return;
+}
+
+
+
+/**
+ * Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is
+ * better, and otherwise binary order is better.
+ *
+ * For use with array sort or UVector.
+ * @param context A UErrorCode pointer.
+ * @param left A UHashTok pointer, which must refer to a UnicodeString *
+ * @param right A UHashTok pointer, which must refer to a UnicodeString *
+ * @return negative, zero or positive, as left is better than, equal to or worse than right.
+ */
+
+static int32_t U_CALLCONV
+PreferenceComparator(const void *context, const void *left, const void *right) {
+    const UHashTok *leftTok  = static_cast<const UHashTok *>(left);
+    const UHashTok *rightTok = static_cast<const UHashTok *>(right);
+    const UnicodeString *s1  = static_cast<const UnicodeString *>(leftTok->pointer);
+    const UnicodeString *s2  = static_cast<const UnicodeString *>(rightTok->pointer);
+    UErrorCode &status = *(UErrorCode *)(context);   // Cast off both static and const.
+    if (s1 == s2) {
+        return 0;
+    }
+
+    UnicodeString n1 = nfkdNormalizer->normalize(*s1, status);
+    UnicodeString n2 = nfkdNormalizer->normalize(*s2, status);
+    int32_t result = n1.length() - n2.length();
+    if (result != 0) {
+        return result;
+    }
+
+    result = n1.compareCodePointOrder(n2);
+    if (result != 0) {
+        return result;
+    }
+    return s1->compareCodePointOrder(*s2);
+}
+
+
+//
+// Constructor & Destructor for AlphabeticIndex::Record
+//
+// Records are internal only, instances are not directly surfaced in the public API.
+// This class is mostly struct-like, with all public fields.
+
+// Build a Record for the given index. The sorting name is computed up front
+// (it differs from the display name only for the Pinyin hack), and the serial
+// number is taken from the owning index's record counter.
+AlphabeticIndex::Record::Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data):
+    alphaIndex_(alphaIndex), name_(name), data_(data)
+{
+    alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_);
+    serialNumber_ = ++alphaIndex->recordCounter_;
+}
+
+AlphabeticIndex::Record::~Record() {
+}
+
+
+// Add a (name, data) record to the list of input records and mark the index
+// as needing a rebuild. On failure nothing is added and status is set.
+AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    Record *r = new Record(this, name, data);
+    if (r == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return *this;
+    }
+    inputRecords_->addElement(r, status);
+    if (U_FAILURE(status)) {
+        // NOTE(review): assumes UVector::addElement() leaves ownership with the
+        // caller when it fails - confirm against uvector.cpp.
+        delete r;
+        return *this;
+    }
+    indexBuildRequired_ = TRUE;
+    //std::string ss;
+    //std::string ss2;
+    //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
+    //             " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
+    return *this;
+}
+
+
+// Remove every input record and mark the index as needing a rebuild.
+AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    inputRecords_->removeAllElements();
+    indexBuildRequired_ = TRUE;
+    return *this;
+}
+
+
+// Return the zero-based number of the bucket that would hold the given name.
+// Builds the index first if required; returns 0 on failure.
+int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
+    buildIndex(status);
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+
+    // For simplified Chinese prepend a prefix to the name.
+    // For non-Chinese locales or non-Chinese names, the name is not modified.
+
+    UnicodeString prefixedName;
+    hackName(prefixedName, name, collatorPrimaryOnly_);
+
+    // TODO: use a binary search.
+    for (int32_t i = 0; i < bucketList_->size(); ++i) {
+        Bucket *bucket = static_cast<Bucket *>(bucketList_->elementAt(i));
+        Collator::EComparisonResult comp = collatorPrimaryOnly_->compare(prefixedName, bucket->lowerBoundary_);
+        if (comp < 0) {
+            return i - 1;
+        }
+    }
+    // Loop runs until we find the bucket following the one that would hold prefixedName.
+    // If the prefixedName belongs in the last bucket the loop will drop out the bottom rather
+    // than returning from the middle.
+
+    return bucketList_->size() - 1;
+}
+
+
+// Return the position of the bucket iteration (-1 after resetBucketIterator()).
+int32_t AlphabeticIndex::getBucketIndex() const {
+    return labelsIterIndex_;
+}
+
+
+// Advance the bucket iteration. Returns FALSE at the end of the list or on error.
+// Reports U_ENUM_OUT_OF_SYNC_ERROR if the index was modified during an iteration.
+UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
+    if (indexBuildRequired_ && currentBucket_ != NULL) {
+        status = U_ENUM_OUT_OF_SYNC_ERROR;
+        return FALSE;
+    }
+    buildIndex(status);
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
+    ++labelsIterIndex_;
+    if (labelsIterIndex_ >= bucketList_->size()) {
+        // Pin the position at "one past the end" so repeated calls do not overflow.
+        labelsIterIndex_ = bucketList_->size();
+        return FALSE;
+    }
+    currentBucket_ = static_cast<Bucket *>(bucketList_->elementAt(labelsIterIndex_));
+    resetRecordIterator();
+    return TRUE;
+}
+
+// Label of the current bucket, or the empty string when there is no current bucket.
+const UnicodeString &AlphabeticIndex::getBucketLabel() const {
+    if (currentBucket_ != NULL) {
+        return currentBucket_->label_;
+    } else {
+        return *EMPTY_STRING;
+    }
+}
+
+
+// Label type of the current bucket, or U_ALPHAINDEX_NORMAL when there is none.
+UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
+    if (currentBucket_ != NULL) {
+        return currentBucket_->labelType_;
+    } else {
+        return U_ALPHAINDEX_NORMAL;
+    }
+}
+
+
+// Number of records in the current bucket, or 0 when there is no current bucket.
+int32_t AlphabeticIndex::getBucketRecordCount() const {
+    if (currentBucket_ != NULL) {
+        return currentBucket_->records_->size();
+    } else {
+        return 0;
+    }
+}
+
+
+// Restart the bucket iteration; the next nextBucket() call moves to the first bucket.
+AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return *this;
+    }
+    buildIndex(status);
+    labelsIterIndex_ = -1;
+    currentBucket_ = NULL;
+    return *this;
+}
+
+
+// Advance the record iteration within the current bucket.
+// Returns FALSE at the end of the bucket or on error.
+UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
+    if (U_FAILURE(status)) {
+        return FALSE;
+    }
+    if (currentBucket_ == NULL) {
+        // We are trying to iterate over the items in a bucket, but there is no
+        // current bucket from the enumeration of buckets.
+        status = U_INVALID_STATE_ERROR;
+        return FALSE;
+    }
+    if (indexBuildRequired_) {
+        status = U_ENUM_OUT_OF_SYNC_ERROR;
+        return FALSE;
+    }
+    ++itemsIterIndex_;
+    if (itemsIterIndex_ >= currentBucket_->records_->size()) {
+        // Pin the position at "one past the end" so repeated calls do not overflow.
+        itemsIterIndex_ = currentBucket_->records_->size();
+        return FALSE;
+    }
+    return TRUE;
+}
+
+
+// Name of the current record, or the empty string when the record iteration
+// is not positioned on a record.
+const UnicodeString &AlphabeticIndex::getRecordName() const {
+    const UnicodeString *retStr = EMPTY_STRING;
+    if (currentBucket_ != NULL &&
+            itemsIterIndex_ >= 0 &&
+            itemsIterIndex_ < currentBucket_->records_->size()) {
+        Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
+        retStr = &item->name_;
+    }
+    return *retStr;
+}
+
+// Data pointer of the current record, or NULL when the record iteration
+// is not positioned on a record.
+const void *AlphabeticIndex::getRecordData() const {
+    const void *retPtr = NULL;
+    if (currentBucket_ != NULL &&
+            itemsIterIndex_ >= 0 &&
+            itemsIterIndex_ < currentBucket_->records_->size()) {
+        Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
+        retPtr = item->data_;
+    }
+    return retPtr;
+}
+
+
+// Position the record iteration before the first record of the current bucket.
+AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
+    itemsIterIndex_ = -1;
+    return *this;
+}
+
+
+
+// Construct a Bucket, copying the label and boundary strings and allocating
+// an empty record list. Nothing is allocated if status already indicates failure.
+AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
+                                const UnicodeString &lowerBoundary,
+                                UAlphabeticIndexLabelType type,
+                                UErrorCode &status):
+         label_(label), lowerBoundary_(lowerBoundary), labelType_(type), records_(NULL) {
+    if (U_FAILURE(status)) {
+        return;
+    }
+    records_ = new UVector(status);
+    if (records_ == NULL && U_SUCCESS(status)) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+}
+
+
+// Deletes only the record list itself.
+AlphabeticIndex::Bucket::~Bucket() {
+    delete records_;
+}
+
+U_NAMESPACE_END
diff --git a/icu4c/source/i18n/ucln_in.h b/icu4c/source/i18n/ucln_in.h
index 82de8389bc4..b128bebeb4c 100644
--- a/icu4c/source/i18n/ucln_in.h
+++ b/icu4c/source/i18n/ucln_in.h
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 * *
-* Copyright (C) 2001-2009, International Business Machines *
+* Copyright (C) 2001-2011, International Business
Machines * * Corporation and others. All Rights Reserved. * * * ****************************************************************************** @@ -46,6 +46,7 @@ typedef enum ECleanupI18NType { UCLN_I18N_UCOL_BLD, UCLN_I18N_CSDET, UCLN_I18N_COLL_DATA, + UCLN_I18N_INDEX_CHARACTERS, UCLN_I18N_COUNT /* This must be last */ } ECleanupI18NType; diff --git a/icu4c/source/i18n/ulocdata.c b/icu4c/source/i18n/ulocdata.c index 20d02462987..a864c3aa395 100644 --- a/icu4c/source/i18n/ulocdata.c +++ b/icu4c/source/i18n/ulocdata.c @@ -1,7 +1,7 @@ /* ****************************************************************************** * * -* Copyright (C) 2003-2009, International Business Machines * +* Copyright (C) 2003-2011, International Business Machines * * Corporation and others. All Rights Reserved. * * * ****************************************************************************** @@ -100,7 +100,9 @@ U_CAPI USet* U_EXPORT2 ulocdata_getExemplarSet(ULocaleData *uld, USet *fillIn, uint32_t options, ULocaleDataExemplarSetType extype, UErrorCode *status){ - static const char* const exemplarSetTypes[] = { "ExemplarCharacters", "AuxExemplarCharacters" }; + static const char* const exemplarSetTypes[] = { "ExemplarCharacters", + "AuxExemplarCharacters", + "ExemplarCharactersIndex"}; const UChar *exemplarChars = NULL; int32_t len = 0; UErrorCode localStatus = U_ZERO_ERROR; diff --git a/icu4c/source/i18n/unicode/alphaindex.h b/icu4c/source/i18n/unicode/alphaindex.h new file mode 100644 index 00000000000..f9dff87ade2 --- /dev/null +++ b/icu4c/source/i18n/unicode/alphaindex.h @@ -0,0 +1,717 @@ +/* +******************************************************************************* +* +* Copyright (C) 2011 International Business Machines +* Corporation and others. All Rights Reserved. 
+*
+*******************************************************************************
+*/
+
+#ifndef INDEXCHARS_H
+#define INDEXCHARS_H
+
+#include "unicode/utypes.h"
+#include "unicode/uobject.h"
+#include "unicode/locid.h"
+
+/**
+ * \file
+ * \brief C++ API: Index Characters
+ */
+
+
+U_CDECL_BEGIN
+
+/**
+ * Constants for Alphabetic Index Label Types.
+ * The form of these enum constants anticipates having a plain C API
+ * for Alphabetic Indexes that will also use them.
+ * @draft ICU 4.8
+ */
+typedef enum UAlphabeticIndexLabelType {
+    /**
+     * Normal Label, typically the starting letter of the names
+     * in the bucket with this label.
+     * @draft ICU 4.8
+     */
+    U_ALPHAINDEX_NORMAL = 0,
+
+    /**
+     * Underflow Label. The bucket with this label contains names
+     * in scripts that sort before any of the bucket labels in this index.
+     * @draft ICU 4.8
+     */
+    U_ALPHAINDEX_UNDERFLOW = 1,
+
+    /**
+     * Inflow Label. The bucket with this label contains names
+     * in scripts that sort between two of the bucket labels in this index.
+     * Inflow labels are created when an index contains normal labels for
+     * multiple scripts, and skips other scripts that sort between some of the
+     * included scripts.
+     * @draft ICU 4.8
+     */
+    U_ALPHAINDEX_INFLOW = 2,
+
+    /**
+     * Overflow Label. The bucket with this label contains names in scripts
+     * that sort after all of the bucket labels in this index.
+     * @draft ICU 4.8
+     */
+    U_ALPHAINDEX_OVERFLOW = 3
+ } UAlphabeticIndexLabelType;
+
+
+struct UHashtable;
+U_CDECL_END
+
+U_NAMESPACE_BEGIN
+
+// Forward Declarations
+
+class Collator;
+class RuleBasedCollator;
+class StringEnumeration;
+class UnicodeSet;
+class UVector;
+
+
+
+/**
+ * class AlphabeticIndex supports the creation of a UI index appropriate for a given language, such as:
+ *
+ *
+ *  … A B C D E F G H I J K L M N O P Q R S T U V W X Y Z Æ Ø Å …
+ *
+ *  A
+ *     Addison
+ *     Albertson
+ *     Azensky
+ *  B
+ *     Baker
+ *  ...
+ * 
+ * + * The class can generate a list of labels for use as a UI "index", that is, a list of + * clickable characters (or character sequences) that allow the user to see a segment + * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in + * the target list, where everything in the bucket is greater than or equal to the character + * (according to the locale's collation). Strings can be added to the index; + * they will be in sorted order in the right bucket. + *

+ * The class also supports having buckets for strings before the first (underflow), + * after the last (overflow), and between scripts (inflow). For example, if the index + * is constructed with labels for Russian and English, Greek characters would fall + * into an inflow bucket between the other two scripts. + *

+ * The AlphabeticIndex class is not intended for public subclassing. + *

+ * Example + *

+ * The "show..." methods below are just to illustrate usage. + * + *

+ * // Create a simple index.  "Item" is assumed to be an application
+ * // defined type that the application's UI and other processing knows about,
+ * //  and that has a name.
+ *
+ * UErrorCode status = U_ZERO_ERROR;
+ * AlphabeticIndex *index = new AlphabeticIndex(desiredLocale, status);
+ * index->addLabels(additionalLocale, status);
+ * for (Item *item in some source of Items ) {
+ *     index->addRecord(item->name(), item, status);
+ * }
+ * ...
+ * // Show index at top. We could skip or gray out empty buckets
+ *
+ * while (index->nextBucket(status)) {
+ *     if (showAll || index->getBucketRecordCount() != 0) {
+ *         showLabelAtTop(UI, index->getBucketLabel());
+ *     }
+ * }
+ *  ...
+ * // Show the buckets with their contents, skipping empty buckets
+ *
+ * index->resetBucketIterator(status);
+ * while (index->nextBucket(status)) {
+ *    if (index->getBucketRecordCount() != 0) {
+ *        showLabelInList(UI, index->getBucketLabel());
+ *        while (index->nextRecord(status)) {
+ *            showIndexedItem(UI, static_cast<Item *>(index->getRecordData()));
+ *        }
+ *    }
+ * }
+ * 
+ * + * The caller can build different UIs using this class. + * For example, an index character could be omitted or grayed-out + * if its bucket is empty. Small buckets could also be combined based on size, such as: + * + *
+ * … A-F G-N O-Z …
+ * 
+ * + *

+ * Notes: + *

    + *
  • Additional collation parameters can be passed in as part of the locale name.
+ * For example, German plus numeric
+ * sorting would be "de@kn-true".
+ *
+ * @draft ICU 4.8
+ * @provisional This API might change or be removed in a future release.
+ */
+
+
+class U_I18N_API AlphabeticIndex: public UObject {
+
+  public:
+
+    /**
+     * Construct an AlphabeticIndex object for the specified locale. If the locale's
+     * data does not include index characters, a set of them will be
+     * synthesized based on the locale's exemplar characters. The locale
+     * determines the sorting order for both the index characters and the
+     * user item names appearing under each Index character.
+     *
+     * @param locale the desired locale.
+     * @param status Error code, will be set with the reason if the construction
+     * of the AlphabeticIndex object fails.
+     * @draft ICU 4.8
+     */
+    AlphabeticIndex(const Locale &locale, UErrorCode &status);
+
+
+
+    /**
+     * Add Labels to this Index. The labels are additions to those
+     * that are already in the index; they do not replace the existing
+     * ones.
+     * @param additions The additional characters to add to the index, such as A-Z.
+     * @param status Error code, will be set with the reason if the
+     * operation fails.
+     * @return this, for chaining
+     * @draft ICU 4.8
+     */
+    virtual AlphabeticIndex &addLabels(const UnicodeSet &additions, UErrorCode &status);
+
+    /**
+     * Add the index characters from a Locale to the index. The labels
+     * are added to those that are already in the index; they do not replace the
+     * existing index characters. The collation order for this index is not
+     * changed; it remains that of the locale that was originally specified
+     * when creating this Index.
+     *
+     * @param locale The locale whose index characters are to be added.
+     * @param status Error code, will be set with the reason if the
+     * operation fails.
+     * @return this, for chaining
+     * @draft ICU 4.8
+     */
+    virtual AlphabeticIndex &addLabels(const Locale &locale, UErrorCode &status);
+
+    /**
+     * Destructor
+     */
+    virtual ~AlphabeticIndex();
+
+
+    /**
+     * Get the Collator that establishes the ordering of the items in this index.
+     * Ownership of the collator remains with the AlphabeticIndex instance.
+     *
+     * The returned collator is a reference to the internal collator used by this
+     * index. It may be safely used to compare the names of items or to get
+     * sort keys for names. However if any settings need to be changed,
+     * or other non-const methods called, a cloned copy must be made first.
+     *
+     * @return The collator
+     * @draft ICU 4.8
+     */
+    virtual const RuleBasedCollator &getCollator() const;
+
+
+    /**
+     * Get the default label used for abbreviated buckets between other index characters.
+     * For example, consider the labels when Latin and Greek are used:
+     * X Y Z ... Α Β Γ.
+     *
+     * @return inflow label
+     * @draft ICU 4.8
+     */
+    virtual const UnicodeString &getInflowLabel() const;
+
+    /**
+     * Set the default label used for abbreviated buckets between other index characters.
+     * An inflow label will be automatically inserted if two otherwise-adjacent label characters
+     * are from different scripts, e.g. Latin and Cyrillic, and a third script, e.g. Greek,
+     * sorts between the two. The default inflow character is an ellipsis (...)
+     *
+     * @param inflowLabel the new Inflow label.
+     * @param status Error code, will be set with the reason if the operation fails.
+     * @return this
+     * @draft ICU 4.8
+     */
+    virtual AlphabeticIndex &setInflowLabel(const UnicodeString &inflowLabel, UErrorCode &status);
+
+
+
+    /**
+     * Get the special label used for items that sort after the last normal label,
+     * and that would not otherwise have an appropriate label.
+     *
+     * @return the overflow label
+     * @draft ICU 4.8
+     */
+    virtual const UnicodeString &getOverflowLabel() const;
+
+
+    /**
+     * Set the label used for items that sort after the last normal label,
+     * and that would not otherwise have an appropriate label.
+     *
+     * @param overflowLabel the new overflow label.
+     * @param status Error code, will be set with the reason if the operation fails.
+     * @return this
+     * @draft ICU 4.8
+     */
+    virtual AlphabeticIndex &setOverflowLabel(const UnicodeString &overflowLabel, UErrorCode &status);
+
+    /**
+     * Get the special label used for items that sort before the first normal label,
+     * and that would not otherwise have an appropriate label.
+     *
+     * @return underflow label
+     * @draft ICU 4.8
+     */
+    virtual const UnicodeString &getUnderflowLabel() const;
+
+    /**
+     * Set the label used for items that sort before the first normal label,
+     * and that would not otherwise have an appropriate label.
+     *
+     * @param underflowLabel the new underflow label.
+     * @param status Error code, will be set with the reason if the operation fails.
+     * @return this
+     * @draft ICU 4.8
+     */
+    virtual AlphabeticIndex &setUnderflowLabel(const UnicodeString &underflowLabel, UErrorCode &status);
+
+
+    /**
+     * Get the limit on the number of labels permitted in the index.
+     * The number does not include over, under and inflow labels.
+     *
+     * @return maxLabelCount maximum number of labels.
+     * @draft ICU 4.8
+     */
+    virtual int32_t getMaxLabelCount() const;
+
+    /**
+     * Set a limit on the number of labels permitted in the index.
+     * The number does not include over, under and inflow labels.
+     * Currently, if the number is exceeded, then every
+     * nth item is removed to bring the count down.
+     * A more sophisticated mechanism may be available in the future.
+     *
+     * @param maxLabelCount the maximum number of labels.
+     * @param status Error code, will be set with the reason if the operation fails.
+     * @return This, for chaining
+     * @draft ICU 4.8
+     */
+    virtual AlphabeticIndex &setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status);
+
+
+    /**
+     * Get the Unicode character (or tailored string) that defines an overflow bucket;
+     * that is anything greater than or equal to that string should go in that bucket,
+     * instead of with the last character. Normally that is the first character of the script
+     * after lowerLimit. Thus in X Y Z ... Devanagari-ka, the overflow character for Z
+     * would be the Greek-alpha.
+     *
+     * @param lowerLimit The character below the overflow (or inflow) bucket
+     * @param status Error code, will be set with the reason if the operation fails.
+     * @return string that defines top of the overflow bucket for lowerLimit, or an empty string if there is none
+     * @internal
+     */
+    virtual const UnicodeString &getOverflowComparisonString(const UnicodeString &lowerLimit,
+                                                             UErrorCode &status);
+
+
+    /**
+     * Add a record to the index. Each record will be associated with an index Bucket
+     * based on the record's name. The list of records for each bucket will be sorted
+     * based on the collation ordering of the names in the index's locale.
+     * Records with duplicate names are permitted; they will be kept in the order
+     * that they were added.
+     *
+     * @param name The display name for the Record. The Record will be placed in
+     * a bucket based on this name.
+     * @param data An optional pointer to user data associated with this
+     * item. When iterating the contents of a bucket, both the
+     * data pointer and the name will be available for each Record.
+     * @param status Error code, will be set with the reason if the operation fails.
+     * @return This, for chaining.
+     * @draft ICU 4.8
+     */
+    virtual AlphabeticIndex &addRecord(const UnicodeString &name, const void *data, UErrorCode &status);
+
+    /**
+     * Remove all Records from the Index. The set of Buckets, which define the headings under
+     * which records are classified, is not altered.
+     *
+     * @param status Error code, will be set with the reason if the operation fails.
+     * @return This, for chaining.
+     * @draft ICU 4.8
+     */
+    virtual AlphabeticIndex &clearRecords(UErrorCode &status);
+
+
+    /** Get the number of labels in this index.
+     * Note: may trigger lazy index construction.
+     *
+     * @param status Error code, will be set with the reason if the operation fails.
+     * @return The number of labels in this index, including any under, over or
+     * in-flow labels.
+     * @draft ICU 4.8
+     */
+    virtual int32_t getBucketCount(UErrorCode &status);
+
+
+    /** Get the total number of Records in this index, that is, the number
+     * of pairs added.
+     *
+     * @param status Error code, will be set with the reason if the operation fails.
+     * @return The number of records in this index, that is, the total number
+     * of (name, data) items added with addRecord().
+     * @draft ICU 4.8
+     */
+    virtual int32_t getRecordCount(UErrorCode &status);
+
+
+
+    /**
+     * Given the name of a record, return the zero-based index of the Bucket
+     * in which the item should appear. The name need not be in the index.
+     * A Record will not be added to the index by this function.
+     * Bucket numbers are zero-based, in Bucket iteration order.
+     *
+     * @param itemName The name whose bucket position in the index is to be determined.
+     * @param status Error code, will be set with the reason if the operation fails.
+     * @return The bucket number for this name.
+     * @draft ICU 4.8
+     *
+     */
+    virtual int32_t getBucketIndex(const UnicodeString &itemName, UErrorCode &status);
+
+
+    /**
+     * Get the zero based index of the current Bucket from an iteration
+     * over the Buckets of this index. Return -1 if no iteration is in process.
+     * @return the index of the current Bucket
+     * @draft ICU 4.8
+     */
+    virtual int32_t getBucketIndex() const;
+
+
+    /**
+     * Advance the iteration over the Buckets of this index. Return FALSE if
+     * there are no more Buckets.
+     *
+     * @param status Error code, will be set with the reason if the operation fails.
+     * U_ENUM_OUT_OF_SYNC_ERROR will be reported if the index is modified while
+     * an enumeration of its contents are in process.
+     *
+     * @return TRUE if success, FALSE if at end of iteration
+     * @draft ICU 4.8
+     */
+    virtual UBool nextBucket(UErrorCode &status);
+
+    /**
+     * Return the name of the Label of the current bucket from an iteration over the buckets.
+     * If the iteration is before the first Bucket (nextBucket() has not been called),
+     * or after the last, return an empty string.
+     *
+     * @return the bucket label.
+     * @draft ICU 4.8
+     */
+    virtual const UnicodeString &getBucketLabel() const;
+
+    /**
+     * Return the type of the label for the current Bucket (selected by the
+     * iteration over Buckets.)
+     *
+     * @return the label type.
+     * @draft ICU 4.8
+     */
+    virtual UAlphabeticIndexLabelType getBucketLabelType() const;
+
+    /**
+     * Get the number of Records in the current Bucket.
+     * If the current bucket iteration position is before the first label or after the
+     * last, return 0.
+     *
+     * @return the number of Records.
+     * @draft ICU 4.8
+     */
+    virtual int32_t getBucketRecordCount() const;
+
+
+    /**
+     * Reset the Bucket iteration for this index. The next call to nextBucket()
+     * will restart the iteration at the first label.
+     *
+     * @param status Error code, will be set with the reason if the operation fails.
+     * @return this, for chaining.
+     * @draft ICU 4.8
+     */
+    virtual AlphabeticIndex &resetBucketIterator(UErrorCode &status);
+
+    /**
+     * Advance to the next record in the current Bucket.
+     * When nextBucket() is called, Record iteration is reset to just before the
+     * first Record in the new Bucket.
+     *
+     * @param status Error code, will be set with the reason if the operation fails.
+     * U_ENUM_OUT_OF_SYNC_ERROR will be reported if the index is modified while
+     * an enumeration of its contents are in process.
+     * @return TRUE if successful, FALSE when the iteration advances past the last item.
+     * @draft ICU 4.8
+     */
+    virtual UBool nextRecord(UErrorCode &status);
+
+    /**
+     * Get the name of the current Record.
+     * Return an empty string if the Record iteration position is before first
+     * or after the last.
+     *
+     * @return The name of the current index item.
+     * @draft ICU 4.8
+     */
+    virtual const UnicodeString &getRecordName() const;
+
+
+    /**
+     * Return the data pointer of the Record currently being iterated over.
+     * Return NULL if the current iteration position before the first item in this Bucket,
+     * or after the last.
+     *
+     * @return The current Record's data pointer.
+     * @draft ICU 4.8
+     */
+    virtual const void *getRecordData() const;
+
+
+    /**
+     * Reset the Record iterator position to before the first Record in the current Bucket.
+     *
+     * @return This, for chaining.
+     * @draft ICU 4.8
+     */
+    virtual AlphabeticIndex &resetRecordIterator();
+
+private:
+    // No ICU "poor man's RTTI" for this class nor its subclasses.
+    virtual UClassID getDynamicClassID() const;
+
+    /**
+     * No Copy constructor.
+     * @internal
+     */
+    AlphabeticIndex(const AlphabeticIndex &other);
+
+    /**
+     * No assignment.
+     */
+    AlphabeticIndex &operator =(const AlphabeticIndex & /*other*/) { return *this;};
+
+    /**
+     * No Equality operators.
+     * @internal
+     */
+    virtual UBool operator==(const AlphabeticIndex& other) const;
+
+    /**
+     * Inequality operator.
+     * @internal
+     */
+    virtual UBool operator!=(const AlphabeticIndex& other) const;
+
+    // Common initialization, for use from all constructors.
+    void init(UErrorCode &status);
+
+    // Initialize & destruct static constants used by this class.
+    static void staticInit(UErrorCode &status);
+
+    // Pinyin stuff. If the input name is Chinese, add the Pinyin prefix to the dest string.
+    void hackName(UnicodeString &dest, const UnicodeString &name, const Collator *coll);
+    void initPinyinBounds(const Collator *coll, UErrorCode &status);
+
+  public:
+    /**
+     * Delete all shared (static) data associated with an AlphabeticIndex.
+     * Internal function, not intended for direct use.
+     * @internal.
+     */
+    static void staticCleanup();
+  private:
+
+    // Add index characters from the specified locale to the dest set.
+    // Does not remove any previous contents from dest.
+    static void getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status);
+
+    UVector *firstStringsInScript(UErrorCode &status);
+
+    static UnicodeString separated(const UnicodeString &item);
+
+    static UnicodeSet *getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status);
+
+    void buildIndex(UErrorCode &status);
+    void buildBucketList(UErrorCode &status);
+    void bucketRecords(UErrorCode &status);
+
+
+  public:
+
+    // The following internal items are declared public only to allow access from
+    // implementation code written in plain C. They are not intended for
+    // public use.
+
+    /**
+     * A record, or item, in the index.
+     * @internal
+     */
+    struct Record: public UMemory {
+        AlphabeticIndex *alphaIndex_;
+        const UnicodeString name_;
+        UnicodeString sortingName_; // Usually the same as name_; different for Pinyin.
+        const void *data_;
+        int32_t serialNumber_; // Defines sorting order for names that compare equal.
+        Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data);
+        ~Record();
+    };
+
+    /**
+     * Holds all user records before they are distributed into buckets.
+     * Type of contents is (Record *)
+     * @internal
+     */
+    UVector *inputRecords_;
+
+    /**
+     * A Bucket holds an index label and references to everything belonging to that label.
+     * For implementation use only. Declared public because pure C implementation code needs access.
+     * @internal
+     */
+    struct Bucket: public UMemory {
+        UnicodeString label_;
+        UnicodeString lowerBoundary_;
+        UAlphabeticIndexLabelType labelType_;
+        UVector *records_; // Records are owned by inputRecords_ vector.
+
+        Bucket(const UnicodeString &label, // Parameter strings are copied.
+               const UnicodeString &lowerBoundary,
+               UAlphabeticIndexLabelType type, UErrorCode &status);
+        ~Bucket();
+    };
+
+  public:
+
+    /**
+     * Language Types. For internal ICU use only.
+     * @internal
+     */
+    enum ELangType {
+        /** @internal */
+        kNormal,
+        /** @internal */
+        kSimplified,
+        /** @internal */
+        kTraditional
+    };
+
+    /**
+     * Get the Language Type for this Index. Based on the locale.
+     * @internal
+     */
+    static ELangType langTypeFromLocale(const Locale &loc);
+
+
+  private:
+
+    // Holds the contents of this index, buckets of user items.
+    // UVector elements are of type (Bucket *)
+    UVector *bucketList_;
+
+    int32_t labelsIterIndex_; // Index of the current bucket in the bucket iteration;
+                              // -1 after the iteration has been reset.
+    int32_t itemsIterIndex_;  // Index of the current record within the current
+                              // bucket; -1 after the record iteration has been reset.
+    Bucket *currentBucket_; // While an iteration of the index is underway,
+                            // point to the bucket for the current label.
+                            // NULL when no iteration underway.
+
+    UBool indexBuildRequired_; // Caller has made changes to the index that
+                               // require rebuilding & bucketing before the
+                               // contents can be iterated.
+
+    int32_t maxLabelCount_; // Limit on # of labels permitted in the index.
+
+    UHashtable *alreadyIn_; // Key=UnicodeString, value=UnicodeSet
+
+    UnicodeSet *initialLabels_; // Initial (unprocessed) set of Labels. Union
+                                // of those explicitly set by the user plus
+                                // those from locales. Raw values, before
+                                // crunching into bucket labels.
+
+    UVector *labels_; // List of Labels, after processing, sorting.
+                      // Contents are (UnicodeString *)
+
+    UnicodeSet *noDistinctSorting_; // As the set of labels is built, strings may
+                                    // be discarded from the exemplars. This contains
+                                    // some of the discards, and is
+                                    // intended for debugging.
+
+    UnicodeSet *notAlphabetic_; // As the set of labels is built, strings may
+                                // be discarded from the exemplars. This contains
+                                // some of the discards, and is
+                                // intended for debugging.
+
+
+    UVector *firstScriptCharacters_; // The first character from each script,
+                                     // in collation order.
+
+    Locale locale_;
+    Collator *collator_;
+    Collator *collatorPrimaryOnly_;
+
+    UnicodeString inflowLabel_;
+    UnicodeString overflowLabel_;
+    UnicodeString underflowLabel_;
+    UnicodeString overflowComparisonString_;
+
+    ELangType langType_; // The language type, simplified Chinese, Traditional Chinese,
+                         // or not Chinese (Normal). Part of the Pinyin support
+
+    // 24 rows (23 index letters plus an end marker) of up to two UTF-16 code
+    // units each, followed by a terminating zero.
+    typedef const UChar PinyinLookup[24][3];
+    static PinyinLookup HACK_PINYIN_LOOKUP_SHORT;
+    static PinyinLookup HACK_PINYIN_LOOKUP_LONG;
+
+    // These will be lazily set to the short or long tables based on which
+    // Chinese collation has been configured into the ICU library.
+    static PinyinLookup *HACK_PINYIN_LOOKUP;
+    static const UChar *PINYIN_LOWER_BOUNDS;
+
+
+
+    int32_t recordCounter_; // Counts Records created. For minting record serial numbers.
+
+// Constants. Lazily initialized the first time an AlphabeticIndex object is created.
+
+    static UnicodeSet *ALPHABETIC;
+    static UnicodeSet *CORE_LATIN;
+    static UnicodeSet *ETHIOPIC;
+    static UnicodeSet *HANGUL;
+    static UnicodeSet *IGNORE_SCRIPTS;
+    static UnicodeSet *TO_TRY;
+    static UnicodeSet *UNIHAN;
+    static const UnicodeString *EMPTY_STRING;
+
+};
+
+U_NAMESPACE_END
+#endif
+
diff --git a/icu4c/source/i18n/unicode/ulocdata.h b/icu4c/source/i18n/unicode/ulocdata.h
index 3431f25e248..c15b43b1631 100644
--- a/icu4c/source/i18n/unicode/ulocdata.h
+++ b/icu4c/source/i18n/unicode/ulocdata.h
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 * *
-* Copyright (C) 2003-2010, International Business Machines *
+* Copyright (C) 2003-2011, International Business Machines *
 * Corporation and others. All Rights Reserved.
* * * ****************************************************************************** @@ -41,7 +41,8 @@ typedef struct ULocaleData ULocaleData; typedef enum ULocaleDataExemplarSetType { ULOCDATA_ES_STANDARD=0, /* Basic set */ ULOCDATA_ES_AUXILIARY=1, /* Auxiliary set */ - ULOCDATA_ES_COUNT=2 + ULOCDATA_ES_INDEX=2, /* Index Character set */ + ULOCDATA_ES_COUNT=3 } ULocaleDataExemplarSetType; /** The possible types of delimiters. @@ -142,9 +143,11 @@ ulocdata_getNoSubstitute(ULocaleData *uld); * always set, regardless of the value of 'options'. * @param extype Specifies the type of exemplar set to be retrieved. * @param status Pointer to an input-output error code value; - * must not be NULL. + * must not be NULL. Will be set to U_MISSING_RESOURCE_ERROR + * if the requested data is not available. * @return USet* Either fillIn, or if fillIn is NULL, a pointer to * a newly-allocated USet that the user must close. + * In case of error, NULL is returned. * @stable ICU 3.4 */ U_STABLE USet* U_EXPORT2 diff --git a/icu4c/source/test/cintltst/cldrtest.c b/icu4c/source/test/cintltst/cldrtest.c index bee48cefdc0..3a566695667 100644 --- a/icu4c/source/test/cintltst/cldrtest.c +++ b/icu4c/source/test/cintltst/cldrtest.c @@ -1234,6 +1234,43 @@ static void TestCoverage(void){ ulocdata_close(uld); } +static void TestIndexChars(void) { + /* Very basic test of ULOCDATA_ES_INDEX. + * No comprehensive test of data, just basic check that the code path is alive. 
+ */ + UErrorCode status = U_ZERO_ERROR; + ULocaleData *uld; + USet *exemplarChars; + USet *indexChars; + + uld = ulocdata_open("en", &status); + exemplarChars = uset_openEmpty(); + indexChars = uset_openEmpty(); + ulocdata_getExemplarSet(uld, exemplarChars, 0, ULOCDATA_ES_STANDARD, &status); + ulocdata_getExemplarSet(uld, indexChars, 0, ULOCDATA_ES_INDEX, &status); + if (U_FAILURE(status)) { + log_err("File %s, line %d, Failure opening exemplar chars: %s", __FILE__, __LINE__, u_errorName(status)); + goto close_sets; + } + /* en data, standard exemplars are [a-z], lower case. */ + /* en data, index characters are [A-Z], upper case. */ + if ((uset_contains(exemplarChars, (UChar32)0x41) || uset_contains(indexChars, (UChar32)0x61))) { + log_err("File %s, line %d, Exemplar characters incorrect.", __FILE__, __LINE__ ); + goto close_sets; + } + if (!(uset_contains(exemplarChars, (UChar32)0x61) && uset_contains(indexChars, (UChar32)0x41) )) { + log_err("File %s, line %d, Exemplar characters incorrect.", __FILE__, __LINE__ ); + goto close_sets; + } + + close_sets: + uset_close(exemplarChars); + uset_close(indexChars); + ulocdata_close(uld); +} + + + static void TestCurrencyList(void){ #if !UCONFIG_NO_FORMATTING UErrorCode errorCode = U_ZERO_ERROR; @@ -1284,4 +1321,6 @@ void addCLDRTest(TestNode** root) TESTCASE(TestExemplarSet); TESTCASE(TestLocaleDisplayPattern); TESTCASE(TestCoverage); + TESTCASE(TestIndexChars); } + diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in index 4d59aae636e..e603509682e 100644 --- a/icu4c/source/test/intltest/Makefile.in +++ b/icu4c/source/test/intltest/Makefile.in @@ -58,7 +58,7 @@ itrbnf.o itrbnfrt.o itrbnfp.o ucaconf.o icusvtst.o \ uobjtest.o idnaref.o idnaconf.o nptrans.o punyref.o testidn.o testidna.o uts46test.o \ incaltst.o calcasts.o v32test.o uvectest.o textfile.o tokiter.o utxttest.o \ windttst.o winnmtst.o winutil.o csdetest.o tzrulets.o tzoffloc.o tzfmttst.o ssearch.o dtifmtts.o \ 
-tufmtts.o itspoof.o simplethread.o bidiconf.o locnmtst.o dcfmtest.o +tufmtts.o itspoof.o simplethread.o bidiconf.o locnmtst.o dcfmtest.o alphaindextst.o DEPS = $(OBJECTS:.o=.d) diff --git a/icu4c/source/test/intltest/alphaindextst.cpp b/icu4c/source/test/intltest/alphaindextst.cpp new file mode 100644 index 00000000000..e374dd59572 --- /dev/null +++ b/icu4c/source/test/intltest/alphaindextst.cpp @@ -0,0 +1,417 @@ +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2011, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ +// +// file: alphaindextst.cpp +// Alphabetic Index Tests. +// +#include "intltest.h" +#include "alphaindextst.h" + +#include "unicode/alphaindex.h" +#include "unicode/coll.h" +#include "unicode/tblcoll.h" +#include "unicode/uniset.h" + +// #include +// #include + +AlphabeticIndexTest::AlphabeticIndexTest() { +} + +AlphabeticIndexTest::~AlphabeticIndexTest() { +} + +void AlphabeticIndexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) +{ + if (exec) logln("TestSuite AlphabeticIndex: "); + switch (index) { + + case 0: name = "APITest"; + if (exec) APITest(); + break; + + case 1: name = "ManyLocales"; + if (exec) ManyLocalesTest(); + break; + + case 2: name = "HackPinyinTest"; + if (exec) HackPinyinTest(); + break; + + default: name = ""; + break; //needed to end loop + } +} + +#define TEST_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: Test failure. status=%s", \ + __FILE__, __LINE__, u_errorName(status)); return;}} + +#define TEST_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: Test failure \n", __FILE__, __LINE__);};} + +// +// APITest. Invoke every function at least once, and check that it does something. +// Does not attempt to check complete functionality. 
+// +void AlphabeticIndexTest::APITest() { + + // + // Simple constructor and destructor, getBucketCount() + // + UErrorCode status = U_ZERO_ERROR; + int32_t lc = 0; + int32_t i = 0; + AlphabeticIndex *index = new AlphabeticIndex(Locale::getEnglish(), status); + TEST_CHECK_STATUS; + lc = index->getBucketCount(status); + TEST_CHECK_STATUS; + TEST_ASSERT(28 == lc); // 26 letters plus two under/overflow labels. + //printf("getBucketCount() == %d\n", lc); + delete index; + + // addLabels() + + status = U_ZERO_ERROR; + index = new AlphabeticIndex(Locale::getEnglish(), status); + TEST_CHECK_STATUS; + UnicodeSet additions; + additions.add((UChar32)0x410).add((UChar32)0x415); // A couple of Cyrillic letters + index->addLabels(additions, status); + TEST_CHECK_STATUS; + lc = index->getBucketCount(status); + TEST_CHECK_STATUS; + // TODO: should get 31. Java also gives 30. Needs fixing + TEST_ASSERT(30 == lc); // 26 Latin letters plus + // TEST_ASSERT(31 == lc); // 26 Latin letters plus + // 2 Cyrillic letters plus + // 1 inflow label plus + // two under/overflow labels. + // std::cout << lc << std::endl; + delete index; + + + // addLabels(Locale) + + status = U_ZERO_ERROR; + index = new AlphabeticIndex(Locale::getEnglish(), status); + TEST_CHECK_STATUS; + AlphabeticIndex &aip = index->addLabels(Locale::getJapanese(), status); + TEST_ASSERT(&aip == index); + TEST_CHECK_STATUS; + lc = index->getBucketCount(status); + TEST_CHECK_STATUS; + TEST_ASSERT(35 < lc); // Japanese should add a bunch. Don't rely on the exact value. 
+ delete index; + + // GetCollator(), Get under/in/over flow labels + + status = U_ZERO_ERROR; + index = new AlphabeticIndex(Locale::getGerman(), status); + TEST_CHECK_STATUS; + Collator *germanCol = Collator::createInstance(Locale::getGerman(), status); + TEST_CHECK_STATUS; + const RuleBasedCollator &indexCol = index->getCollator(); + TEST_ASSERT(*germanCol == indexCol); + delete germanCol; + + UnicodeString ELLIPSIS; ELLIPSIS.append((UChar32)0x2026); + UnicodeString s = index->getUnderflowLabel(); + TEST_ASSERT(ELLIPSIS == s); + s = index->getOverflowLabel(); + TEST_ASSERT(ELLIPSIS == s); + s = index->getInflowLabel(); + TEST_ASSERT(ELLIPSIS == s); + index->setOverflowLabel(UNICODE_STRING_SIMPLE("O"), status); + index->setUnderflowLabel(UNICODE_STRING_SIMPLE("U"), status).setInflowLabel(UNICODE_STRING_SIMPLE("I"), status); + s = index->getUnderflowLabel(); + TEST_ASSERT(UNICODE_STRING_SIMPLE("U") == s); + s = index->getOverflowLabel(); + TEST_ASSERT(UNICODE_STRING_SIMPLE("O") == s); + s = index->getInflowLabel(); + TEST_ASSERT(UNICODE_STRING_SIMPLE("I") == s); + + + + + delete index; + + + + const UnicodeString adam = UNICODE_STRING_SIMPLE("Adam"); + const UnicodeString baker = UNICODE_STRING_SIMPLE("Baker"); + const UnicodeString charlie = UNICODE_STRING_SIMPLE("Charlie"); + const UnicodeString chad = UNICODE_STRING_SIMPLE("Chad"); + const UnicodeString zed = UNICODE_STRING_SIMPLE("Zed"); + const UnicodeString Cyrillic = UNICODE_STRING_SIMPLE("\\u0410\\u0443\\u0435").unescape(); + + // addRecord(), verify that it comes back out. 
+ // + status = U_ZERO_ERROR; + index = new AlphabeticIndex(Locale::getEnglish(), status); + TEST_CHECK_STATUS; + index->addRecord(UnicodeString("Adam"), this, status); + UBool b; + TEST_CHECK_STATUS; + index->resetBucketIterator(status); + TEST_CHECK_STATUS; + index->nextBucket(status); // Move to underflow label + index->nextBucket(status); // Move to "A" + TEST_CHECK_STATUS; + const UnicodeString &label2 = index->getBucketLabel(); + UnicodeString A_STR = UNICODE_STRING_SIMPLE("A"); + TEST_ASSERT(A_STR == label2); + + b = index->nextRecord(status); + TEST_CHECK_STATUS; + TEST_ASSERT(b); + const UnicodeString &itemName = index->getRecordName(); + TEST_ASSERT(adam == itemName); + + const void *itemContext = index->getRecordData(); + TEST_ASSERT(itemContext == this); + + delete index; + + // clearRecords, addRecord(), Iteration + + status = U_ZERO_ERROR; + index = new AlphabeticIndex(Locale::getEnglish(), status); + TEST_CHECK_STATUS; + while (index->nextBucket(status)) { + TEST_CHECK_STATUS; + while (index->nextRecord(status)) { + TEST_CHECK_STATUS; + TEST_ASSERT(FALSE); // No items have been added. + } + TEST_CHECK_STATUS; + } + + index->addRecord(adam, NULL, status); + index->addRecord(baker, NULL, status); + index->addRecord(charlie, NULL, status); + index->addRecord(chad, NULL, status); + TEST_CHECK_STATUS; + int itemCount = 0; + index->resetBucketIterator(status); + while (index->nextBucket(status)) { + TEST_CHECK_STATUS; + while (index->nextRecord(status)) { + TEST_CHECK_STATUS; + ++itemCount; + } + } + TEST_CHECK_STATUS; + TEST_ASSERT(itemCount == 4); + + TEST_ASSERT(index->nextBucket(status) == FALSE); + index->resetBucketIterator(status); + TEST_CHECK_STATUS; + TEST_ASSERT(index->nextBucket(status) == TRUE); + + index->clearRecords(status); + TEST_CHECK_STATUS; + index->resetBucketIterator(status); + while (index->nextBucket(status)) { + TEST_CHECK_STATUS; + while (index->nextRecord(status)) { + TEST_ASSERT(FALSE); // No items have been added. 
+ } + } + TEST_CHECK_STATUS; + delete index; + + // getBucketLabel(), getBucketType() + + status = U_ZERO_ERROR; + index = new AlphabeticIndex(Locale::getEnglish(), status); + TEST_CHECK_STATUS; + index->setUnderflowLabel(adam, status).setOverflowLabel(charlie, status); + TEST_CHECK_STATUS; + for (i=0; index->nextBucket(status); i++) { + TEST_CHECK_STATUS; + UnicodeString label = index->getBucketLabel(); + UAlphabeticIndexLabelType type = index->getBucketLabelType(); + if (i == 0) { + TEST_ASSERT(type == U_ALPHAINDEX_UNDERFLOW); + TEST_ASSERT(label == adam); + } else if (i <= 26) { + // Labels A - Z for English locale + TEST_ASSERT(type == U_ALPHAINDEX_NORMAL); + UnicodeString expectedLabel((UChar)(0x40 + i)); + TEST_ASSERT(expectedLabel == label); + } else if (i == 27) { + TEST_ASSERT(type == U_ALPHAINDEX_OVERFLOW); + TEST_ASSERT(label == charlie); + } else { + TEST_ASSERT(FALSE); + } + } + TEST_ASSERT(i==28); + delete index; + + // getBucketIndex() + + status = U_ZERO_ERROR; + index = new AlphabeticIndex(Locale::getEnglish(), status); + TEST_CHECK_STATUS; + int32_t n = index->getBucketIndex(adam, status); + TEST_CHECK_STATUS; + TEST_ASSERT(n == 1); /* Label #0 is underflow, 1 is A, etc. 
*/ + n = index->getBucketIndex(baker, status); + TEST_ASSERT(n == 2); + n = index->getBucketIndex(Cyrillic, status); + TEST_ASSERT(n == 27); // Overflow label + n = index->getBucketIndex(zed, status); + TEST_ASSERT(n == 26); + + for (i=0; index->nextBucket(status); i++) { + n = index->getBucketIndex(); + TEST_ASSERT(n == i); + UnicodeString label = index->getBucketLabel(); + TEST_ASSERT(n == i); + } + TEST_ASSERT(i == 28); + + delete index; + index = new AlphabeticIndex(Locale::createFromName("ru"), status); + //Locale loc = Locale::createFromName(localeName); + TEST_CHECK_STATUS; + n = index->getBucketIndex(adam, status); + TEST_CHECK_STATUS; + TEST_ASSERT(n == 0); // Label #0 is underflow + n = index->getBucketIndex(baker, status); + TEST_ASSERT(n == 0); + n = index->getBucketIndex(Cyrillic, status); + TEST_ASSERT(n == 1); // First label after underflow, not overflow + n = index->getBucketIndex(zed, status); + TEST_ASSERT(n == 0); + + delete index; + +} + + +static const char * KEY_LOCALES[] = { + "en", "es", "de", "fr", "ja", "it", "tr", "pt", "zh", "nl", + "pl", "ar", "ru", "zh_Hant", "ko", "th", "sv", "fi", "da", + "he", "nb", "el", "hr", "bg", "sk", "lt", "vi", "lv", "sr", + "pt_PT", "ro", "hu", "cs", "id", "sl", "fil", "fa", "uk", + "ca", "hi", "et", "eu", "is", "sw", "ms", "bn", "am", "ta", + "te", "mr", "ur", "ml", "kn", "gu", "or", ""}; + + +void AlphabeticIndexTest::ManyLocalesTest() { + UErrorCode status = U_ZERO_ERROR; + int32_t lc = 0; + AlphabeticIndex *index = NULL; + + for (int i=0; ; ++i) { + status = U_ZERO_ERROR; + const char *localeName = KEY_LOCALES[i]; + if (localeName[0] == 0) { + break; + } + // std::cout << localeName << " "; + Locale loc = Locale::createFromName(localeName); + index = new AlphabeticIndex(loc, status); + TEST_CHECK_STATUS; + lc = index->getBucketCount(status); + TEST_CHECK_STATUS; + // std::cout << "getBucketCount() == " << lc << std::endl; + + while (index->nextBucket(status)) { + TEST_CHECK_STATUS; + const UnicodeString &label = 
index->getBucketLabel(); + TEST_ASSERT(label.length()>0); + // std::string ss; + // std::cout << ":" << label.toUTF8String(ss); + } + // std::cout << ":" << std::endl; + + + delete index; + } +} + + +// Test data for Pinyin based indexes. +// The Chinese characters should be distributed under latin labels in +// an index. + +static const char *pinyinTestData[] = { + "\\u0101", "\\u5416", "\\u58ba", // + "b", "\\u516b", "\\u62d4", "\\u8500", // + "c", "\\u5693", "\\u7938", "\\u9e7e", // + "d", "\\u5491", "\\u8fcf", "\\u964a", // + "\\u0113","\\u59b8", "\\u92e8", "\\u834b", // + "f", "\\u53d1", "\\u9197", "\\u99a5", // + "g", "\\u7324", "\\u91d3", "\\u8142", // + "h", "\\u598e", "\\u927f", "\\u593b", // + "j", "\\u4e0c", "\\u6785", "\\u9d58", // + "k", "\\u5494", "\\u958b", "\\u7a52", // + "l", "\\u5783", "\\u62c9", "\\u9ba5", // + "m", "\\u5638", "\\u9ebb", "\\u65c0", // + "n", "\\u62ff", "\\u80ad", "\\u685b", // + "\\u014D", "\\u5662", "\\u6bee", "\\u8bb4", // + "p", "\\u5991", "\\u8019", "\\u8c31", // + "q", "\\u4e03", "\\u6053", "\\u7f56", // + "r", "\\u5465", "\\u72aa", "\\u6e03", // + "s", "\\u4ee8", "\\u9491", "\\u93c1", // + "t", "\\u4ed6", "\\u9248", "\\u67dd", // + "w", "\\u5c72", "\\u5558", "\\u5a7a", // + "x", "\\u5915", "\\u5438", "\\u6bbe", // + "y", "\\u4e2b", "\\u82bd", "\\u8574", // + "z", "\\u5e00", "\\u707d", "\\u5c0a", + NULL + }; + +void AlphabeticIndexTest::HackPinyinTest() { + UErrorCode status = U_ZERO_ERROR; + AlphabeticIndex aindex(Locale::createFromName("zh"), status); + TEST_CHECK_STATUS; + + UnicodeString names[sizeof(pinyinTestData) / sizeof(pinyinTestData[0])]; + int32_t nameCount; + for (nameCount=0; pinyinTestData[nameCount] != NULL; nameCount++) { + names[nameCount] = UnicodeString(pinyinTestData[nameCount], -1, UnicodeString::kInvariant).unescape(); + aindex.addRecord(names[nameCount], &names[nameCount], status); + TEST_CHECK_STATUS; + if (U_FAILURE(status)) { + return; + } + } + TEST_ASSERT(nameCount == 
aindex.getRecordCount(status)); + + // Weak checking: make sure that none of the Chinese names landed in the overflow bucket + // of the index, and that the names are distributed among several buckets. + // (Exact expected data would be subject to change with evolution of the collation rules.) + + int32_t bucketCount = 0; + int32_t filledBucketCount = 0; + while (aindex.nextBucket(status)) { + bucketCount++; + UnicodeString label = aindex.getBucketLabel(); + // std::string s; + // std::cout << label.toUTF8String(s) << ": "; + + UBool bucketHasContents = FALSE; + while (aindex.nextRecord(status)) { + bucketHasContents = TRUE; + UnicodeString name = aindex.getRecordName(); + if (aindex.getBucketLabelType() != U_ALPHAINDEX_NORMAL) { + errln("File %s, Line %d, Name \"\\u%x\" is in an under or overflow bucket.", + __FILE__, __LINE__, name.char32At(0)); + } + // s.clear(); + // std::cout << aindex.getRecordName().toUTF8String(s) << " "; + } + if (bucketHasContents) { + filledBucketCount++; + } + // std::cout << std::endl; + } + TEST_ASSERT(bucketCount > 25); + TEST_ASSERT(filledBucketCount > 15); +} diff --git a/icu4c/source/test/intltest/alphaindextst.h b/icu4c/source/test/intltest/alphaindextst.h new file mode 100644 index 00000000000..a9c543641a7 --- /dev/null +++ b/icu4c/source/test/intltest/alphaindextst.h @@ -0,0 +1,28 @@ +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2011, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ +// +// file: alphaindextst.h +// Alphabetic Index Tests. 
+// + +#ifndef ALPHAINDEXTST_H +#define ALPHAINDEXTST_H + +#include "intltest.h" + +class AlphabeticIndexTest: public IntlTest { +public: + AlphabeticIndexTest(); + virtual ~AlphabeticIndexTest(); + + virtual void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL ); + + virtual void APITest(); + virtual void ManyLocalesTest(); + virtual void HackPinyinTest(); +}; + +#endif diff --git a/icu4c/source/test/intltest/tscoll.cpp b/icu4c/source/test/intltest/tscoll.cpp index 3644e4a3a1e..17b175de140 100644 --- a/icu4c/source/test/intltest/tscoll.cpp +++ b/icu4c/source/test/intltest/tscoll.cpp @@ -1,6 +1,6 @@ /******************************************************************** * COPYRIGHT: - * Copyright (c) 1997-2009, International Business Machines Corporation and + * Copyright (c) 1997-2011, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -49,6 +49,7 @@ #include "ucaconf.h" #include "svccoll.h" #include "cmemory.h" +#include "alphaindextst.h" //#include "rndmcoll.h" // Set to 1 to test offsets in backAndForth() @@ -97,6 +98,7 @@ void IntlTestCollator::runIndexedTest( int32_t index, UBool exec, const char* &n TESTCLASS(20, CollationFinnishTest); // removed by weiv - we have changed Finnish collation //TESTCLASS(21, RandomCollatorTest); // See ticket 5747 about reenabling this test. TESTCLASS(21, SSearchTest); + TESTCLASS(22, AlphabeticIndexTest); default: name = ""; break; }