mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-9880 add ImmutableIndex, replace Chinese hacks and support zhuyin, handle index characters with multiple primary weights, lazy-create rarely-used objects, bug fixes, port other improvements from Java
X-SVN-Rev: 33245
This commit is contained in:
parent
407be346b7
commit
f5cd9984c6
9 changed files with 1396 additions and 1126 deletions
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1999-2011, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
* Copyright (C) 1999-2013, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
* Date Name Description
|
||||
* 10/22/99 alan Creation.
|
||||
|
@ -552,12 +552,12 @@ void UVector::sort(UElementComparator *compare, UErrorCode &ec) {
|
|||
|
||||
|
||||
/**
|
||||
* Sort with a user supplied comparator of type UComparator.
|
||||
* Stable sort with a user supplied comparator of type UComparator.
|
||||
*/
|
||||
void UVector::sortWithUComparator(UComparator *compare, const void *context, UErrorCode &ec) {
|
||||
if (U_SUCCESS(ec)) {
|
||||
uprv_sortArray(elements, count, sizeof(UElement),
|
||||
compare, context, FALSE, &ec);
|
||||
compare, context, TRUE, &ec);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2011, International Business Machines
|
||||
* Copyright (C) 1999-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
|
@ -243,9 +243,9 @@ public:
|
|||
void sort(UElementComparator *compare, UErrorCode &ec);
|
||||
|
||||
/**
|
||||
* Sort the contents of this vector using a caller-supplied function
|
||||
* Stable sort the contents of this vector using a caller-supplied function
|
||||
* of type UComparator to do the comparison. Provides more flexibility
|
||||
* than uvector::sort() because an additional user-parameter can be passed to
|
||||
* than UVector::sort() because an additional user parameter can be passed to
|
||||
* the comparison function.
|
||||
*/
|
||||
void sortWithUComparator(UComparator *compare, const void *context, UErrorCode &ec);
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -51,7 +51,6 @@ typedef enum ECleanupI18NType {
|
|||
UCLN_I18N_UCOL_RES,
|
||||
UCLN_I18N_UCOL_BLD,
|
||||
UCLN_I18N_CSDET,
|
||||
UCLN_I18N_INDEX_CHARACTERS,
|
||||
UCLN_I18N_GENDERINFO,
|
||||
UCLN_I18N_CDFINFO,
|
||||
UCLN_I18N_REGION,
|
||||
|
|
|
@ -71,19 +71,25 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
// Forward Declarations
|
||||
|
||||
namespace {
|
||||
|
||||
class BucketList;
|
||||
|
||||
} // namespace
|
||||
|
||||
class Collator;
|
||||
class RuleBasedCollator;
|
||||
class StringEnumeration;
|
||||
class UnicodeSet;
|
||||
class UVector;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* class AlphabeticIndex supports the creation of a UI index appropriate for a given language, such as:
|
||||
*
|
||||
* AlphabeticIndex supports the creation of a UI index appropriate for a given language.
|
||||
* It can support either direct use, or use with a client that doesn't support localized collation.
|
||||
* The following is an example of what an index might look like in a UI:
|
||||
*
|
||||
* <pre>
|
||||
* <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z \\u00C6 \\u00D8 \\u00C5 ...</b>
|
||||
* <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ...</b>
|
||||
*
|
||||
* <b>A</b>
|
||||
* Addison
|
||||
|
@ -107,10 +113,14 @@ class UVector;
|
|||
* into an inflow bucket between the other two scripts.
|
||||
* <p>
|
||||
* The AlphabeticIndex class is not intended for public subclassing.
|
||||
* <p>
|
||||
* <i>Example</i>
|
||||
* <p>
|
||||
* The "show..." methods below are just to illustrate usage.
|
||||
*
|
||||
* <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters
|
||||
* as well as characters from the user's language,
|
||||
* then it is a good idea to call addLabels(Locale::getEnglish(), status).</p>
|
||||
*
|
||||
* <h2>Direct Use</h2>
|
||||
* <p>The following shows an example of building an index directly.
|
||||
* The "show..." methods below are just to illustrate usage.
|
||||
*
|
||||
* <pre>
|
||||
* // Create a simple index. "Item" is assumed to be an application
|
||||
|
@ -150,21 +160,138 @@ class UVector;
|
|||
* <b>... A-F G-N O-Z ...</b>
|
||||
* </pre>
|
||||
*
|
||||
* <p>
|
||||
* <b>Notes:</b>
|
||||
* <h2>Client Support</h2>
|
||||
* <p>Callers can also use the AlphabeticIndex::ImmutableIndex, or the AlphabeticIndex itself,
|
||||
* to support sorting on a client that doesn't support AlphabeticIndex functionality.
|
||||
*
|
||||
* <p>The ImmutableIndex is both immutable and thread-safe.
|
||||
* The corresponding AlphabeticIndex methods are not thread-safe because
|
||||
* they "lazily" build the index buckets.
|
||||
* <ul>
|
||||
* <li>Additional collation parameters can be passed in as part of the locale name.
|
||||
* For example, German plus numeric
|
||||
* sorting would be "de@kn-true".
|
||||
* <li>ImmutableIndex.getBucket(index) provides random access to all
|
||||
* buckets and their labels and label types.
|
||||
* <li>The AlphabeticIndex bucket iterator or ImmutableIndex.getBucket(0..getBucketCount-1)
|
||||
* can be used to get a list of the labels,
|
||||
* such as "...", "A", "B",..., and send that list to the client.
|
||||
* <li>When the client has a new name, it sends that name to the server.
|
||||
* The server needs to call the following methods,
|
||||
* and communicate the bucketIndex and collationKey back to the client.
|
||||
*
|
||||
* <pre>
|
||||
* int32_t bucketIndex = index.getBucketIndex(name, status);
|
||||
* const UnicodeString &label = immutableIndex.getBucket(bucketIndex)->getLabel(); // optional
|
||||
* int32_t skLength = collator.getSortKey(name, sk, skCapacity);
|
||||
* </pre>
|
||||
*
|
||||
* <li>The client would put the name (and associated information) into its bucket for bucketIndex. The sort key sk is a
|
||||
* sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
|
||||
* </ul>
|
||||
*
|
||||
* @stable ICU 4.8
|
||||
*/
|
||||
|
||||
|
||||
class U_I18N_API AlphabeticIndex: public UObject {
|
||||
public:
|
||||
/**
|
||||
* An index "bucket" with a label string and type.
|
||||
* It is referenced by getBucketIndex(),
|
||||
* and returned by ImmutableIndex.getBucket().
|
||||
*
|
||||
* The Bucket class is not intended for public subclassing.
|
||||
* @draft ICU 51
|
||||
*/
|
||||
class U_I18N_API Bucket : public UObject {
|
||||
public:
|
||||
/**
|
||||
* Destructor.
|
||||
* @draft ICU 51
|
||||
*/
|
||||
virtual ~Bucket();
|
||||
|
||||
public:
|
||||
/**
|
||||
* Returns the label string.
|
||||
*
|
||||
* @return the label string for the bucket
|
||||
* @draft ICU 51
|
||||
*/
|
||||
const UnicodeString &getLabel() const { return label_; }
|
||||
/**
|
||||
* Returns whether this bucket is a normal, underflow, overflow, or inflow bucket.
|
||||
*
|
||||
* @return the bucket label type
|
||||
* @draft ICU 51
|
||||
*/
|
||||
UAlphabeticIndexLabelType getLabelType() const { return labelType_; }
|
||||
|
||||
private:
|
||||
friend class AlphabeticIndex;
|
||||
friend class BucketList;
|
||||
|
||||
UnicodeString label_;
|
||||
UnicodeString lowerBoundary_;
|
||||
UAlphabeticIndexLabelType labelType_;
|
||||
Bucket *displayBucket_;
|
||||
int32_t displayIndex_;
|
||||
UVector *records_; // Records are owned by the inputList_ vector.
|
||||
|
||||
Bucket(const UnicodeString &label, // Parameter strings are copied.
|
||||
const UnicodeString &lowerBoundary,
|
||||
UAlphabeticIndexLabelType type);
|
||||
};
|
||||
|
||||
/**
|
||||
* Immutable, thread-safe version of AlphabeticIndex.
|
||||
* This class provides thread-safe methods for bucketing,
|
||||
* and random access to buckets and their properties,
|
||||
* but does not offer adding records to the index.
|
||||
*
|
||||
* The ImmutableIndex class is not intended for public subclassing.
|
||||
*
|
||||
* @draft ICU 51
|
||||
*/
|
||||
class U_I18N_API ImmutableIndex : public UObject {
|
||||
public:
|
||||
/**
|
||||
* Destructor.
|
||||
* @draft ICU 51
|
||||
*/
|
||||
virtual ~ImmutableIndex();
|
||||
|
||||
/**
|
||||
* Returns the number of index buckets and labels, including underflow/inflow/overflow.
|
||||
*
|
||||
* @return the number of index buckets
|
||||
* @draft ICU 51
|
||||
*/
|
||||
int32_t getBucketCount() const;
|
||||
|
||||
/**
|
||||
* Finds the index bucket for the given name and returns the number of that bucket.
|
||||
* Use getBucket() to get the bucket's properties.
|
||||
*
|
||||
* @param name the string to be sorted into an index bucket
|
||||
* @param errorCode Error code, will be set with the reason if the operation fails.
|
||||
* @return the bucket number for the name
|
||||
* @draft ICU 51
|
||||
*/
|
||||
int32_t getBucketIndex(const UnicodeString &name, UErrorCode &errorCode) const;
|
||||
|
||||
/**
|
||||
* Returns the index-th bucket. Returns NULL if the index is out of range.
|
||||
*
|
||||
* @param index bucket number
|
||||
* @return the index-th bucket
|
||||
* @draft ICU 51
|
||||
*/
|
||||
const Bucket *getBucket(int32_t index) const;
|
||||
|
||||
private:
|
||||
friend class AlphabeticIndex;
|
||||
|
||||
ImmutableIndex(BucketList *bucketList, Collator *collatorPrimaryOnly)
|
||||
: buckets_(bucketList), collatorPrimaryOnly_(collatorPrimaryOnly) {}
|
||||
|
||||
BucketList *buckets_;
|
||||
Collator *collatorPrimaryOnly_;
|
||||
};
|
||||
|
||||
/**
|
||||
* Construct an AlphabeticIndex object for the specified locale. If the locale's
|
||||
|
@ -230,6 +357,14 @@ class U_I18N_API AlphabeticIndex: public UObject {
|
|||
virtual ~AlphabeticIndex();
|
||||
|
||||
|
||||
/**
|
||||
* Builds an immutable, thread-safe version of this instance, without data records.
|
||||
*
|
||||
* @param errorCode Error code, will be set with the reason if the operation fails.
|
||||
* @return an immutable index instance
|
||||
* @draft ICU 51
|
||||
*/
|
||||
ImmutableIndex *buildImmutableIndex(UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Get the Collator that establishes the ordering of the items in this index.
|
||||
* Ownership of the collator remains with the AlphabeticIndex instance.
|
||||
|
@ -269,7 +404,6 @@ class U_I18N_API AlphabeticIndex: public UObject {
|
|||
virtual AlphabeticIndex &setInflowLabel(const UnicodeString &inflowLabel, UErrorCode &status);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Get the special label used for items that sort after the last normal label,
|
||||
* and that would not otherwise have an appropriate label.
|
||||
|
@ -336,22 +470,6 @@ class U_I18N_API AlphabeticIndex: public UObject {
|
|||
virtual AlphabeticIndex &setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Get the Unicode character (or tailored string) that defines an overflow bucket;
|
||||
* that is anything greater than or equal to that string should go in that bucket,
|
||||
* instead of with the last character. Normally that is the first character of the script
|
||||
* after lowerLimit. Thus in X Y Z ... <i>Devanagari-ka</i>, the overflow character for Z
|
||||
* would be the <i>Greek-alpha</i>.
|
||||
*
|
||||
* @param lowerLimit The character below the overflow (or inflow) bucket
|
||||
* @param status error code
|
||||
* @return string that defines top of the overflow bucket for lowerLimit, or an empty string if there is none
|
||||
* @internal
|
||||
*/
|
||||
virtual const UnicodeString &getOverflowComparisonString(const UnicodeString &lowerLimit,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Add a record to the index. Each record will be associated with an index Bucket
|
||||
* based on the record's name. The list of records for each bucket will be sorted
|
||||
|
@ -549,187 +667,90 @@ private:
|
|||
virtual UBool operator!=(const AlphabeticIndex& other) const;
|
||||
|
||||
// Common initialization, for use from all constructors.
|
||||
void init(UErrorCode &status);
|
||||
void init(const Locale *locale, UErrorCode &status);
|
||||
|
||||
// Initialize & destruct static constants used by this class.
|
||||
static void staticInit(UErrorCode &status);
|
||||
/**
|
||||
* This method is called to get the index exemplars. Normally these come from the locale directly,
|
||||
* but if they aren't available, we have to synthesize them.
|
||||
*/
|
||||
void addIndexExemplars(const Locale &locale, UErrorCode &status);
|
||||
/**
|
||||
* Add Chinese index characters from the tailoring.
|
||||
*/
|
||||
UBool addChineseIndexCharacters(UErrorCode &errorCode);
|
||||
|
||||
// Pinyin stuff. If the input name is Chinese, add the Pinyin prefix to the dest string.
|
||||
void hackName(UnicodeString &dest, const UnicodeString &name, const Collator *coll);
|
||||
void initPinyinBounds(const Collator *coll, UErrorCode &status);
|
||||
UVector *firstStringsInScript(UErrorCode &status);
|
||||
|
||||
public:
|
||||
#ifndef U_HIDE_INTERNAL_API
|
||||
/**
|
||||
* Delete all shared (static) data associated with an AlphabeticIndex.
|
||||
* Internal function, not intended for direct use.
|
||||
* @internal
|
||||
*/
|
||||
static void staticCleanup();
|
||||
#endif /* U_HIDE_INTERNAL_API */
|
||||
private:
|
||||
static UnicodeString separated(const UnicodeString &item);
|
||||
|
||||
// Add index characters from the specified locale to the dest set.
|
||||
// Does not remove any previous contents from dest.
|
||||
static void getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status);
|
||||
/**
|
||||
* Determine the best labels to use.
|
||||
* This is based on the exemplars, but we also process to make sure that they are unique,
|
||||
* and sort differently, and that the overall list is small enough.
|
||||
*/
|
||||
void initLabels(UVector &indexCharacters, UErrorCode &errorCode) const;
|
||||
BucketList *createBucketList(UErrorCode &errorCode) const;
|
||||
void initBuckets(UErrorCode &errorCode);
|
||||
void clearBuckets();
|
||||
void internalResetBucketIterator();
|
||||
|
||||
UVector *firstStringsInScript(UErrorCode &status);
|
||||
public:
|
||||
|
||||
static UnicodeString separated(const UnicodeString &item);
|
||||
|
||||
static UnicodeSet *getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status);
|
||||
|
||||
void buildIndex(UErrorCode &status);
|
||||
void buildBucketList(UErrorCode &status);
|
||||
void bucketRecords(UErrorCode &status);
|
||||
|
||||
|
||||
public:
|
||||
|
||||
// The following internal items are declared public only to allow access from
|
||||
// implementation code written in plain C. They are not intended for
|
||||
// public use.
|
||||
// The Record is declared public only to allow access from
|
||||
// implementation code written in plain C.
|
||||
// It is not intended for public use.
|
||||
|
||||
#ifndef U_HIDE_INTERNAL_API
|
||||
/**
|
||||
* A record, or item, in the index.
|
||||
* A (name, data) pair, to be sorted by name into one of the index buckets.
|
||||
* The user data is not used by the index implementation.
|
||||
* @internal
|
||||
*/
|
||||
struct Record: public UMemory {
|
||||
AlphabeticIndex *alphaIndex_;
|
||||
const UnicodeString name_;
|
||||
UnicodeString sortingName_; // Usually the same as name_; different for Pinyin.
|
||||
const void *data_;
|
||||
int32_t serialNumber_; // Defines sorting order for names that compare equal.
|
||||
Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data);
|
||||
~Record();
|
||||
};
|
||||
struct Record: public UMemory {
|
||||
const UnicodeString name_;
|
||||
const void *data_;
|
||||
Record(const UnicodeString &name, const void *data);
|
||||
~Record();
|
||||
};
|
||||
#endif /* U_HIDE_INTERNAL_API */
|
||||
|
||||
/**
|
||||
* Holds all user records before they are distributed into buckets.
|
||||
* Type of contents is (Record *)
|
||||
* @internal
|
||||
*/
|
||||
UVector *inputRecords_;
|
||||
|
||||
/**
|
||||
* A Bucket holds an index label and references to everything belonging to that label.
|
||||
* For implementation use only. Declared public because pure C implementation code needs access.
|
||||
* @internal
|
||||
*/
|
||||
struct Bucket: public UMemory {
|
||||
UnicodeString label_;
|
||||
UnicodeString lowerBoundary_;
|
||||
UAlphabeticIndexLabelType labelType_;
|
||||
UVector *records_; // Records are owned by inputRecords_ vector.
|
||||
|
||||
Bucket(const UnicodeString &label, // Parameter strings are copied.
|
||||
const UnicodeString &lowerBoundary,
|
||||
UAlphabeticIndexLabelType type, UErrorCode &status);
|
||||
~Bucket();
|
||||
};
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Language Types. For internal ICU use only.
|
||||
* @internal (but not hidden with U_HIDE_INTERNAL_API because it is used in public API)
|
||||
*/
|
||||
enum ELangType {
|
||||
/** @internal */
|
||||
kNormal,
|
||||
/** @internal */
|
||||
kSimplified,
|
||||
/** @internal */
|
||||
kTraditional
|
||||
};
|
||||
private:
|
||||
|
||||
/**
|
||||
* Get the Language Type for this Index. Based on the locale.
|
||||
* @internal
|
||||
*/
|
||||
static ELangType langTypeFromLocale(const Locale &loc);
|
||||
* Holds all user records before they are distributed into buckets.
|
||||
* Type of contents is (Record *)
|
||||
* @internal
|
||||
*/
|
||||
UVector *inputList_;
|
||||
|
||||
int32_t labelsIterIndex_; // Index of next item to return.
|
||||
int32_t itemsIterIndex_;
|
||||
Bucket *currentBucket_; // While an iteration of the index is underway,
|
||||
// point to the bucket for the current label.
|
||||
// NULL when no iteration underway.
|
||||
|
||||
private:
|
||||
int32_t maxLabelCount_; // Limit on # of labels permitted in the index.
|
||||
|
||||
// Holds the contents of this index, buckets of user items.
|
||||
// UVector elements are of type (Bucket *)
|
||||
UVector *bucketList_;
|
||||
UnicodeSet *initialLabels_; // Initial (unprocessed) set of Labels. Union
|
||||
// of those explicitly set by the user plus
|
||||
// those from locales. Raw values, before
|
||||
// crunching into bucket labels.
|
||||
|
||||
int32_t labelsIterIndex_; // Index of next item to return.
|
||||
int32_t itemsIterIndex_;
|
||||
Bucket *currentBucket_; // While an iteration of the index is underway,
|
||||
// point to the bucket for the current label.
|
||||
// NULL when no iteration underway.
|
||||
UVector *firstCharsInScripts_; // The first character from each script,
|
||||
// in collation order.
|
||||
|
||||
UBool indexBuildRequired_; // Caller has made changes to the index that
|
||||
// require rebuilding & bucketing before the
|
||||
// contents can be iterated.
|
||||
RuleBasedCollator *collator_;
|
||||
RuleBasedCollator *collatorPrimaryOnly_;
|
||||
|
||||
int32_t maxLabelCount_; // Limit on # of labels permitted in the index.
|
||||
// Lazily evaluated: NULL means that the bucket list has not been built yet.
|
||||
BucketList *buckets_;
|
||||
|
||||
UHashtable *alreadyIn_; // Key=UnicodeString, value=UnicodeSet
|
||||
|
||||
UnicodeSet *initialLabels_; // Initial (unprocessed) set of Labels. Union
|
||||
// of those explicitly set by the user plus
|
||||
// those from locales. Raw values, before
|
||||
// crunching into bucket labels.
|
||||
|
||||
UVector *labels_; // List of Labels, after processing, sorting.
|
||||
// Contents are (UnicodeString *)
|
||||
|
||||
UnicodeSet *noDistinctSorting_; // As the set of labels is built, strings may
|
||||
// be discarded from the exemplars. This contains
|
||||
// some of the discards, and is
|
||||
// intended for debugging.
|
||||
|
||||
UnicodeSet *notAlphabetic_; // As the set of labels is built, strings may
|
||||
// be discarded from the exemplars. This contains
|
||||
// some of the discards, and is
|
||||
// intended for debugging.
|
||||
|
||||
|
||||
UVector *firstScriptCharacters_; // The first character from each script,
|
||||
// in collation order.
|
||||
|
||||
Locale locale_;
|
||||
Collator *collator_;
|
||||
Collator *collatorPrimaryOnly_;
|
||||
|
||||
UnicodeString inflowLabel_;
|
||||
UnicodeString overflowLabel_;
|
||||
UnicodeString underflowLabel_;
|
||||
UnicodeString overflowComparisonString_;
|
||||
|
||||
ELangType langType_; // The language type, simplified Chinese, Traditional Chinese,
|
||||
// or not Chinese (Normal). Part of the Pinyin support
|
||||
|
||||
typedef const UChar PinyinLookup[24][3];
|
||||
static PinyinLookup HACK_PINYIN_LOOKUP_SHORT;
|
||||
static PinyinLookup HACK_PINYIN_LOOKUP_LONG;
|
||||
|
||||
// These will be lazily set to the short or long tables based on which
|
||||
// Chinese collation has been configured into the ICU library.
|
||||
static PinyinLookup *HACK_PINYIN_LOOKUP;
|
||||
static const UChar *PINYIN_LOWER_BOUNDS;
|
||||
|
||||
|
||||
|
||||
int32_t recordCounter_; // Counts Records created. For minting record serial numbers.
|
||||
|
||||
// Constants. Lazily initialized the first time an AlphabeticIndex object is created.
|
||||
|
||||
static UnicodeSet *ALPHABETIC;
|
||||
static UnicodeSet *CORE_LATIN;
|
||||
static UnicodeSet *ETHIOPIC;
|
||||
static UnicodeSet *HANGUL;
|
||||
static UnicodeSet *IGNORE_SCRIPTS;
|
||||
static UnicodeSet *TO_TRY;
|
||||
static UnicodeSet *UNIHAN;
|
||||
static const UnicodeString *EMPTY_STRING;
|
||||
UnicodeString inflowLabel_;
|
||||
UnicodeString overflowLabel_;
|
||||
UnicodeString underflowLabel_;
|
||||
UnicodeString overflowComparisonString_;
|
||||
|
||||
UnicodeString emptyString_;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
|
||||
#include "unicode/alphaindex.h"
|
||||
#include "unicode/coll.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/tblcoll.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
|
@ -20,6 +21,24 @@
|
|||
// #include <string>
|
||||
// #include <iostream>
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
namespace {
|
||||
|
||||
// Appends the label of every bucket in the immutable index to dest, in bucket
// order, and returns a copy of the resulting string.
// A ':' separator is written before a label whenever dest has already grown
// beyond the length it had on entry. Iteration stops at the first index for
// which getBucket() returns NULL.
UnicodeString joinLabelsAndAppend(AlphabeticIndex::ImmutableIndex &index, UnicodeString &dest) {
    const int32_t initialLength = dest.length();
    int32_t position = 0;
    const AlphabeticIndex::Bucket *current = index.getBucket(position);
    while (current != NULL) {
        // Only separate from text that this call itself has appended.
        if (initialLength < dest.length()) {
            dest.append((UChar)0x3A);  // ':'
        }
        dest.append(current->getLabel());
        current = index.getBucket(++position);
    }
    return dest;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
AlphabeticIndexTest::AlphabeticIndexTest() {
|
||||
}
|
||||
|
||||
|
@ -29,27 +48,18 @@ AlphabeticIndexTest::~AlphabeticIndexTest() {
|
|||
void AlphabeticIndexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
|
||||
{
|
||||
if (exec) logln("TestSuite AlphabeticIndex: ");
|
||||
switch (index) {
|
||||
|
||||
case 0: name = "APITest";
|
||||
if (exec) APITest();
|
||||
break;
|
||||
|
||||
case 1: name = "ManyLocales";
|
||||
if (exec) ManyLocalesTest();
|
||||
break;
|
||||
|
||||
case 2: name = "HackPinyinTest";
|
||||
if (exec) HackPinyinTest();
|
||||
break;
|
||||
|
||||
case 3: name = "TestBug9009";
|
||||
if (exec) TestBug9009();
|
||||
break;
|
||||
|
||||
default: name = "";
|
||||
break; //needed to end loop
|
||||
}
|
||||
TESTCASE_AUTO_BEGIN;
|
||||
TESTCASE_AUTO(APITest);
|
||||
TESTCASE_AUTO(ManyLocalesTest);
|
||||
TESTCASE_AUTO(HackPinyinTest);
|
||||
TESTCASE_AUTO(TestBug9009);
|
||||
TESTCASE_AUTO(TestIndexCharactersList);
|
||||
TESTCASE_AUTO(TestHaniFirst);
|
||||
TESTCASE_AUTO(TestPinyinFirst);
|
||||
TESTCASE_AUTO(TestSchSt);
|
||||
TESTCASE_AUTO(TestNoLabels);
|
||||
TESTCASE_AUTO(TestChineseZhuyin);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
#define TEST_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: Test failure. status=%s", \
|
||||
|
@ -85,10 +95,8 @@ void AlphabeticIndexTest::APITest() {
|
|||
index = new AlphabeticIndex(coll, status);
|
||||
TEST_CHECK_STATUS;
|
||||
TEST_ASSERT(coll == &index->getCollator());
|
||||
// TODO: The bucket count for an index built from a collator should be one, the underflow label.
|
||||
// The current implementation adds A-Z if the index is otherwise empty.
|
||||
// TEST_ASSERT(1 == index->getBucketCount(status));
|
||||
TEST_ASSERT(28 == index->getBucketCount(status));
|
||||
assertEquals("only the underflow label in an index built from a collator",
|
||||
1, index->getBucketCount(status));
|
||||
TEST_CHECK_STATUS;
|
||||
delete index;
|
||||
|
||||
|
@ -104,12 +112,8 @@ void AlphabeticIndexTest::APITest() {
|
|||
TEST_CHECK_STATUS;
|
||||
lc = index->getBucketCount(status);
|
||||
TEST_CHECK_STATUS;
|
||||
// TODO: should get 31. Java also gives 30. Needs fixing
|
||||
TEST_ASSERT(30 == lc); // 26 Latin letters plus
|
||||
// TEST_ASSERT(31 == lc); // 26 Latin letters plus
|
||||
// 2 Cyrillic letters plus
|
||||
// 1 inflow label plus
|
||||
// two under/overflow labels.
|
||||
assertEquals("underflow, A-Z, inflow, 2 Cyrillic, overflow",
|
||||
31, index->getBucketCount(status));
|
||||
// std::cout << lc << std::endl;
|
||||
delete index;
|
||||
|
||||
|
@ -298,17 +302,26 @@ void AlphabeticIndexTest::APITest() {
|
|||
|
||||
delete index;
|
||||
index = new AlphabeticIndex(Locale::createFromName("ru"), status);
|
||||
//Locale loc = Locale::createFromName(localeName);
|
||||
TEST_CHECK_STATUS;
|
||||
assertEquals("Russian index.getBucketCount()", 32, index->getBucketCount(status));
|
||||
// Latin-script names should go into the underflow label (0)
|
||||
// if the Russian collation does not use script reordering,
|
||||
// but into the overflow label (getBucketCount()-1)
|
||||
// if Russian sorts Cyrillic first.
|
||||
int32_t reorderCodes[20];
|
||||
int32_t expectedLatinIndex = 0;
|
||||
if (index->getCollator().getReorderCodes(reorderCodes, LENGTHOF(reorderCodes), status) > 0) {
|
||||
expectedLatinIndex = index->getBucketCount(status) - 1;
|
||||
}
|
||||
n = index->getBucketIndex(adam, status);
|
||||
TEST_CHECK_STATUS;
|
||||
TEST_ASSERT(n == 32); // Now Latin is in overflow label for Russian collation
|
||||
assertEquals("Russian index.getBucketIndex(adam)", expectedLatinIndex, n);
|
||||
n = index->getBucketIndex(baker, status);
|
||||
TEST_ASSERT(n == 32);
|
||||
assertEquals("Russian index.getBucketIndex(baker)", expectedLatinIndex, n);
|
||||
n = index->getBucketIndex(Cyrillic, status);
|
||||
TEST_ASSERT(n == 1); // First label
|
||||
assertEquals("Russian index.getBucketIndex(Cyrillic)", 1, n);
|
||||
n = index->getBucketIndex(zed, status);
|
||||
TEST_ASSERT(n == 32);
|
||||
assertEquals("Russian index.getBucketIndex(zed)", expectedLatinIndex, n);
|
||||
|
||||
delete index;
|
||||
|
||||
|
@ -327,7 +340,6 @@ static const char * KEY_LOCALES[] = {
|
|||
void AlphabeticIndexTest::ManyLocalesTest() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t lc = 0;
|
||||
AlphabeticIndex *index = NULL;
|
||||
|
||||
for (int i=0; ; ++i) {
|
||||
status = U_ZERO_ERROR;
|
||||
|
@ -337,23 +349,40 @@ void AlphabeticIndexTest::ManyLocalesTest() {
|
|||
}
|
||||
// std::cout << localeName << " ";
|
||||
Locale loc = Locale::createFromName(localeName);
|
||||
index = new AlphabeticIndex(loc, status);
|
||||
AlphabeticIndex index(loc, status);
|
||||
TEST_CHECK_STATUS;
|
||||
lc = index->getBucketCount(status);
|
||||
lc = index.getBucketCount(status);
|
||||
TEST_CHECK_STATUS;
|
||||
// std::cout << "getBucketCount() == " << lc << std::endl;
|
||||
|
||||
while (index->nextBucket(status)) {
|
||||
LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status));
|
||||
TEST_CHECK_STATUS;
|
||||
TEST_ASSERT(lc == immIndex->getBucketCount());
|
||||
|
||||
assertEquals("initial bucket index", -1, index.getBucketIndex());
|
||||
int32_t bucketIndex = 0;
|
||||
while (index.nextBucket(status)) {
|
||||
TEST_CHECK_STATUS;
|
||||
const UnicodeString &label = index->getBucketLabel();
|
||||
assertEquals("bucket index", bucketIndex, index.getBucketIndex());
|
||||
const UnicodeString &label = index.getBucketLabel();
|
||||
TEST_ASSERT(label.length()>0);
|
||||
// std::string ss;
|
||||
// std::cout << ":" << label.toUTF8String(ss);
|
||||
const AlphabeticIndex::Bucket *bucket = immIndex->getBucket(bucketIndex);
|
||||
TEST_ASSERT(bucket != NULL);
|
||||
assertEquals("bucket label vs. immutable: locale=" + UnicodeString(localeName) +
|
||||
" index=" + bucketIndex,
|
||||
label, bucket->getLabel());
|
||||
TEST_ASSERT(&label != &bucket->getLabel()); // not the same pointers
|
||||
UAlphabeticIndexLabelType labelType = index.getBucketLabelType();
|
||||
TEST_ASSERT(labelType == bucket->getLabelType());
|
||||
++bucketIndex;
|
||||
}
|
||||
// std::cout << ":" << std::endl;
|
||||
|
||||
|
||||
delete index;
|
||||
TEST_ASSERT(immIndex->getBucketCount() == bucketIndex);
|
||||
TEST_ASSERT(immIndex->getBucket(-1) == NULL);
|
||||
TEST_ASSERT(immIndex->getBucket(bucketIndex) == NULL);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -447,6 +476,191 @@ void AlphabeticIndexTest::TestBug9009() {
|
|||
aindex.nextBucket(status); // Crash here before bug was fixed.
|
||||
TEST_CHECK_STATUS;
|
||||
}
|
||||
|
||||
|
||||
static const char *localeAndIndexCharactersLists[][2] = {
|
||||
/* Arabic*/ {"ar", "\\u0627:\\u0628:\\u062A:\\u062B:\\u062C:\\u062D:\\u062E:\\u062F:\\u0630:\\u0631:\\u0632:\\u0633:\\u0634:\\u0635:\\u0636:\\u0637:\\u0638:\\u0639:\\u063A:\\u0641:\\u0642:\\u0643:\\u0644:\\u0645:\\u0646:\\u0647:\\u0648:\\u064A"},
|
||||
/* Bulgarian*/ {"bg", "\\u0410:\\u0411:\\u0412:\\u0413:\\u0414:\\u0415:\\u0416:\\u0417:\\u0418:\\u0419:\\u041A:\\u041B:\\u041C:\\u041D:\\u041E:\\u041F:\\u0420:\\u0421:\\u0422:\\u0423:\\u0424:\\u0425:\\u0426:\\u0427:\\u0428:\\u0429:\\u042E:\\u042F"},
|
||||
/* Catalan*/ {"ca", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
|
||||
/* Czech*/ {"cs", "A:B:C:\\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:P:Q:R:\\u0158:S:\\u0160:T:U:V:W:X:Y:Z:\\u017D"},
|
||||
/* Danish*/ {"da", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\\u00C6:\\u00D8:\\u00C5"},
|
||||
/* German*/ {"de", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:Sch:St:T:U:V:W:X:Y:Z"},
|
||||
/* Greek*/ {"el", "\\u0391:\\u0392:\\u0393:\\u0394:\\u0395:\\u0396:\\u0397:\\u0398:\\u0399:\\u039A:\\u039B:\\u039C:\\u039D:\\u039E:\\u039F:\\u03A0:\\u03A1:\\u03A3:\\u03A4:\\u03A5:\\u03A6:\\u03A7:\\u03A8:\\u03A9"},
|
||||
/* English*/ {"en", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
|
||||
/* Spanish*/ {"es", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\\u00D1:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
|
||||
/* Estonian*/ {"et", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\\u0160:Z:\\u017D:T:U:V:\\u00D5:\\u00C4:\\u00D6:\\u00DC:X:Y"},
|
||||
/* Basque*/ {"eu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
|
||||
/* Finnish*/ {"fi", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\\u00C5:\\u00C4:\\u00D6"},
|
||||
/* Filipino*/ {"fil", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
|
||||
/* French*/ {"fr", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
|
||||
/* Hebrew*/ {"he", "\\u05D0:\\u05D1:\\u05D2:\\u05D3:\\u05D4:\\u05D5:\\u05D6:\\u05D7:\\u05D8:\\u05D9:\\u05DB:\\u05DC:\\u05DE:\\u05E0:\\u05E1:\\u05E2:\\u05E4:\\u05E6:\\u05E7:\\u05E8:\\u05E9:\\u05EA"},
|
||||
/* Icelandic*/ {"is", "A:\\u00C1:B:C:D:\\u00D0:E:\\u00C9:F:G:H:I:\\u00CD:J:K:L:M:N:O:\\u00D3:P:Q:R:S:T:U:\\u00DA:V:W:X:Y:\\u00DD:Z:\\u00DE:\\u00C6:\\u00D6"},
|
||||
/* Italian*/ {"it", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
|
||||
/* Japanese*/ {"ja", "\\u3042:\\u304B:\\u3055:\\u305F:\\u306A:\\u306F:\\u307E:\\u3084:\\u3089:\\u308F"},
|
||||
/* Korean*/ {"ko", "\\u3131:\\u3134:\\u3137:\\u3139:\\u3141:\\u3142:\\u3145:\\u3147:\\u3148:\\u314A:\\u314B:\\u314C:\\u314D:\\u314E"},
|
||||
/* Lithuanian*/ {"lt", "A:B:C:\\u010C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:\\u0160:T:U:V:Z:\\u017D"},
|
||||
// This should be the correct data. Commented out until it is fixed in CLDR collation data.
|
||||
// {"lv", "A:B:C:\\u010C:D:E:F:G:\\u0122:H:I:Y:J:K:\\u0136:L:\\u013B:M:N:\\u0145:O:P:Q:R:S:\\u0160:T:U:V:W:X:Z:\\u017D"},
|
||||
/* Latvian*/ {"lv", "A:B:C:\\u010C:D:E:F:G:\\u0122:H:I:J:K:\\u0136:L:\\u013B:M:N:\\u0145:O:P:Q:R:S:\\u0160:T:U:V:W:X:Y:Z:\\u017D"},
|
||||
/* Norwegian Bokm\\u00E5l*/ {"nb", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\\u00C6:\\u00D8:\\u00C5"},
|
||||
/* Dutch*/ {"nl", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
|
||||
/* Polish*/ {"pl", "A:\\u0104:B:C:\\u0106:D:E:\\u0118:F:G:H:I:J:K:L:\\u0141:M:N:\\u0143:O:\\u00D3:P:Q:R:S:\\u015A:T:U:V:W:X:Y:Z:\\u0179:\\u017B"},
|
||||
/* Portuguese*/ {"pt", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
|
||||
/* Romanian*/ {"ro", "A:\\u0102:\\u00C2:B:C:D:E:F:G:H:I:\\u00CE:J:K:L:M:N:O:P:Q:R:S:\\u0218:T:\\u021A:U:V:W:X:Y:Z"},
|
||||
/* Russian*/ {"ru", "\\u0410:\\u0411:\\u0412:\\u0413:\\u0414:\\u0415:\\u0416:\\u0417:\\u0418:\\u0419:\\u041A:\\u041B:\\u041C:\\u041D:\\u041E:\\u041F:\\u0420:\\u0421:\\u0422:\\u0423:\\u0424:\\u0425:\\u0426:\\u0427:\\u0428:\\u0429:\\u042B:\\u042D:\\u042E:\\u042F"},
|
||||
/* Slovak*/ {"sk", "A:\\u00C4:B:C:\\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:\\u00D4:P:Q:R:S:\\u0160:T:U:V:W:X:Y:Z:\\u017D"},
|
||||
/* Slovenian*/ {"sl", "A:B:C:\\u010C:\\u0106:D:\\u0110:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\\u0160:T:U:V:W:X:Y:Z:\\u017D"},
|
||||
/* Serbian*/ {"sr", "\\u0410:\\u0411:\\u0412:\\u0413:\\u0414:\\u0402:\\u0415:\\u0416:\\u0417:\\u0418:\\u0408:\\u041A:\\u041B:\\u0409:\\u041C:\\u041D:\\u040A:\\u041E:\\u041F:\\u0420:\\u0421:\\u0422:\\u040B:\\u0423:\\u0424:\\u0425:\\u0426:\\u0427:\\u040F:\\u0428"},
|
||||
/* Swedish*/ {"sv", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\\u00C5:\\u00C4:\\u00D6"},
|
||||
/* Turkish*/ {"tr", "A:B:C:\\u00C7:D:E:F:G:H:I:\\u0130:J:K:L:M:N:O:\\u00D6:P:Q:R:S:\\u015E:T:U:\\u00DC:V:W:X:Y:Z"},
|
||||
/* Ukrainian*/ {"uk", "\\u0410:\\u0411:\\u0412:\\u0413:\\u0490:\\u0414:\\u0415:\\u0404:\\u0416:\\u0417:\\u0418:\\u0406:\\u0407:\\u0419:\\u041A:\\u041B:\\u041C:\\u041D:\\u041E:\\u041F:\\u0420:\\u0421:\\u0422:\\u0423:\\u0424:\\u0425:\\u0426:\\u0427:\\u0428:\\u0429:\\u042E:\\u042F"},
|
||||
/* Vietnamese*/ {"vi", "A:\\u0102:\\u00C2:B:C:D:\\u0110:E:\\u00CA:F:G:H:I:J:K:L:M:N:O:\\u00D4:\\u01A0:P:Q:R:S:T:U:\\u01AF:V:W:X:Y:Z"},
|
||||
/* Chinese*/ {"zh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
|
||||
/* Chinese (Traditional Han)*/ {"zh_Hant", "1\\u5283:2\\u5283:3\\u5283:4\\u5283:5\\u5283:6\\u5283:7\\u5283:8\\u5283:9\\u5283:10\\u5283:11\\u5283:12\\u5283:13\\u5283:14\\u5283:15\\u5283:16\\u5283:17\\u5283:18\\u5283:19\\u5283:20\\u5283:21\\u5283:22\\u5283:23\\u5283:24\\u5283:25\\u5283:26\\u5283:27\\u5283:28\\u5283:29\\u5283:30\\u5283:31\\u5283:32\\u5283:33\\u5283:35\\u5283:36\\u5283:39\\u5283:48\\u5283"},
|
||||
};
|
||||
|
||||
void AlphabeticIndexTest::TestIndexCharactersList() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
for (int32_t i = 0; i < LENGTHOF(localeAndIndexCharactersLists); ++i) {
|
||||
const char *(&localeAndIndexCharacters)[2] = localeAndIndexCharactersLists[i];
|
||||
const char *locale = localeAndIndexCharacters[0];
|
||||
UnicodeString expectedIndexCharacters
|
||||
= (UnicodeString("\\u2026:") + localeAndIndexCharacters[1] + ":\\u2026").unescape();
|
||||
AlphabeticIndex index(locale, status);
|
||||
TEST_CHECK_STATUS;
|
||||
LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status));
|
||||
TEST_CHECK_STATUS;
|
||||
|
||||
// Join the elements of the list to a string with delimiter ":"
|
||||
UnicodeString actualIndexCharacters;
|
||||
assertEquals(locale,
|
||||
expectedIndexCharacters,
|
||||
joinLabelsAndAppend(*immIndex, actualIndexCharacters));
|
||||
logln(locale + UnicodeString(": ") + actualIndexCharacters);
|
||||
}
|
||||
}
|
||||
|
||||
void AlphabeticIndexTest::TestHaniFirst() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
LocalPointer<RuleBasedCollator> coll(
|
||||
static_cast<RuleBasedCollator *>(Collator::createInstance(Locale::getRoot(), status)));
|
||||
int32_t reorderCodes[] = { USCRIPT_HAN };
|
||||
coll->setReorderCodes(reorderCodes, LENGTHOF(reorderCodes), status);
|
||||
TEST_CHECK_STATUS;
|
||||
AlphabeticIndex index(coll.orphan(), status);
|
||||
TEST_CHECK_STATUS;
|
||||
assertEquals("getBucketCount()", 1, index.getBucketCount(status)); // ... (underflow only)
|
||||
index.addLabels(Locale::getEnglish(), status);
|
||||
assertEquals("getBucketCount()", 28, index.getBucketCount(status)); // ... A-Z ...
|
||||
int32_t bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x897f), status);
|
||||
assertEquals("getBucketIndex(U+897F)", 0, bucketIndex); // underflow bucket
|
||||
bucketIndex = index.getBucketIndex("i", status);
|
||||
assertEquals("getBucketIndex(i)", 9, bucketIndex);
|
||||
bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x03B1), status);
|
||||
assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex);
|
||||
// TODO: Test with an unassigned code point (not just U+FFFF)
|
||||
// when unassigned code points are not in the Hani reordering group any more.
|
||||
// String unassigned = UTF16.valueOf(0x50005);
|
||||
bucketIndex = index.getBucketIndex(UnicodeString((UChar)0xFFFF), status);
|
||||
assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex);
|
||||
}
|
||||
|
||||
void AlphabeticIndexTest::TestPinyinFirst() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
LocalPointer<RuleBasedCollator> coll(
|
||||
static_cast<RuleBasedCollator *>(Collator::createInstance(Locale::getChinese(), status)));
|
||||
int32_t reorderCodes[] = { USCRIPT_HAN };
|
||||
coll->setReorderCodes(reorderCodes, LENGTHOF(reorderCodes), status);
|
||||
TEST_CHECK_STATUS;
|
||||
AlphabeticIndex index(coll.orphan(), status);
|
||||
TEST_CHECK_STATUS;
|
||||
assertEquals("getBucketCount()", 1, index.getBucketCount(status)); // ... (underflow only)
|
||||
index.addLabels(Locale::getChinese(), status);
|
||||
assertEquals("getBucketCount()", 28, index.getBucketCount(status)); // ... A-Z ...
|
||||
int bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x897f), status);
|
||||
assertEquals("getBucketIndex(U+897F)", 'X' - 'A' + 1, bucketIndex);
|
||||
bucketIndex = index.getBucketIndex("i", status);
|
||||
assertEquals("getBucketIndex(i)", 9, bucketIndex);
|
||||
bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x03B1), status);
|
||||
assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex);
|
||||
// TODO: Test with an unassigned code point (not just U+FFFF)
|
||||
// when unassigned code points are not in the Hani reordering group any more.
|
||||
// String unassigned = UTF16.valueOf(0x50005);
|
||||
bucketIndex = index.getBucketIndex(UnicodeString((UChar)0xFFFF), status);
|
||||
assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex);
|
||||
}
|
||||
|
||||
void AlphabeticIndexTest::TestSchSt() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
AlphabeticIndex index(Locale::getGerman(), status);
|
||||
index.addLabels(UnicodeSet("[\\u00C6{Sch*}{St*}]", status), status);
|
||||
TEST_CHECK_STATUS;
|
||||
// ... A AE-ligature B-R S Sch St T-Z ...
|
||||
LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status));
|
||||
TEST_CHECK_STATUS;
|
||||
assertEquals("getBucketCount()", 31, index.getBucketCount(status));
|
||||
assertEquals("immutable getBucketCount()", 31, immIndex->getBucketCount());
|
||||
static const struct TestCase {
|
||||
const char *name;
|
||||
int32_t bucketIndex;
|
||||
const char *bucketLabel;
|
||||
} testCases[] = {
|
||||
// name, bucket index, bucket label
|
||||
{ "Adelbert", 1, "A" },
|
||||
{ "Afrika", 1, "A" },
|
||||
{ "\\u00C6sculap", 2, "\\u00C6" },
|
||||
{ "Aesthet", 2, "\\u00C6" },
|
||||
{ "Berlin", 3, "B" },
|
||||
{ "Rilke", 19, "R" },
|
||||
{ "Sacher", 20, "S" },
|
||||
{ "Seiler", 20, "S" },
|
||||
{ "Sultan", 20, "S" },
|
||||
{ "Schiller", 21, "Sch" },
|
||||
{ "Steiff", 22, "St" },
|
||||
{ "Thomas", 23, "T" }
|
||||
};
|
||||
for (int32_t i = 0; i < LENGTHOF(testCases); ++i) {
|
||||
const TestCase &testCase = testCases[i];
|
||||
UnicodeString name = UnicodeString(testCase.name).unescape();
|
||||
UnicodeString label = UnicodeString(testCase.bucketLabel).unescape();
|
||||
char msg[100];
|
||||
sprintf(msg, "getBucketIndex(%s)", testCase.name);
|
||||
assertEquals(msg, testCase.bucketIndex, index.getBucketIndex(name, status));
|
||||
sprintf(msg, "immutable getBucketIndex(%s)", testCase.name);
|
||||
assertEquals(msg, testCase.bucketIndex, immIndex->getBucketIndex(name, status));
|
||||
sprintf(msg, "immutable bucket label (%s)", testCase.name);
|
||||
assertEquals(msg, label, immIndex->getBucket(testCase.bucketIndex)->getLabel());
|
||||
}
|
||||
}
|
||||
|
||||
void AlphabeticIndexTest::TestNoLabels() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
LocalPointer<RuleBasedCollator> coll(
|
||||
static_cast<RuleBasedCollator *>(Collator::createInstance(Locale::getRoot(), status)));
|
||||
TEST_CHECK_STATUS;
|
||||
AlphabeticIndex index(coll.orphan(), status);
|
||||
TEST_CHECK_STATUS;
|
||||
index.addRecord(UnicodeString((UChar)0x897f), NULL, status);
|
||||
index.addRecord("i", NULL, status);
|
||||
index.addRecord(UnicodeString((UChar)0x03B1), NULL, status);
|
||||
assertEquals("getBucketCount()", 1, index.getBucketCount(status)); // ...
|
||||
TEST_ASSERT(index.nextBucket(status));
|
||||
assertEquals("underflow label type", U_ALPHAINDEX_UNDERFLOW, index.getBucketLabelType());
|
||||
assertEquals("all records in the underflow bucket", 3, index.getBucketRecordCount());
|
||||
}
|
||||
|
||||
void AlphabeticIndexTest::TestChineseZhuyin() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
char loc[100];
|
||||
uloc_forLanguageTag("zh-u-co-zhuyin", loc, LENGTHOF(loc), NULL, &status);
|
||||
AlphabeticIndex index(loc, status);
|
||||
LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status));
|
||||
TEST_CHECK_STATUS;
|
||||
assertEquals("getBucketCount()", 38, immIndex->getBucketCount());
|
||||
assertEquals("label 1", UnicodeString((UChar)0x3105), immIndex->getBucket(1)->getLabel());
|
||||
assertEquals("label 2", UnicodeString((UChar)0x3106), immIndex->getBucket(2)->getLabel());
|
||||
assertEquals("label 3", UnicodeString((UChar)0x3107), immIndex->getBucket(3)->getLabel());
|
||||
assertEquals("label 4", UnicodeString((UChar)0x3108), immIndex->getBucket(4)->getLabel());
|
||||
assertEquals("label 5", UnicodeString((UChar)0x3109), immIndex->getBucket(5)->getLabel());
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2012, International Business Machines Corporation and
|
||||
* Copyright (c) 2012-2013, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
//
|
||||
|
@ -24,6 +24,27 @@ public:
|
|||
virtual void ManyLocalesTest();
|
||||
virtual void HackPinyinTest();
|
||||
virtual void TestBug9009();
|
||||
void TestIndexCharactersList();
|
||||
/**
|
||||
* Test AlphabeticIndex vs. root with script reordering.
|
||||
*/
|
||||
void TestHaniFirst();
|
||||
/**
|
||||
* Test AlphabeticIndex vs. Pinyin with script reordering.
|
||||
*/
|
||||
void TestPinyinFirst();
|
||||
/**
|
||||
* Test labels with multiple primary weights.
|
||||
*/
|
||||
void TestSchSt();
|
||||
/**
|
||||
* With no real labels, there should be only the underflow label.
|
||||
*/
|
||||
void TestNoLabels();
|
||||
/**
|
||||
* Test with the Bopomofo-phonetic tailoring.
|
||||
*/
|
||||
void TestChineseZhuyin();
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -233,6 +233,14 @@ IntlTest::appendHex(uint32_t number,
|
|||
0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0
|
||||
}; /* "0123456789ABCDEF" */
|
||||
|
||||
if (digits < 0) { // auto-digits
|
||||
digits = 2;
|
||||
uint32_t max = 0xff;
|
||||
while (number > max) {
|
||||
digits += 2;
|
||||
max = (max << 8) | 0xff;
|
||||
}
|
||||
}
|
||||
switch (digits)
|
||||
{
|
||||
case 8:
|
||||
|
@ -258,6 +266,13 @@ IntlTest::appendHex(uint32_t number,
|
|||
return target;
|
||||
}
|
||||
|
||||
/**
 * Returns the hexadecimal representation of a number as a new string.
 * Thin by-value wrapper around appendHex(), which receives the same
 * number and digits arguments.
 */
UnicodeString
IntlTest::toHex(uint32_t number, int32_t digits) {
    UnicodeString hex;
    // appendHex() returns a reference to its target; returning it copies
    // the filled-in string out to the caller.
    return appendHex(number, digits, hex);
}
|
||||
|
||||
/**
 * Returns whether c is at most 0x7E and is either at least 0x20 or one of
 * tab (9), line feed (0xA) or carriage return (0xD).
 */
static inline UBool isPrintable(UChar32 c) {
    if (c > 0x7E) {
        return FALSE;
    }
    return c >= 0x20 || c == 9 || c == 0xA || c == 0xD;
}
|
||||
|
@ -1728,6 +1743,23 @@ UBool IntlTest::assertEquals(const char* message,
|
|||
return TRUE;
|
||||
}
|
||||
|
||||
/**
 * Asserts that two 32-bit integers are equal.
 * On a mismatch, reports both values in decimal and hexadecimal via errln().
 * When VERBOSE_ASSERTIONS is defined, a match is logged via logln().
 *
 * @param message  text identifying the check in the output
 * @param expected the expected value
 * @param actual   the value under test
 * @return TRUE if the values are equal, FALSE otherwise
 */
UBool IntlTest::assertEquals(const char* message,
                             int32_t expected,
                             int32_t actual) {
    if (expected == actual) {
#ifdef VERBOSE_ASSERTIONS
        logln((UnicodeString)"Ok: " + message + "; got " + actual + "=0x" + toHex(actual));
#endif
        return TRUE;
    }
    errln((UnicodeString)"FAIL: " + message + "; got " +
          actual + "=0x" + toHex(actual) +
          "; expected " + expected + "=0x" + toHex(expected));
    return FALSE;
}
|
||||
|
||||
#if !UCONFIG_NO_FORMATTING
|
||||
UBool IntlTest::assertEquals(const char* message,
|
||||
const Formattable& expected,
|
||||
|
|
|
@ -241,6 +241,7 @@ protected:
|
|||
const UnicodeString& actual, UBool possibleDataError=FALSE);
|
||||
UBool assertEquals(const char* message, const char* expected,
|
||||
const char* actual);
|
||||
UBool assertEquals(const char* message, int32_t expected, int32_t actual);
|
||||
#if !UCONFIG_NO_FORMATTING
|
||||
UBool assertEquals(const char* message, const Formattable& expected,
|
||||
const Formattable& actual);
|
||||
|
@ -299,7 +300,12 @@ protected:
|
|||
|
||||
static UnicodeString &prettify(const UnicodeString &source, UnicodeString &target);
|
||||
static UnicodeString prettify(const UnicodeString &source, UBool parseBackslash=FALSE);
|
||||
// digits=-1 determines the number of digits automatically
|
||||
static UnicodeString &appendHex(uint32_t number, int32_t digits, UnicodeString &target);
|
||||
static UnicodeString toHex(uint32_t number, int32_t digits=-1);
|
||||
// Signed convenience overload: converts the number to uint32_t
// and forwards to the uint32_t version.
static inline UnicodeString toHex(int32_t number, int32_t digits=-1) {
    return toHex(static_cast<uint32_t>(number), digits);
}
|
||||
|
||||
public:
|
||||
static void setICU_DATA(); // Set up ICU_DATA if necessary.
|
||||
|
|
Loading…
Add table
Reference in a new issue