ICU-9880 add ImmutableIndex, replace Chinese hacks and support zhuyin, handle index characters with multiple primary weights, lazy-create rarely-used objects, bug fixes, port other improvements from Java

X-SVN-Rev: 33245
This commit is contained in:
Markus Scherer 2013-02-15 22:11:33 +00:00
parent 407be346b7
commit f5cd9984c6
9 changed files with 1396 additions and 1126 deletions

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
* Copyright (C) 1999-2011, International Business Machines Corporation and *
* others. All Rights Reserved. *
* Copyright (C) 1999-2013, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************
* Date Name Description
* 10/22/99 alan Creation.
@ -552,12 +552,12 @@ void UVector::sort(UElementComparator *compare, UErrorCode &ec) {
/**
* Sort with a user supplied comparator of type UComparator.
* Stable sort with a user supplied comparator of type UComparator.
*/
void UVector::sortWithUComparator(UComparator *compare, const void *context, UErrorCode &ec) {
if (U_SUCCESS(ec)) {
uprv_sortArray(elements, count, sizeof(UElement),
compare, context, FALSE, &ec);
compare, context, TRUE, &ec);
}
}

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2011, International Business Machines
* Copyright (C) 1999-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -243,9 +243,9 @@ public:
void sort(UElementComparator *compare, UErrorCode &ec);
/**
* Sort the contents of this vector using a caller-supplied function
* Stable sort the contents of this vector using a caller-supplied function
* of type UComparator to do the comparison. Provides more flexibility
* than uvector::sort() because an additional user-parameter can be passed to
* than UVector::sort() because an additional user parameter can be passed to
* the comparison function.
*/
void sortWithUComparator(UComparator *compare, const void *context, UErrorCode &ec);

File diff suppressed because it is too large Load diff

View file

@ -51,7 +51,6 @@ typedef enum ECleanupI18NType {
UCLN_I18N_UCOL_RES,
UCLN_I18N_UCOL_BLD,
UCLN_I18N_CSDET,
UCLN_I18N_INDEX_CHARACTERS,
UCLN_I18N_GENDERINFO,
UCLN_I18N_CDFINFO,
UCLN_I18N_REGION,

View file

@ -71,19 +71,25 @@ U_NAMESPACE_BEGIN
// Forward Declarations
namespace {
class BucketList;
} // namespace;
class Collator;
class RuleBasedCollator;
class StringEnumeration;
class UnicodeSet;
class UVector;
/**
* class AlphabeticIndex supports the creation of a UI index appropriate for a given language, such as:
*
* AlphabeticIndex supports the creation of a UI index appropriate for a given language.
* It can support either direct use, or use with a client that doesn't support localized collation.
* The following is an example of what an index might look like in a UI:
*
* <pre>
* <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z \\u00C6 \\u00D8 \\u00C5 ...</b>
* <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ...</b>
*
* <b>A</b>
* Addison
@ -107,10 +113,14 @@ class UVector;
* into an inflow bucket between the other two scripts.
* <p>
* The AlphabeticIndex class is not intended for public subclassing.
* <p>
* <i>Example</i>
* <p>
* The "show..." methods below are just to illustrate usage.
*
* <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters
* as well as characters from the user's language,
* then it is a good idea to call addLabels(Locale::getEnglish(), status).</p>
*
* <h2>Direct Use</h2>
* <p>The following shows an example of building an index directly.
* The "show..." methods below are just to illustrate usage.
*
* <pre>
* // Create a simple index. "Item" is assumed to be an application
@ -150,21 +160,138 @@ class UVector;
* <b>... A-F G-N O-Z ...</b>
* </pre>
*
* <p>
* <b>Notes:</b>
* <h2>Client Support</h2>
* <p>Callers can also use the AlphabeticIndex::ImmutableIndex, or the AlphabeticIndex itself,
* to support sorting on a client that doesn't support AlphabeticIndex functionality.
*
* <p>The ImmutableIndex is both immutable and thread-safe.
* The corresponding AlphabeticIndex methods are not thread-safe because
* they "lazily" build the index buckets.
* <ul>
* <li>Additional collation parameters can be passed in as part of the locale name.
* For example, German plus numeric
* sorting would be "de@kn-true".
* <li>ImmutableIndex.getBucket(index) provides random access to all
* buckets and their labels and label types.
* <li>The AlphabeticIndex bucket iterator or ImmutableIndex.getBucket(0..getBucketCount-1)
* can be used to get a list of the labels,
* such as "...", "A", "B",..., and send that list to the client.
* <li>When the client has a new name, it sends that name to the server.
* The server needs to call the following methods,
* and communicate the bucketIndex and collationKey back to the client.
*
* <pre>
* int32_t bucketIndex = index.getBucketIndex(name, status);
* const UnicodeString &label = immutableIndex.getBucket(bucketIndex)->getLabel(); // optional
* int32_t skLength = collator.getSortKey(name, sk, skCapacity);
* </pre>
*
* <li>The client would put the name (and associated information) into its bucket for bucketIndex. The sort key sk is a
* sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
* </ul>
*
* @stable ICU 4.8
*/
class U_I18N_API AlphabeticIndex: public UObject {
public:
/**
* An index "bucket" with a label string and type.
* It is referenced by getBucketIndex(),
* and returned by ImmutableIndex.getBucket().
*
* The Bucket class is not intended for public subclassing.
* @draft ICU 51
*/
class U_I18N_API Bucket : public UObject {
public:
/**
* Destructor.
* @draft ICU 51
*/
virtual ~Bucket();
public:
/**
* Returns the label string.
*
* @return the label string for the bucket
* @draft ICU 51
*/
const UnicodeString &getLabel() const { return label_; }
/**
* Returns whether this bucket is a normal, underflow, overflow, or inflow bucket.
*
* @return the bucket label type
* @draft ICU 51
*/
UAlphabeticIndexLabelType getLabelType() const { return labelType_; }
private:
// The constructor and all fields are private, so only these friend classes
// can create Bucket objects or access their internals.
friend class AlphabeticIndex;
friend class BucketList;
// The label string returned by getLabel().
UnicodeString label_;
// NOTE(review): presumably the smallest string that sorts into this bucket -- confirm in the implementation.
UnicodeString lowerBoundary_;
// The label type returned by getLabelType().
UAlphabeticIndexLabelType labelType_;
// NOTE(review): displayBucket_ and displayIndex_ are not used in this header;
// presumably they map this bucket to the bucket/position that is shown to the user -- confirm.
Bucket *displayBucket_;
int32_t displayIndex_;
UVector *records_; // Records are owned by the inputList_ vector.
// Private constructor: callable only by the friend classes declared above.
Bucket(const UnicodeString &label, // Parameter strings are copied.
const UnicodeString &lowerBoundary,
UAlphabeticIndexLabelType type);
};
/**
* Immutable, thread-safe version of AlphabeticIndex.
* This class provides thread-safe methods for bucketing,
* and random access to buckets and their properties,
* but does not offer adding records to the index.
*
* The ImmutableIndex class is not intended for public subclassing.
*
* @draft ICU 51
*/
class U_I18N_API ImmutableIndex : public UObject {
public:
/**
* Destructor.
* @draft ICU 51
*/
virtual ~ImmutableIndex();
/**
* Returns the number of index buckets and labels, including underflow/inflow/overflow.
*
* @return the number of index buckets
* @draft ICU 51
*/
int32_t getBucketCount() const;
/**
* Finds the index bucket for the given name and returns the number of that bucket.
* Use getBucket() to get the bucket's properties.
*
* @param name the string to be sorted into an index bucket
* @param errorCode ICU error code (in/out).
*                  NOTE(review): presumably set if the lookup fails, per the usual
*                  UErrorCode convention -- confirm in the implementation.
* @return the bucket number for the name
* @draft ICU 51
*/
int32_t getBucketIndex(const UnicodeString &name, UErrorCode &errorCode) const;
/**
* Returns the index-th bucket. Returns NULL if the index is out of range.
*
* @param index bucket number
* @return the index-th bucket
* @draft ICU 51
*/
const Bucket *getBucket(int32_t index) const;
private:
// The constructor is private; AlphabeticIndex is the only friend and therefore
// the only class that can create instances.
friend class AlphabeticIndex;
// Stores both pointers as given, without copying the pointed-to objects.
// NOTE(review): whether the destructor deletes them (i.e. ownership is adopted)
// is not visible in this header -- confirm in the implementation.
ImmutableIndex(BucketList *bucketList, Collator *collatorPrimaryOnly)
: buckets_(bucketList), collatorPrimaryOnly_(collatorPrimaryOnly) {}
BucketList *buckets_;
Collator *collatorPrimaryOnly_;
};
/**
* Construct an AlphabeticIndex object for the specified locale. If the locale's
@ -230,6 +357,14 @@ class U_I18N_API AlphabeticIndex: public UObject {
virtual ~AlphabeticIndex();
/**
* Builds an immutable, thread-safe version of this instance, without data records.
*
* @return an immutable index instance
* @draft ICU 51
*/
ImmutableIndex *buildImmutableIndex(UErrorCode &errorCode);
/**
* Get the Collator that establishes the ordering of the items in this index.
* Ownership of the collator remains with the AlphabeticIndex instance.
@ -269,7 +404,6 @@ class U_I18N_API AlphabeticIndex: public UObject {
virtual AlphabeticIndex &setInflowLabel(const UnicodeString &inflowLabel, UErrorCode &status);
/**
* Get the special label used for items that sort after the last normal label,
* and that would not otherwise have an appropriate label.
@ -336,22 +470,6 @@ class U_I18N_API AlphabeticIndex: public UObject {
virtual AlphabeticIndex &setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status);
/**
* Get the Unicode character (or tailored string) that defines an overflow bucket;
* that is anything greater than or equal to that string should go in that bucket,
* instead of with the last character. Normally that is the first character of the script
* after lowerLimit. Thus in X Y Z ... <i>Devanagari-ka</i>, the overflow character for Z
* would be the <i>Greek-alpha</i>.
*
* @param lowerLimit The character below the overflow (or inflow) bucket
* @param status error code
* @return string that defines top of the overflow buck for lowerLimit, or an empty string if there is none
* @internal
*/
virtual const UnicodeString &getOverflowComparisonString(const UnicodeString &lowerLimit,
UErrorCode &status);
/**
* Add a record to the index. Each record will be associated with an index Bucket
* based on the record's name. The list of records for each bucket will be sorted
@ -549,187 +667,90 @@ private:
virtual UBool operator!=(const AlphabeticIndex& other) const;
// Common initialization, for use from all constructors.
void init(UErrorCode &status);
void init(const Locale *locale, UErrorCode &status);
// Initialize & destruct static constants used by this class.
static void staticInit(UErrorCode &status);
/**
* This method is called to get the index exemplars. Normally these come from the locale directly,
* but if they aren't available, we have to synthesize them.
*/
void addIndexExemplars(const Locale &locale, UErrorCode &status);
/**
* Add Chinese index characters from the tailoring.
*/
UBool addChineseIndexCharacters(UErrorCode &errorCode);
// Pinyin stuff. If the input name is Chinese, add the Pinyin prefix to the dest string.
void hackName(UnicodeString &dest, const UnicodeString &name, const Collator *coll);
void initPinyinBounds(const Collator *coll, UErrorCode &status);
UVector *firstStringsInScript(UErrorCode &status);
public:
#ifndef U_HIDE_INTERNAL_API
/**
* Delete all shared (static) data associated with an AlphabeticIndex.
* Internal function, not intended for direct use.
* @internal.
*/
static void staticCleanup();
#endif /* U_HIDE_INTERNAL_API */
private:
static UnicodeString separated(const UnicodeString &item);
// Add index characters from the specified locale to the dest set.
// Does not remove any previous contents from dest.
static void getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status);
/**
* Determine the best labels to use.
* This is based on the exemplars, but we also process to make sure that they are unique,
* and sort differently, and that the overall list is small enough.
*/
void initLabels(UVector &indexCharacters, UErrorCode &errorCode) const;
BucketList *createBucketList(UErrorCode &errorCode) const;
void initBuckets(UErrorCode &errorCode);
void clearBuckets();
void internalResetBucketIterator();
UVector *firstStringsInScript(UErrorCode &status);
public:
static UnicodeString separated(const UnicodeString &item);
static UnicodeSet *getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status);
void buildIndex(UErrorCode &status);
void buildBucketList(UErrorCode &status);
void bucketRecords(UErrorCode &status);
public:
// The following internal items are declared public only to allow access from
// implementation code written in plain C. They are not intended for
// public use.
// The Record is declared public only to allow access from
// implementation code written in plain C.
// It is not intended for public use.
#ifndef U_HIDE_INTERNAL_API
/**
* A record, or item, in the index.
* A (name, data) pair, to be sorted by name into one of the index buckets.
* The user data is not used by the index implementation.
* @internal
*/
struct Record: public UMemory {
AlphabeticIndex *alphaIndex_;
const UnicodeString name_;
UnicodeString sortingName_; // Usually the same as name_; different for Pinyin.
const void *data_;
int32_t serialNumber_; // Defines sorting order for names that compare equal.
Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data);
~Record();
};
struct Record: public UMemory {
const UnicodeString name_;
const void *data_;
Record(const UnicodeString &name, const void *data);
~Record();
};
#endif /* U_HIDE_INTERNAL_API */
/**
* Holds all user records before they are distributed into buckets.
* Type of contents is (Record *)
* @internal
*/
UVector *inputRecords_;
/**
* A Bucket holds an index label and references to everything belonging to that label.
* For implementation use only. Declared public because pure C implementation code needs access.
* @internal
*/
struct Bucket: public UMemory {
UnicodeString label_;
UnicodeString lowerBoundary_;
UAlphabeticIndexLabelType labelType_;
UVector *records_; // Records are owned by inputRecords_ vector.
Bucket(const UnicodeString &label, // Parameter strings are copied.
const UnicodeString &lowerBoundary,
UAlphabeticIndexLabelType type, UErrorCode &status);
~Bucket();
};
public:
/**
* Language Types. For internal ICU use only.
* @internal (but not hidden with U_HIDE_INTERNAL_API because it is used in public API)
*/
enum ELangType {
/** @internal */
kNormal,
/** @internal */
kSimplified,
/** @internal */
kTraditional
};
private:
/**
* Get the Language Type for this Index. Based on the locale.
* @internal
*/
static ELangType langTypeFromLocale(const Locale &loc);
* Holds all user records before they are distributed into buckets.
* Type of contents is (Record *)
* @internal
*/
UVector *inputList_;
int32_t labelsIterIndex_; // Index of next item to return.
int32_t itemsIterIndex_;
Bucket *currentBucket_; // While an iteration of the index in underway,
// point to the bucket for the current label.
// NULL when no iteration underway.
private:
int32_t maxLabelCount_; // Limit on # of labels permitted in the index.
// Holds the contents of this index, buckets of user items.
// UVector elements are of type (Bucket *)
UVector *bucketList_;
UnicodeSet *initialLabels_; // Initial (unprocessed) set of Labels. Union
// of those explicitly set by the user plus
// those from locales. Raw values, before
// crunching into bucket labels.
int32_t labelsIterIndex_; // Index of next item to return.
int32_t itemsIterIndex_;
Bucket *currentBucket_; // While an iteration of the index in underway,
// point to the bucket for the current label.
// NULL when no iteration underway.
UVector *firstCharsInScripts_; // The first character from each script,
// in collation order.
UBool indexBuildRequired_; // Caller has made changes to the index that
// require rebuilding & bucketing before the
// contents can be iterated.
RuleBasedCollator *collator_;
RuleBasedCollator *collatorPrimaryOnly_;
int32_t maxLabelCount_; // Limit on # of labels permitted in the index.
// Lazy evaluated: null means that we have not built yet.
BucketList *buckets_;
UHashtable *alreadyIn_; // Key=UnicodeString, value=UnicodeSet
UnicodeSet *initialLabels_; // Initial (unprocessed) set of Labels. Union
// of those explicitly set by the user plus
// those from locales. Raw values, before
// crunching into bucket labels.
UVector *labels_; // List of Labels, after processing, sorting.
// Contents are (UnicodeString *)
UnicodeSet *noDistinctSorting_; // As the set of labels is built, strings may
// be discarded from the exemplars. This contains
// some of the discards, and is
// intended for debugging.
UnicodeSet *notAlphabetic_; // As the set of labels is built, strings may
// be discarded from the exemplars. This contains
// some of the discards, and is
// intended for debugging.
UVector *firstScriptCharacters_; // The first character from each script,
// in collation order.
Locale locale_;
Collator *collator_;
Collator *collatorPrimaryOnly_;
UnicodeString inflowLabel_;
UnicodeString overflowLabel_;
UnicodeString underflowLabel_;
UnicodeString overflowComparisonString_;
ELangType langType_; // The language type, simplified Chinese, Traditional Chinese,
// or not Chinese (Normal). Part of the Pinyin support
typedef const UChar PinyinLookup[24][3];
static PinyinLookup HACK_PINYIN_LOOKUP_SHORT;
static PinyinLookup HACK_PINYIN_LOOKUP_LONG;
// These will be lazily set to the short or long tables based on which
// Chinese collation has been configured into the ICU library.
static PinyinLookup *HACK_PINYIN_LOOKUP;
static const UChar *PINYIN_LOWER_BOUNDS;
int32_t recordCounter_; // Counts Records created. For minting record serial numbers.
// Constants. Lazily initialized the first time an AlphabeticIndex object is created.
static UnicodeSet *ALPHABETIC;
static UnicodeSet *CORE_LATIN;
static UnicodeSet *ETHIOPIC;
static UnicodeSet *HANGUL;
static UnicodeSet *IGNORE_SCRIPTS;
static UnicodeSet *TO_TRY;
static UnicodeSet *UNIHAN;
static const UnicodeString *EMPTY_STRING;
UnicodeString inflowLabel_;
UnicodeString overflowLabel_;
UnicodeString underflowLabel_;
UnicodeString overflowComparisonString_;
UnicodeString emptyString_;
};
U_NAMESPACE_END

View file

@ -12,6 +12,7 @@
#include "unicode/alphaindex.h"
#include "unicode/coll.h"
#include "unicode/localpointer.h"
#include "unicode/tblcoll.h"
#include "unicode/uniset.h"
@ -20,6 +21,24 @@
// #include <string>
// #include <iostream>
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
namespace {
// Appends the label of every bucket in index to dest, in bucket order,
// walking bucket numbers upward from 0 until getBucket() returns NULL.
// A ':' is written before a label only if this call has already made dest
// longer than it was on entry.
// Returns a copy of dest after appending.
UnicodeString joinLabelsAndAppend(AlphabeticIndex::ImmutableIndex &index, UnicodeString &dest) {
    const int32_t startLength = dest.length();
    int32_t bucketNumber = 0;
    const AlphabeticIndex::Bucket *current = index.getBucket(bucketNumber);
    while (current != NULL) {
        if (dest.length() > startLength) {
            dest.append((UChar)0x3A);  // ':'
        }
        dest.append(current->getLabel());
        current = index.getBucket(++bucketNumber);
    }
    return dest;
}
} // namespace
AlphabeticIndexTest::AlphabeticIndexTest() {
}
@ -29,27 +48,18 @@ AlphabeticIndexTest::~AlphabeticIndexTest() {
void AlphabeticIndexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
if (exec) logln("TestSuite AlphabeticIndex: ");
switch (index) {
case 0: name = "APITest";
if (exec) APITest();
break;
case 1: name = "ManyLocales";
if (exec) ManyLocalesTest();
break;
case 2: name = "HackPinyinTest";
if (exec) HackPinyinTest();
break;
case 3: name = "TestBug9009";
if (exec) TestBug9009();
break;
default: name = "";
break; //needed to end loop
}
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(APITest);
TESTCASE_AUTO(ManyLocalesTest);
TESTCASE_AUTO(HackPinyinTest);
TESTCASE_AUTO(TestBug9009);
TESTCASE_AUTO(TestIndexCharactersList);
TESTCASE_AUTO(TestHaniFirst);
TESTCASE_AUTO(TestPinyinFirst);
TESTCASE_AUTO(TestSchSt);
TESTCASE_AUTO(TestNoLabels);
TESTCASE_AUTO(TestChineseZhuyin);
TESTCASE_AUTO_END;
}
#define TEST_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: Test failure. status=%s", \
@ -85,10 +95,8 @@ void AlphabeticIndexTest::APITest() {
index = new AlphabeticIndex(coll, status);
TEST_CHECK_STATUS;
TEST_ASSERT(coll == &index->getCollator());
// TODO: The bucket count for an index built from a collator should be one, the underflow label.
// The current implementation adds A-Z if the index is otherwise empty.
// TEST_ASSERT(1 == index->getBucketCount(status));
TEST_ASSERT(28 == index->getBucketCount(status));
assertEquals("only the underflow label in an index built from a collator",
1, index->getBucketCount(status));
TEST_CHECK_STATUS;
delete index;
@ -104,12 +112,8 @@ void AlphabeticIndexTest::APITest() {
TEST_CHECK_STATUS;
lc = index->getBucketCount(status);
TEST_CHECK_STATUS;
// TODO: should get 31. Java also gives 30. Needs fixing
TEST_ASSERT(30 == lc); // 26 Latin letters plus
// TEST_ASSERT(31 == lc); // 26 Latin letters plus
// 2 Cyrillic letters plus
// 1 inflow label plus
// two under/overflow labels.
assertEquals("underflow, A-Z, inflow, 2 Cyrillic, overflow",
31, index->getBucketCount(status));
// std::cout << lc << std::endl;
delete index;
@ -298,17 +302,26 @@ void AlphabeticIndexTest::APITest() {
delete index;
index = new AlphabeticIndex(Locale::createFromName("ru"), status);
//Locale loc = Locale::createFromName(localeName);
TEST_CHECK_STATUS;
assertEquals("Russian index.getBucketCount()", 32, index->getBucketCount(status));
// Latin-script names should go into the underflow label (0)
// if the Russian collation does not use script reordering,
// but into the overflow label (getBucketCount()-1)
// if Russian sorts Cyrillic first.
int32_t reorderCodes[20];
int32_t expectedLatinIndex = 0;
if (index->getCollator().getReorderCodes(reorderCodes, LENGTHOF(reorderCodes), status) > 0) {
expectedLatinIndex = index->getBucketCount(status) - 1;
}
n = index->getBucketIndex(adam, status);
TEST_CHECK_STATUS;
TEST_ASSERT(n == 32); // Now Latin is in overflow label for Russian collation
assertEquals("Russian index.getBucketIndex(adam)", expectedLatinIndex, n);
n = index->getBucketIndex(baker, status);
TEST_ASSERT(n == 32);
assertEquals("Russian index.getBucketIndex(baker)", expectedLatinIndex, n);
n = index->getBucketIndex(Cyrillic, status);
TEST_ASSERT(n == 1); // First label
assertEquals("Russian index.getBucketIndex(Cyrillic)", 1, n);
n = index->getBucketIndex(zed, status);
TEST_ASSERT(n == 32);
assertEquals("Russian index.getBucketIndex(zed)", expectedLatinIndex, n);
delete index;
@ -327,7 +340,6 @@ static const char * KEY_LOCALES[] = {
void AlphabeticIndexTest::ManyLocalesTest() {
UErrorCode status = U_ZERO_ERROR;
int32_t lc = 0;
AlphabeticIndex *index = NULL;
for (int i=0; ; ++i) {
status = U_ZERO_ERROR;
@ -337,23 +349,40 @@ void AlphabeticIndexTest::ManyLocalesTest() {
}
// std::cout << localeName << " ";
Locale loc = Locale::createFromName(localeName);
index = new AlphabeticIndex(loc, status);
AlphabeticIndex index(loc, status);
TEST_CHECK_STATUS;
lc = index->getBucketCount(status);
lc = index.getBucketCount(status);
TEST_CHECK_STATUS;
// std::cout << "getBucketCount() == " << lc << std::endl;
while (index->nextBucket(status)) {
LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status));
TEST_CHECK_STATUS;
TEST_ASSERT(lc == immIndex->getBucketCount());
assertEquals("initial bucket index", -1, index.getBucketIndex());
int32_t bucketIndex = 0;
while (index.nextBucket(status)) {
TEST_CHECK_STATUS;
const UnicodeString &label = index->getBucketLabel();
assertEquals("bucket index", bucketIndex, index.getBucketIndex());
const UnicodeString &label = index.getBucketLabel();
TEST_ASSERT(label.length()>0);
// std::string ss;
// std::cout << ":" << label.toUTF8String(ss);
const AlphabeticIndex::Bucket *bucket = immIndex->getBucket(bucketIndex);
TEST_ASSERT(bucket != NULL);
assertEquals("bucket label vs. immutable: locale=" + UnicodeString(localeName) +
" index=" + bucketIndex,
label, bucket->getLabel());
TEST_ASSERT(&label != &bucket->getLabel()); // not the same pointers
UAlphabeticIndexLabelType labelType = index.getBucketLabelType();
TEST_ASSERT(labelType == bucket->getLabelType());
++bucketIndex;
}
// std::cout << ":" << std::endl;
delete index;
TEST_ASSERT(immIndex->getBucketCount() == bucketIndex);
TEST_ASSERT(immIndex->getBucket(-1) == NULL);
TEST_ASSERT(immIndex->getBucket(bucketIndex) == NULL);
}
}
@ -447,6 +476,191 @@ void AlphabeticIndexTest::TestBug9009() {
aindex.nextBucket(status); // Crash here before bug was fixed.
TEST_CHECK_STATUS;
}
static const char *localeAndIndexCharactersLists[][2] = {
/* Arabic*/ {"ar", "\\u0627:\\u0628:\\u062A:\\u062B:\\u062C:\\u062D:\\u062E:\\u062F:\\u0630:\\u0631:\\u0632:\\u0633:\\u0634:\\u0635:\\u0636:\\u0637:\\u0638:\\u0639:\\u063A:\\u0641:\\u0642:\\u0643:\\u0644:\\u0645:\\u0646:\\u0647:\\u0648:\\u064A"},
/* Bulgarian*/ {"bg", "\\u0410:\\u0411:\\u0412:\\u0413:\\u0414:\\u0415:\\u0416:\\u0417:\\u0418:\\u0419:\\u041A:\\u041B:\\u041C:\\u041D:\\u041E:\\u041F:\\u0420:\\u0421:\\u0422:\\u0423:\\u0424:\\u0425:\\u0426:\\u0427:\\u0428:\\u0429:\\u042E:\\u042F"},
/* Catalan*/ {"ca", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
/* Czech*/ {"cs", "A:B:C:\\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:P:Q:R:\\u0158:S:\\u0160:T:U:V:W:X:Y:Z:\\u017D"},
/* Danish*/ {"da", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\\u00C6:\\u00D8:\\u00C5"},
/* German*/ {"de", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:Sch:St:T:U:V:W:X:Y:Z"},
/* Greek*/ {"el", "\\u0391:\\u0392:\\u0393:\\u0394:\\u0395:\\u0396:\\u0397:\\u0398:\\u0399:\\u039A:\\u039B:\\u039C:\\u039D:\\u039E:\\u039F:\\u03A0:\\u03A1:\\u03A3:\\u03A4:\\u03A5:\\u03A6:\\u03A7:\\u03A8:\\u03A9"},
/* English*/ {"en", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
/* Spanish*/ {"es", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\\u00D1:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
/* Estonian*/ {"et", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\\u0160:Z:\\u017D:T:U:V:\\u00D5:\\u00C4:\\u00D6:\\u00DC:X:Y"},
/* Basque*/ {"eu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
/* Finnish*/ {"fi", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\\u00C5:\\u00C4:\\u00D6"},
/* Filipino*/ {"fil", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
/* French*/ {"fr", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
/* Hebrew*/ {"he", "\\u05D0:\\u05D1:\\u05D2:\\u05D3:\\u05D4:\\u05D5:\\u05D6:\\u05D7:\\u05D8:\\u05D9:\\u05DB:\\u05DC:\\u05DE:\\u05E0:\\u05E1:\\u05E2:\\u05E4:\\u05E6:\\u05E7:\\u05E8:\\u05E9:\\u05EA"},
/* Icelandic*/ {"is", "A:\\u00C1:B:C:D:\\u00D0:E:\\u00C9:F:G:H:I:\\u00CD:J:K:L:M:N:O:\\u00D3:P:Q:R:S:T:U:\\u00DA:V:W:X:Y:\\u00DD:Z:\\u00DE:\\u00C6:\\u00D6"},
/* Italian*/ {"it", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
/* Japanese*/ {"ja", "\\u3042:\\u304B:\\u3055:\\u305F:\\u306A:\\u306F:\\u307E:\\u3084:\\u3089:\\u308F"},
/* Korean*/ {"ko", "\\u3131:\\u3134:\\u3137:\\u3139:\\u3141:\\u3142:\\u3145:\\u3147:\\u3148:\\u314A:\\u314B:\\u314C:\\u314D:\\u314E"},
/* Lithuanian*/ {"lt", "A:B:C:\\u010C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:\\u0160:T:U:V:Z:\\u017D"},
// This should be the correct data. Commented out until it is fixed in the CLDR collation data.
// {"lv", "A:B:C:\\u010C:D:E:F:G:\\u0122:H:I:Y:J:K:\\u0136:L:\\u013B:M:N:\\u0145:O:P:Q:R:S:\\u0160:T:U:V:W:X:Z:\\u017D"},
/* Latvian*/ {"lv", "A:B:C:\\u010C:D:E:F:G:\\u0122:H:I:J:K:\\u0136:L:\\u013B:M:N:\\u0145:O:P:Q:R:S:\\u0160:T:U:V:W:X:Y:Z:\\u017D"},
/* Norwegian Bokm\\u00E5l*/ {"nb", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\\u00C6:\\u00D8:\\u00C5"},
/* Dutch*/ {"nl", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
/* Polish*/ {"pl", "A:\\u0104:B:C:\\u0106:D:E:\\u0118:F:G:H:I:J:K:L:\\u0141:M:N:\\u0143:O:\\u00D3:P:Q:R:S:\\u015A:T:U:V:W:X:Y:Z:\\u0179:\\u017B"},
/* Portuguese*/ {"pt", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
/* Romanian*/ {"ro", "A:\\u0102:\\u00C2:B:C:D:E:F:G:H:I:\\u00CE:J:K:L:M:N:O:P:Q:R:S:\\u0218:T:\\u021A:U:V:W:X:Y:Z"},
/* Russian*/ {"ru", "\\u0410:\\u0411:\\u0412:\\u0413:\\u0414:\\u0415:\\u0416:\\u0417:\\u0418:\\u0419:\\u041A:\\u041B:\\u041C:\\u041D:\\u041E:\\u041F:\\u0420:\\u0421:\\u0422:\\u0423:\\u0424:\\u0425:\\u0426:\\u0427:\\u0428:\\u0429:\\u042B:\\u042D:\\u042E:\\u042F"},
/* Slovak*/ {"sk", "A:\\u00C4:B:C:\\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:\\u00D4:P:Q:R:S:\\u0160:T:U:V:W:X:Y:Z:\\u017D"},
/* Slovenian*/ {"sl", "A:B:C:\\u010C:\\u0106:D:\\u0110:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\\u0160:T:U:V:W:X:Y:Z:\\u017D"},
/* Serbian*/ {"sr", "\\u0410:\\u0411:\\u0412:\\u0413:\\u0414:\\u0402:\\u0415:\\u0416:\\u0417:\\u0418:\\u0408:\\u041A:\\u041B:\\u0409:\\u041C:\\u041D:\\u040A:\\u041E:\\u041F:\\u0420:\\u0421:\\u0422:\\u040B:\\u0423:\\u0424:\\u0425:\\u0426:\\u0427:\\u040F:\\u0428"},
/* Swedish*/ {"sv", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\\u00C5:\\u00C4:\\u00D6"},
/* Turkish*/ {"tr", "A:B:C:\\u00C7:D:E:F:G:H:I:\\u0130:J:K:L:M:N:O:\\u00D6:P:Q:R:S:\\u015E:T:U:\\u00DC:V:W:X:Y:Z"},
/* Ukrainian*/ {"uk", "\\u0410:\\u0411:\\u0412:\\u0413:\\u0490:\\u0414:\\u0415:\\u0404:\\u0416:\\u0417:\\u0418:\\u0406:\\u0407:\\u0419:\\u041A:\\u041B:\\u041C:\\u041D:\\u041E:\\u041F:\\u0420:\\u0421:\\u0422:\\u0423:\\u0424:\\u0425:\\u0426:\\u0427:\\u0428:\\u0429:\\u042E:\\u042F"},
/* Vietnamese*/ {"vi", "A:\\u0102:\\u00C2:B:C:D:\\u0110:E:\\u00CA:F:G:H:I:J:K:L:M:N:O:\\u00D4:\\u01A0:P:Q:R:S:T:U:\\u01AF:V:W:X:Y:Z"},
/* Chinese*/ {"zh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
/* Chinese (Traditional Han)*/ {"zh_Hant", "1\\u5283:2\\u5283:3\\u5283:4\\u5283:5\\u5283:6\\u5283:7\\u5283:8\\u5283:9\\u5283:10\\u5283:11\\u5283:12\\u5283:13\\u5283:14\\u5283:15\\u5283:16\\u5283:17\\u5283:18\\u5283:19\\u5283:20\\u5283:21\\u5283:22\\u5283:23\\u5283:24\\u5283:25\\u5283:26\\u5283:27\\u5283:28\\u5283:29\\u5283:30\\u5283:31\\u5283:32\\u5283:33\\u5283:35\\u5283:36\\u5283:39\\u5283:48\\u5283"},
};
void AlphabeticIndexTest::TestIndexCharactersList() {
UErrorCode status = U_ZERO_ERROR;
for (int32_t i = 0; i < LENGTHOF(localeAndIndexCharactersLists); ++i) {
const char *(&localeAndIndexCharacters)[2] = localeAndIndexCharactersLists[i];
const char *locale = localeAndIndexCharacters[0];
UnicodeString expectedIndexCharacters
= (UnicodeString("\\u2026:") + localeAndIndexCharacters[1] + ":\\u2026").unescape();
AlphabeticIndex index(locale, status);
TEST_CHECK_STATUS;
LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status));
TEST_CHECK_STATUS;
// Join the elements of the list to a string with delimiter ":"
UnicodeString actualIndexCharacters;
assertEquals(locale,
expectedIndexCharacters,
joinLabelsAndAppend(*immIndex, actualIndexCharacters));
logln(locale + UnicodeString(": ") + actualIndexCharacters);
}
}
void AlphabeticIndexTest::TestHaniFirst() {
UErrorCode status = U_ZERO_ERROR;
LocalPointer<RuleBasedCollator> coll(
static_cast<RuleBasedCollator *>(Collator::createInstance(Locale::getRoot(), status)));
int32_t reorderCodes[] = { USCRIPT_HAN };
coll->setReorderCodes(reorderCodes, LENGTHOF(reorderCodes), status);
TEST_CHECK_STATUS;
AlphabeticIndex index(coll.orphan(), status);
TEST_CHECK_STATUS;
assertEquals("getBucketCount()", 1, index.getBucketCount(status)); // ... (underflow only)
index.addLabels(Locale::getEnglish(), status);
assertEquals("getBucketCount()", 28, index.getBucketCount(status)); // ... A-Z ...
int32_t bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x897f), status);
assertEquals("getBucketIndex(U+897F)", 0, bucketIndex); // underflow bucket
bucketIndex = index.getBucketIndex("i", status);
assertEquals("getBucketIndex(i)", 9, bucketIndex);
bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x03B1), status);
assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex);
// TODO: Test with an unassigned code point (not just U+FFFF)
// when unassigned code points are not in the Hani reordering group any more.
// String unassigned = UTF16.valueOf(0x50005);
bucketIndex = index.getBucketIndex(UnicodeString((UChar)0xFFFF), status);
assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex);
}
void AlphabeticIndexTest::TestPinyinFirst() {
UErrorCode status = U_ZERO_ERROR;
LocalPointer<RuleBasedCollator> coll(
static_cast<RuleBasedCollator *>(Collator::createInstance(Locale::getChinese(), status)));
int32_t reorderCodes[] = { USCRIPT_HAN };
coll->setReorderCodes(reorderCodes, LENGTHOF(reorderCodes), status);
TEST_CHECK_STATUS;
AlphabeticIndex index(coll.orphan(), status);
TEST_CHECK_STATUS;
assertEquals("getBucketCount()", 1, index.getBucketCount(status)); // ... (underflow only)
index.addLabels(Locale::getChinese(), status);
assertEquals("getBucketCount()", 28, index.getBucketCount(status)); // ... A-Z ...
int bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x897f), status);
assertEquals("getBucketIndex(U+897F)", 'X' - 'A' + 1, bucketIndex);
bucketIndex = index.getBucketIndex("i", status);
assertEquals("getBucketIndex(i)", 9, bucketIndex);
bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x03B1), status);
assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex);
// TODO: Test with an unassigned code point (not just U+FFFF)
// when unassigned code points are not in the Hani reordering group any more.
// String unassigned = UTF16.valueOf(0x50005);
bucketIndex = index.getBucketIndex(UnicodeString((UChar)0xFFFF), status);
assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex);
}
void AlphabeticIndexTest::TestSchSt() {
UErrorCode status = U_ZERO_ERROR;
AlphabeticIndex index(Locale::getGerman(), status);
index.addLabels(UnicodeSet("[\\u00C6{Sch*}{St*}]", status), status);
TEST_CHECK_STATUS;
// ... A AE-ligature B-R S Sch St T-Z ...
LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status));
TEST_CHECK_STATUS;
assertEquals("getBucketCount()", 31, index.getBucketCount(status));
assertEquals("immutable getBucketCount()", 31, immIndex->getBucketCount());
static const struct TestCase {
const char *name;
int32_t bucketIndex;
const char *bucketLabel;
} testCases[] = {
// name, bucket index, bucket label
{ "Adelbert", 1, "A" },
{ "Afrika", 1, "A" },
{ "\\u00C6sculap", 2, "\\u00C6" },
{ "Aesthet", 2, "\\u00C6" },
{ "Berlin", 3, "B" },
{ "Rilke", 19, "R" },
{ "Sacher", 20, "S" },
{ "Seiler", 20, "S" },
{ "Sultan", 20, "S" },
{ "Schiller", 21, "Sch" },
{ "Steiff", 22, "St" },
{ "Thomas", 23, "T" }
};
for (int32_t i = 0; i < LENGTHOF(testCases); ++i) {
const TestCase &testCase = testCases[i];
UnicodeString name = UnicodeString(testCase.name).unescape();
UnicodeString label = UnicodeString(testCase.bucketLabel).unescape();
char msg[100];
sprintf(msg, "getBucketIndex(%s)", testCase.name);
assertEquals(msg, testCase.bucketIndex, index.getBucketIndex(name, status));
sprintf(msg, "immutable getBucketIndex(%s)", testCase.name);
assertEquals(msg, testCase.bucketIndex, immIndex->getBucketIndex(name, status));
sprintf(msg, "immutable bucket label (%s)", testCase.name);
assertEquals(msg, label, immIndex->getBucket(testCase.bucketIndex)->getLabel());
}
}
void AlphabeticIndexTest::TestNoLabels() {
UErrorCode status = U_ZERO_ERROR;
LocalPointer<RuleBasedCollator> coll(
static_cast<RuleBasedCollator *>(Collator::createInstance(Locale::getRoot(), status)));
TEST_CHECK_STATUS;
AlphabeticIndex index(coll.orphan(), status);
TEST_CHECK_STATUS;
index.addRecord(UnicodeString((UChar)0x897f), NULL, status);
index.addRecord("i", NULL, status);
index.addRecord(UnicodeString((UChar)0x03B1), NULL, status);
assertEquals("getBucketCount()", 1, index.getBucketCount(status)); // ...
TEST_ASSERT(index.nextBucket(status));
assertEquals("underflow label type", U_ALPHAINDEX_UNDERFLOW, index.getBucketLabelType());
assertEquals("all records in the underflow bucket", 3, index.getBucketRecordCount());
}
void AlphabeticIndexTest::TestChineseZhuyin() {
UErrorCode status = U_ZERO_ERROR;
char loc[100];
uloc_forLanguageTag("zh-u-co-zhuyin", loc, LENGTHOF(loc), NULL, &status);
AlphabeticIndex index(loc, status);
LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status));
TEST_CHECK_STATUS;
assertEquals("getBucketCount()", 38, immIndex->getBucketCount());
assertEquals("label 1", UnicodeString((UChar)0x3105), immIndex->getBucket(1)->getLabel());
assertEquals("label 2", UnicodeString((UChar)0x3106), immIndex->getBucket(2)->getLabel());
assertEquals("label 3", UnicodeString((UChar)0x3107), immIndex->getBucket(3)->getLabel());
assertEquals("label 4", UnicodeString((UChar)0x3108), immIndex->getBucket(4)->getLabel());
assertEquals("label 5", UnicodeString((UChar)0x3109), immIndex->getBucket(5)->getLabel());
}
#endif

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2012, International Business Machines Corporation and
* Copyright (c) 2012-2013, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
//
@ -24,6 +24,27 @@ public:
virtual void ManyLocalesTest();
virtual void HackPinyinTest();
virtual void TestBug9009();
void TestIndexCharactersList();
/**
* Test AlphabeticIndex vs. root with script reordering.
*/
void TestHaniFirst();
/**
* Test AlphabeticIndex vs. Pinyin with script reordering.
*/
void TestPinyinFirst();
/**
* Test labels with multiple primary weights.
*/
void TestSchSt();
/**
* With no real labels, there should be only the underflow label.
*/
void TestNoLabels();
/**
* Test with the Bopomofo-phonetic tailoring.
*/
void TestChineseZhuyin();
};
#endif

View file

@ -233,6 +233,14 @@ IntlTest::appendHex(uint32_t number,
0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0
}; /* "0123456789ABCDEF" */
if (digits < 0) { // auto-digits
digits = 2;
uint32_t max = 0xff;
while (number > max) {
digits += 2;
max = (max << 8) | 0xff;
}
}
switch (digits)
{
case 8:
@ -258,6 +266,13 @@ IntlTest::appendHex(uint32_t number,
return target;
}
// Convenience wrapper around appendHex(): formats number into a fresh string
// and returns it by value.
// digits is passed through unchanged; a negative value makes appendHex()
// pick the number of digits automatically.
UnicodeString
IntlTest::toHex(uint32_t number, int32_t digits) {
    UnicodeString hex;
    appendHex(number, digits, hex);
    return hex;
}
// Returns TRUE for code points up to U+007E that are either at or above
// U+0020 (space) or one of tab (9), line feed (0xA), carriage return (0xD).
static inline UBool isPrintable(UChar32 c) {
    if (c > 0x7E) {
        return FALSE;
    }
    if (c >= 0x20) {
        return TRUE;
    }
    // Below space only these three whitespace controls are accepted.
    return c == 9 || c == 0xA || c == 0xD;
}
@ -1728,6 +1743,23 @@ UBool IntlTest::assertEquals(const char* message,
return TRUE;
}
// Integer overload of assertEquals().
// Returns TRUE when expected == actual. Otherwise reports a failure via errln()
// showing both values in decimal and in hex, and returns FALSE.
// With VERBOSE_ASSERTIONS defined, a successful comparison is logged as well.
UBool IntlTest::assertEquals(const char* message,
                             int32_t expected,
                             int32_t actual) {
    if (expected == actual) {
#ifdef VERBOSE_ASSERTIONS
        logln((UnicodeString)"Ok: " + message + "; got " + actual + "=0x" + toHex(actual));
#endif
        return TRUE;
    }
    errln((UnicodeString)"FAIL: " + message + "; got " +
          actual + "=0x" + toHex(actual) +
          "; expected " + expected + "=0x" + toHex(expected));
    return FALSE;
}
#if !UCONFIG_NO_FORMATTING
UBool IntlTest::assertEquals(const char* message,
const Formattable& expected,

View file

@ -241,6 +241,7 @@ protected:
const UnicodeString& actual, UBool possibleDataError=FALSE);
UBool assertEquals(const char* message, const char* expected,
const char* actual);
UBool assertEquals(const char* message, int32_t expected, int32_t actual);
#if !UCONFIG_NO_FORMATTING
UBool assertEquals(const char* message, const Formattable& expected,
const Formattable& actual);
@ -299,7 +300,12 @@ protected:
static UnicodeString &prettify(const UnicodeString &source, UnicodeString &target);
static UnicodeString prettify(const UnicodeString &source, UBool parseBackslash=FALSE);
// digits=-1 determines the number of digits automatically
static UnicodeString &appendHex(uint32_t number, int32_t digits, UnicodeString &target);
static UnicodeString toHex(uint32_t number, int32_t digits=-1);
// Signed overload: converts the value to uint32_t and forwards to
// toHex(uint32_t, int32_t).
static inline UnicodeString toHex(int32_t number, int32_t digits=-1) {
    const uint32_t unsignedValue = static_cast<uint32_t>(number);
    return toHex(unsignedValue, digits);
}
public:
static void setICU_DATA(); // Set up ICU_DATA if necessary.