diff --git a/icu4c/source/i18n/coll.cpp b/icu4c/source/i18n/coll.cpp index e4d48ef9093..58a156af641 100644 --- a/icu4c/source/i18n/coll.cpp +++ b/icu4c/source/i18n/coll.cpp @@ -35,6 +35,7 @@ // 6/20/97 helena Java class name change. // 04/23/99 stephen Removed EDecompositionMode, merged with // Normalizer::EMode +// 11/23/9 srl Inlining of some critical functions //============================================================================= #include "colcache.h" @@ -140,11 +141,6 @@ Collator::greater(const UnicodeString& source, return (compare(source, target) == Collator::GREATER); } -Collator::ECollationStrength -Collator::getStrength() const -{ - return strength; -} void Collator::setStrength(Collator::ECollationStrength newStrength) @@ -152,11 +148,6 @@ Collator::setStrength(Collator::ECollationStrength newStrength) strength = newStrength; } -Normalizer::EMode -Collator::getDecomposition() const -{ - return decmp; -} void Collator::setDecomposition(Normalizer::EMode decompositionMode) { diff --git a/icu4c/source/i18n/coll.h b/icu4c/source/i18n/coll.h index 0fa313202fc..957ec7dcf55 100644 --- a/icu4c/source/i18n/coll.h +++ b/icu4c/source/i18n/coll.h @@ -34,6 +34,10 @@ // 02/10/98 damiba Added compare() with length as parameter. // 04/23/99 stephen Removed EDecompositionMode, merged with // Normalizer::EMode. +// 11/02/99 helena Collator performance enhancements. Eliminates the +// UnicodeString construction and special case for NO_OP. +// 11/23/99 srl More performance enhancements. Inlining of +// critical accessors. //============================================================================= #ifndef COLL_H @@ -299,6 +303,38 @@ public: int32_t length) const = 0; + /** + * The comparison function compares the character data stored in two + * different string arrays. Returns information about whether a string + * array is less than, greater than or equal to another string array. + *

Example of use: + *

+   * .       UErrorCode status = U_ZERO_ERROR;
+   * .       Collator *myCollation = Collator::createInstance(Locale::US, status);
+   * .       if (U_FAILURE(status)) return;
+   * .       myCollation->setStrength(Collator::PRIMARY);
+   * .       // result would be Collator::EQUAL ("abc" == "ABC")
+   * .       // (no primary difference between "abc" and "ABC")
+   * .       Collator::EComparisonResult result = myCollation->compare(L"abc", 3, L"ABC", 3);
+   * .       myCollation->setStrength(Collator::TERTIARY);
+   * .       // result would be Collator::LESS ("abc" <<< "ABC")
+   * .       // (with tertiary difference between "abc" and "ABC")
+   * .       result = myCollation->compare(L"abc", 3, L"ABC", 3);
+   * 
+ * @param source the source string array to be compared with. + * @param sourceLength the length of the source string array. If this value + * is equal to -1, the string array is null-terminated. + * @param target the string that is to be compared with the source string. + * @param targetLength the length of the target string array. If this value + * is equal to -1, the string array is null-terminated. + * @return Returns a byte value. GREATER if source is greater + * than target; EQUAL if source is equal to target; LESS if source is less + * than target + **/ + virtual EComparisonResult compare( const UChar* source, + int32_t sourceLength, + const UChar* target, + int32_t targetLength) const = 0; /** Transforms the string into a series of characters that can be compared * with CollationKey::compareTo. It is not possible to restore the original @@ -339,6 +375,24 @@ public: virtual CollationKey& getCollationKey(const UnicodeString& source, CollationKey& key, UErrorCode& status) const = 0; + + /** Transforms the string into a series of characters that can be compared + * with CollationKey::compareTo. It is not possible to restore the original + * string from the chars in the sort key. The generated sort key handles + * only a limited number of ignorable characters. + *

Use CollationKey::equals or CollationKey::compare to compare the + * generated sort keys. + *

If the source string is null, a null collation key will be returned. + * @param source the source string to be transformed into a sort key. + * @param sourceLength length of the collation key + * @param key the collation key to be filled in + * @return the collation key of the string based on the collation rules. + * @see CollationKey#compare + */ + virtual CollationKey& getCollationKey(const UChar *source, + int32_t sourceLength, + CollationKey& key, + UErrorCode& status) const = 0; /** * Generates the hash code for the collation object */ @@ -503,4 +557,17 @@ Collator::operator!=(const Collator& other) const return result; } +inline Collator::ECollationStrength +Collator::getStrength() const +{ + return strength; +} + +inline Normalizer::EMode +Collator::getDecomposition() const +{ + return decmp; +} + + #endif diff --git a/icu4c/source/i18n/tblcoll.cpp b/icu4c/source/i18n/tblcoll.cpp index 18bd7c68779..6ff8827635e 100644 --- a/icu4c/source/i18n/tblcoll.cpp +++ b/icu4c/source/i18n/tblcoll.cpp @@ -43,7 +43,10 @@ * Normalizer::EMode * 06/14/99 stephen Removed kResourceBundleSuffix * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx -* files are no longer used. +* files are no longer used. +* 11/02/99 helena Collator performance enhancements. Special case +* for NO_OP situations. +* 11/17/99 srl More performance enhancements. Inlined some internal functions. ******************************************************************************* */ @@ -69,6 +72,8 @@ #include +#include + class RuleBasedCollatorStreamer { @@ -129,14 +134,124 @@ const int16_t RuleBasedCollator::FILEID = 0x5443; // unique f const char* RuleBasedCollator::kFilenameSuffix = ".col"; // binary collation file extension char RuleBasedCollator::fgClassID = 0; // Value is irrelevant // class id +//================ Some inline definitions of implementation functions........ 
======== + +// Get the character order in the mapping table +inline int32_t +RuleBasedCollator::getUnicodeOrder(UChar ch) const +{ + return ucmp32_get(data->mapping, ch); +} + +inline int32_t +RuleBasedCollator::strengthOrder(int32_t value) const +{ + if (getStrength() == PRIMARY) + { + return (value & PRIMARYDIFFERENCEONLY); + } else if (getStrength() == SECONDARY) + { + return (value & SECONDARYDIFFERENCEONLY); + } + return value; +} + + +inline int32_t +RuleBasedCollator::getStrengthOrder(NormalizerIterator* cursor, + UErrorCode status) const +{ + if (U_FAILURE(status)) + { + return CollationElementIterator::NULLORDER; + } + + if (cursor->bufferAlias != NULL) + { + // bufferAlias needs a bit of an explanation. + // When we hit an expanding character in the text, we call the order's + // getExpandValues method to retrieve an array of the orderings for all + // of the characters in the expansion (see the end of this method). + // The first ordering is returned, and an alias to the orderings array + // is saved so that the remaining orderings can be returned on subsequent + // calls to next. So, if the expanding buffer is not exhausted, + // all we have to do here is return the next ordering in the buffer. + if (cursor->expIndex < cursor->bufferAlias->size()) + { + //_L((stderr, "next from [%08X] from bufferAlias\n", this)); + return strengthOrder(cursor->bufferAlias->at(cursor->expIndex++)); + } + else + { + cursor->bufferAlias = NULL; + cursor->expIndex = 0; + } + } + else if (cursor->swapOrder != 0) + { + // If we find a character with no order, we return the marking + // flag, UNMAPPEDCHARVALUE, 0x7fff0000, and then the character + // itself shifted left 16 bits as orders. At this point, the + // UNMAPPEDCHARVALUE flag has already been returned by the code + // below, so just return the shifted character here. 
+ int32_t order = cursor->swapOrder << 16; + + //_L((stderr, "next from [%08X] swaporder..\n", this)); + cursor->swapOrder = 0; + + return order; + } + + UChar ch = cursor->current(); + cursor->next(); + + //_L((stderr, "Next from [%08X] = [%04X], [%c]\n", cursor, (int)ch & 0xFFFF, (char)(ch & 0xFF))); + + if (ch == Normalizer::DONE) { + return CollationElementIterator::NULLORDER; + } + // Ask the collator for this character's ordering. + int32_t value = getUnicodeOrder(ch); + + if (value == UNMAPPED) + { + // Returned an "unmapped" flag and save the character so it can be + // returned next time this method is called. + if (ch == 0x0000) return ch; + cursor->swapOrder = ch; // \u0000 is not valid in C++'s UnicodeString + return CollationElementIterator::UNMAPPEDCHARVALUE; + } + + if (value >= CONTRACTCHARINDEX) + { + value = nextContractChar(cursor, ch, status); + } + + if (value >= EXPANDCHARINDEX) + { + cursor->bufferAlias = getExpandValueList(value); + cursor->expIndex = 0; + value = cursor->bufferAlias->at(cursor->expIndex++); + } + + int32_t str = strengthOrder(value); + + return strengthOrder(value); +} + +// ==================== End inlines ============================================ + + //=============================================================================== RuleBasedCollator::RuleBasedCollator() : Collator(), isOverIgnore(FALSE), mPattern(0), - sourceCursor(0), - targetCursor(0), + // sourceCursor(0), + //targetCursor(0), + cursor1(0), + cursor2(0), data(0), dataIsOwned(FALSE) { @@ -146,8 +261,10 @@ RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that) : Collator(that), isOverIgnore(that.isOverIgnore), mPattern(0), - sourceCursor(0), - targetCursor(0), + // sourceCursor(0), + //targetCursor(0), + cursor1(0), + cursor2(0), dataIsOwned(FALSE), data(that.data) // Alias the data pointer { @@ -214,8 +331,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, : Collator(), isOverIgnore(FALSE), mPattern(0), - 
sourceCursor(0), - targetCursor(0), + // sourceCursor(0), + /// targetCursor(0), + cursor1(0), + cursor2(0), data(0), dataIsOwned(FALSE) { @@ -233,8 +352,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, : Collator(collationStrength, Normalizer::NO_OP), isOverIgnore(FALSE), mPattern(0), - sourceCursor(0), - targetCursor(0), + // sourceCursor(0), + // targetCursor(0), + cursor1(0), + cursor2(0), data(0), dataIsOwned(FALSE) { @@ -242,7 +363,6 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, { return; } - constructFromRules(rules, status); } @@ -252,8 +372,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, : Collator(TERTIARY, decompositionMode), isOverIgnore(FALSE), mPattern(0), - sourceCursor(0), - targetCursor(0), + // sourceCursor(0), + // targetCursor(0), + cursor1(0), + cursor2(0), data(0), dataIsOwned(FALSE) { @@ -272,8 +394,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, : Collator(collationStrength, decompositionMode), isOverIgnore(FALSE), mPattern(0), - sourceCursor(0), - targetCursor(0), + // sourceCursor(0), + //targetCursor(0), + cursor1(0), + cursor2(0), data(0), dataIsOwned(FALSE) { @@ -392,10 +516,14 @@ RuleBasedCollator::RuleBasedCollator( const Locale& desiredLocale, isOverIgnore(FALSE), dataIsOwned(FALSE), data(0), - sourceCursor(0), - targetCursor(0), + // sourceCursor(0), + //targetCursor(0), + cursor1(0), + cursor2(0), mPattern(0) { + + if (U_FAILURE(status)) { return; @@ -447,6 +575,18 @@ RuleBasedCollator::RuleBasedCollator( const Locale& desiredLocale, return; } + // srl write out default.col + { + UnicodeString defLocaleName = ResourceBundle::kDefaultFilename; + char *binaryFilePath = createPathName(Locale::getDataDirectory(), + defLocaleName, kFilenameSuffix); + bool_t ok = writeToFile(binaryFilePath); + delete [] binaryFilePath; +#ifdef COLLDEBUG + cerr << defLocaleName << " [default] binary write " << (ok? 
"OK" : "Failed") << endl; +#endif + } + data->desiredLocale = desiredLocale; desiredLocale.getName(localeName); data->realLocaleName = localeName; @@ -567,7 +707,7 @@ RuleBasedCollator::constructFromFile( const Locale& locale, // Try to load up the collation from a binary file first constructFromFile(binaryFilePath, status); #ifdef COLLDEBUG - cerr << localeFileName << " binary load " << errorName(status) << endl; + cerr << localeFileName << kFilenameSuffix << " binary load " << errorName(status) << endl; #endif if(U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) return; @@ -629,7 +769,7 @@ RuleBasedCollator::constructFromFile( const Locale& locale, } #ifdef COLLDEBUG - cerr << localeFileName << " ascii load " << (U_SUCCESS(status) ? "OK" : "Failed") << endl; + cerr << localeFileName << " ascii load " << (U_SUCCESS(status) ? "OK" : "Failed") << " - try= " << (tryBinaryFile?"true":"false") << endl; #endif if(U_SUCCESS(status) && tryBinaryFile) { @@ -655,11 +795,20 @@ RuleBasedCollator::~RuleBasedCollator() data = 0; - delete sourceCursor; - sourceCursor = 0; + // delete sourceCursor; + // sourceCursor = 0; - delete targetCursor; - targetCursor = 0; + // delete targetCursor; + // targetCursor = 0; + + if (cursor1 != NULL) { + delete cursor1; + cursor1 = 0; + } + if (cursor2 != NULL) { + delete cursor2; + cursor2 = 0; + } delete mPattern; mPattern = 0; @@ -742,13 +891,13 @@ RuleBasedCollator::getRules() const data->isRuleTableLoaded = TRUE; #ifdef _DEBUG // the following is useful for specific debugging purposes - // UnicodeString name; - // cerr << "Table collation rules loaded dynamically for " - // << data->desiredLocale.getName(name) - // << " at " - // << data->realLocaleName - // << ", " << dec << data->ruleTable.size() << " characters" - // << endl; + UnicodeString name; + cerr << "Table collation rules loaded dynamically for " + << data->desiredLocale.getName(name) + << " at " + << data->realLocaleName + << ", " << dec << data->ruleTable.size() << " 
characters" + << endl; #endif } else @@ -762,6 +911,16 @@ RuleBasedCollator::getRules() const << endl; cerr << "Status " << errorName(status) << ", mPattern " << temp.mPattern << endl; #endif + /* SRL have to add this because we now have the situation where + DEFAULT is loaded from a binary file w/ no rules. */ + UErrorCode intStatus = U_ZERO_ERROR; + temp.constructFromRules(RuleBasedCollator::DEFAULTRULES, intStatus); + + if(U_SUCCESS(intStatus) && (temp.mPattern != 0)) + { + data->ruleTable = temp.getRules(); + data->isRuleTableLoaded = TRUE; + } } } @@ -783,14 +942,15 @@ RuleBasedCollator::compare( const UnicodeString& source, return (RuleBasedCollator::compare(source_togo, target_togo)); } - -// Compare two strings using this collator -Collator::EComparisonResult -RuleBasedCollator::compare(const UnicodeString& source, - const UnicodeString& target) const +Collator::EComparisonResult +RuleBasedCollator::compare(const UChar* source, + int32_t sourceLength, + const UChar* target, + int32_t targetLength) const { // check if source and target are valid strings - if (source.isBogus() || target.isBogus()) + if (((source == 0) && (target == 0)) || + ((sourceLength == 0) && (targetLength == 0))) { return Collator::EQUAL; } @@ -798,55 +958,36 @@ RuleBasedCollator::compare(const UnicodeString& source, Collator::EComparisonResult result = Collator::EQUAL; UErrorCode status = U_ZERO_ERROR; - // The basic algorithm here is that we use CollationElementIterators - // to step through both the source and target strings. We compare each - // collation element in the source string against the corresponding one - // in the target, checking for differences. - // - // If a difference is found, we set to LESS or GREATER to - // indicate whether the source string is less or greater than the target. - // - // However, it's not that simple. If we find a tertiary difference - // (e.g. 'A' vs. 'a') near the beginning of a string, it can be - // overridden by a primary difference (e.g. 
"A" vs. "B") later in - // the string. For example, "AA" < "aB", even though 'A' > 'a'. - // - // To keep track of this, we use checkSecTer and checkTertiary to keep - // track of the strength of the most significant difference that has been - // found so far. When we find a difference whose strength is greater than - // the previous ones, it overrides the last difference (if any) that - // was found. - // - - if (sourceCursor == NULL) + if (cursor1 == NULL) { - ((RuleBasedCollator *)this)->sourceCursor = createCollationElementIterator(source); + ((RuleBasedCollator *)this)->cursor1 = new NormalizerIterator(source, sourceLength, getDecomposition()); } else { - sourceCursor->setText(source, status); + cursor1->setModeAndText(getDecomposition(), source, sourceLength, status); } - if (sourceCursor == NULL || U_FAILURE(status)) + if ( /*cursor1->cursor == NULL ||*/ U_FAILURE(status)) { return Collator::EQUAL; } - if (targetCursor == NULL) + if (cursor2 == NULL) { - ((RuleBasedCollator *)this)->targetCursor = createCollationElementIterator(target); + ((RuleBasedCollator *)this)->cursor2 = new NormalizerIterator(target, targetLength, getDecomposition()); } else { - targetCursor->setText(target, status); + cursor2->setModeAndText(getDecomposition(), target, targetLength, status); } - if (targetCursor == NULL || U_FAILURE(status)) + if (/*cursor2 == NULL ||*/ U_FAILURE(status)) { return Collator::EQUAL; } int32_t sOrder, tOrder; + // int32_t sOrder = CollationElementIterator::NULLORDER, tOrder = CollationElementIterator::NULLORDER; bool_t gets = TRUE, gett = TRUE; bool_t initialCheckSecTer = getStrength() >= Collator::SECONDARY; bool_t checkSecTer = initialCheckSecTer; @@ -860,7 +1001,7 @@ RuleBasedCollator::compare(const UnicodeString& source, // we've been requested to skip it. 
if (gets) { - sOrder = sourceCursor->next(status); + sOrder = getStrengthOrder((NormalizerIterator*)cursor1, status); if (U_FAILURE(status)) { @@ -872,7 +1013,7 @@ RuleBasedCollator::compare(const UnicodeString& source, if (gett) { - tOrder = targetCursor->next(status); + tOrder = getStrengthOrder((NormalizerIterator*)cursor2, status); if (U_FAILURE(status)) { @@ -1036,7 +1177,7 @@ RuleBasedCollator::compare(const UnicodeString& source, } } } - while ((sOrder = sourceCursor->next(status)) != CollationElementIterator::NULLORDER); + while ((sOrder = getStrengthOrder(cursor1, status)) != CollationElementIterator::NULLORDER); } else if (tOrder != CollationElementIterator::NULLORDER) { @@ -1060,7 +1201,7 @@ RuleBasedCollator::compare(const UnicodeString& source, } } } - while ((tOrder = targetCursor->next(status)) != CollationElementIterator::NULLORDER); + while ((tOrder = getStrengthOrder(cursor2, status)) != CollationElementIterator::NULLORDER); } @@ -1070,15 +1211,46 @@ RuleBasedCollator::compare(const UnicodeString& source, // puts the result of the string comparison directly into result if (result == Collator::EQUAL && getStrength() == IDENTICAL) { - UnicodeString sourceDecomp, targetDecomp; +#if 0 + // ******** for the UChar normalization interface. + // It doesn't work much faster, and the code was broken + // so it's commented out. --srl +// UChar sourceDecomp[1024], targetDecomp[1024]; +// int32_t sourceDecompLength = 1024; +// int32_t targetDecompLength = 1024; + +// int8_t comparison; +// Normalizer::EMode decompMode = getDecomposition(); + +// if (decompMode != Normalizer::NO_OP) +// { +// Normalizer::normalize(source, sourceLength, decompMode, +// 0, sourceDecomp, sourceDecompLength, status); + +// Normalizer::normalize(target, targetLength, decompMode, +// 0, targetDecomp, targetDecompLength, status); + +// comparison = u_strcmp(sourceDecomp,targetDecomp); +// } +// else +// { +// comparison = u_strcmp(source, target); /* ! 
*/ +// } + +#else + + UnicodeString sourceDecomp, targetDecomp; + int8_t comparison; Normalizer::normalize(source, getDecomposition(), - 0, sourceDecomp, status); + 0, sourceDecomp, status); + Normalizer::normalize(target, getDecomposition(), - 0, targetDecomp, status); + 0, targetDecomp, status); comparison = sourceDecomp.compare(targetDecomp); +#endif if (comparison < 0) { @@ -1097,6 +1269,49 @@ RuleBasedCollator::compare(const UnicodeString& source, return result; } + +int32_t +RuleBasedCollator::nextContractChar(NormalizerIterator *cursor, + UChar ch, + UErrorCode& status) const +{ + // First get the ordering of this single character + VectorOfPToContractElement *list = getContractValues(ch); + EntryPair *pair = (EntryPair *)list->at(0); + int32_t order = pair->value; + + // Now iterate through the chars following it and + // look for the longest match + ((UnicodeString&)key).remove(); + ((UnicodeString&)key) += ch; + + while ((ch = cursor->current()) != Normalizer::DONE) + { + ((UnicodeString&)key) += ch; + + int32_t n = getEntry(list, key, TRUE); + + if (n == UNMAPPED) + { + break; + } + cursor->next(); + + pair = (EntryPair *)list->at(n); + order = pair->value; + } + + return order; +} + +// Compare two strings using this collator +Collator::EComparisonResult +RuleBasedCollator::compare(const UnicodeString& source, + const UnicodeString& target) const +{ + return compare(source.getUChars(), source.length(), target.getUChars(), target.length()); +} + // Retrieve a collation key for the specified string // The key can be compared with other collation keys using a bitwise comparison // (e.g. memcmp) to find the ordering of their respective source strings. 
@@ -1134,6 +1349,15 @@ CollationKey& RuleBasedCollator::getCollationKey( const UnicodeString& source, CollationKey& sortkey, UErrorCode& status) const +{ + return RuleBasedCollator::getCollationKey(source.getUChars(), source.size(), sortkey, status); +} + +CollationKey& +RuleBasedCollator::getCollationKey( const UChar* source, + int32_t sourceLen, + CollationKey& sortkey, + UErrorCode& status) const { if (U_FAILURE(status)) { @@ -1141,27 +1365,21 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source, return sortkey.setToBogus(); } - if (source.isBogus()) - { - status = U_MEMORY_ALLOCATION_ERROR; - return sortkey.setToBogus(); - } - - if (source.size() == 0) + if ((!source) || (sourceLen == 0)) { return sortkey.reset(); } - if (sourceCursor == NULL) + if (cursor1 == NULL) { - ((RuleBasedCollator *)this)->sourceCursor = createCollationElementIterator(source); + ((RuleBasedCollator *)this)->cursor1 = new NormalizerIterator(source, sourceLen, getDecomposition()); } else { - sourceCursor->setText(source, status); + cursor1->setModeAndText(getDecomposition(), source,sourceLen, status); } - if (sourceCursor == NULL || U_FAILURE(status)) + if (U_FAILURE(status)) { return sortkey.setToBogus(); } @@ -1177,7 +1395,8 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source, UnicodeString decomp; // iterate over the source, counting primary, secondary, and tertiary entries - while((order = sourceCursor->next(status)) != CollationElementIterator::NULLORDER) + while((order = getStrengthOrder((NormalizerIterator*)cursor1, status)) != + CollationElementIterator::NULLORDER) { int32_t secOrder = CollationElementIterator::secondaryOrder(order); int32_t terOrder = CollationElementIterator::tertiaryOrder(order); @@ -1230,7 +1449,7 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source, if (compareIdent) { - Normalizer::normalize(source, getDecomposition(), + Normalizer::normalize(source, getDecomposition(), // SRL: ?? 
0, decomp, status); if (U_SUCCESS(status)) @@ -1259,10 +1478,10 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source, int32_t identCursor = terCursor + (2 * totalTer); // reset source to the beginning - sourceCursor->reset(); + cursor1->reset(); // now iterate over the source computing the actual entries - while((order = sourceCursor->next(status)) != CollationElementIterator::NULLORDER) + while((order = getStrengthOrder((NormalizerIterator*)cursor1, status)) != CollationElementIterator::NULLORDER) { if (U_FAILURE(status)) { @@ -1336,6 +1555,14 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source, sortkey.storeUnicodeString(identCursor, decomp); } + // Debugging - print out the sortkey [--srl] +// { +// const uint8_t *bytes; +// int32_t xcount; +// bytes = sortkey.getByteArray(xcount); +// // fprintf(stderr, "\n\n- [%02X] [%02X]\n\n", (int)(bytes[0]&0xFF), (int)(bytes[1]&0xFF) ); +// } + return sortkey; } @@ -1615,6 +1842,8 @@ RuleBasedCollator::increment(Collator::ECollationStrength aStrength, int32_t las data->maxTerOrder += 1; } break; + + // case IDENTICAL? } return lastValue; @@ -2017,12 +2246,6 @@ VectorOfInt *RuleBasedCollator::getExpandValueList(int32_t order) const return data->expandTable->at(order - EXPANDCHARINDEX); } -// Get the character order in the mapping table -int32_t -RuleBasedCollator::getUnicodeOrder(UChar ch) const -{ - return ucmp32_get(data->mapping, ch); -} void RuleBasedCollatorStreamer::streamIn(RuleBasedCollator* collator, FileStream* is) @@ -2117,7 +2340,7 @@ bool_t RuleBasedCollator::writeToFile(const char* fileName) const #ifdef COLLDEBUG fprintf(stderr, "binary write %s size %d %s\n", fileName, T_FileStream_size(ofs), - (!T_FileStream_error(ofs) ? ", OK" : ", FAIL"); + (!T_FileStream_error(ofs) ? 
", OK" : ", FAIL")); #endif bool_t err = T_FileStream_error(ofs) == 0; diff --git a/icu4c/source/i18n/tblcoll.h b/icu4c/source/i18n/tblcoll.h index b257ee77b15..f7be21fad6f 100644 --- a/icu4c/source/i18n/tblcoll.h +++ b/icu4c/source/i18n/tblcoll.h @@ -37,7 +37,10 @@ * 04/23/99 stephen Removed EDecompositionMode, merged with * Normalizer::EMode * 06/14/99 stephen Removed kResourceBundleSuffix -* +* 11/02/99 helena Collator performance enhancements. Eliminates the +* UnicodeString construction and special case for NO_OP. +* 11/23/99 srl More performance enhancements. Updates to NormalizerIterator +* internal state management. ******************************************************************************* */ @@ -420,6 +423,39 @@ public: const UnicodeString& target, int32_t length) const; + /** + * The comparison function compares the character data stored in two + * different string arrays. Returns information about whether a string + * array is less than, greater than or equal to another string array. + *

Example of use: + *

+   * .       UErrorCode status = U_ZERO_ERROR;
+   * .       Collator *myCollation = Collator::createInstance(Locale::US, status);
+   * .       if (U_FAILURE(status)) return;
+   * .       myCollation->setStrength(Collator::PRIMARY);
+   * .       // result would be Collator::EQUAL ("abc" == "ABC")
+   * .       // (no primary difference between "abc" and "ABC")
+   * .       Collator::EComparisonResult result = myCollation->compare(L"abc", 3, L"ABC", 3);
+   * .       myCollation->setStrength(Collator::TERTIARY);
+   * .       // result would be Collator::LESS ("abc" <<< "ABC")
+   * .       // (with tertiary difference between "abc" and "ABC")
+   * .       result = myCollation->compare(L"abc", 3, L"ABC", 3);
+   * 
+ * @param source the source string array to be compared with. + * @param sourceLength the length of the source string array. If this value + * is equal to -1, the string array is null-terminated. + * @param target the string that is to be compared with the source string. + * @param targetLength the length of the target string array. If this value + * is equal to -1, the string array is null-terminated. + * @return Returns a byte value. GREATER if source is greater + * than target; EQUAL if source is equal to target; LESS if source is less + * than target + **/ + virtual EComparisonResult compare( const UChar* source, + int32_t sourceLength, + const UChar* target, + int32_t targetLength) const ; + /** Transforms a specified region of the string into a series of characters * that can be compared with CollationKey.compare. Use a CollationKey when * you need to do repeated comparisions on the same string. For a single comparison @@ -433,6 +469,13 @@ public: virtual CollationKey& getCollationKey( const UnicodeString& source, CollationKey& key, UErrorCode& status) const; + + virtual CollationKey& getCollationKey(const UChar *source, + int32_t sourceLength, + CollationKey& key, + UErrorCode& status) const; + + /** * Generates the hash code for the rule-based collation object. * @return the hash code. @@ -705,11 +748,41 @@ private: const UnicodeString& name, const UnicodeString& suffix); - /** - * Chops off the last portion of the locale name. For example, from "en_US_CA" - * to "en_US" and "en_US" to "en". - * @param localeName the locale name. + /* Internal class for quick iteration over the text. 
+ 100% pure inline code */ + class NormalizerIterator { + public: + Normalizer *cursor; + VectorOfInt *bufferAlias; + int32_t swapOrder; + UChar* text; + int32_t expIndex; + int32_t textLen; + UTextOffset currentOffset; + + NormalizerIterator(void); + NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode); + ~NormalizerIterator(void); + void setText(const UChar* source, int32_t length, UErrorCode& status); + void setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status); + + UChar current(void) const; + UChar next(void); + void reset(void); + }; + + int32_t getStrengthOrder(NormalizerIterator* cursor, + UErrorCode status) const; + int32_t strengthOrder(int32_t value) const ; + int32_t nextContractChar(NormalizerIterator *cursor, + UChar ch, + UErrorCode& status) const; + /** + * Chops off the last portion of the locale name. For example, from "en_US_CA" + * to "en_US" and "en_US" to "en". + * @param localeName the locale name. 
+ */ static void chopLocale(UnicodeString& localeName); //-------------------------------------------------------------------------- @@ -751,12 +824,151 @@ private: UnicodeString sbuffer; UnicodeString tbuffer; UnicodeString key; - CollationElementIterator *sourceCursor; - CollationElementIterator *targetCursor; + NormalizerIterator *cursor1; + NormalizerIterator *cursor2; bool_t dataIsOwned; TableCollationData* data; }; +inline +RuleBasedCollator::NormalizerIterator::NormalizerIterator() : + cursor(0), + bufferAlias(0), + swapOrder(0), + text(0), + textLen(0), + currentOffset(0), + expIndex(0) +{ +} + +inline +RuleBasedCollator::NormalizerIterator::NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode) : + cursor(0), + bufferAlias(0), + swapOrder(0), + text(0), + textLen(0), + currentOffset(0), + expIndex(0) +{ + if (mode == Normalizer::NO_OP) { + text = (UChar*)source; + textLen = length; + currentOffset = 0; + } else { + cursor = new Normalizer(source, length, mode); + + } +} + +inline +RuleBasedCollator::NormalizerIterator::~NormalizerIterator() +{ + if (cursor != 0) { + delete cursor; + cursor = 0; + } +} + +inline +void +RuleBasedCollator::NormalizerIterator::setText(const UChar* source, int32_t length, UErrorCode& status) +{ + if (cursor == 0) { + text = (UChar*)source; + textLen = length; + currentOffset = 0; + + } else { + text = 0; + cursor->setText(source, length, status); + } + bufferAlias = 0; + swapOrder = 0; + expIndex = 0; + currentOffset = 0; +} + +/* You can only set mode after the comparision of two strings is completed. + Setting the mode in the middle of a comparison is not allowed. 
+ */ +inline +void + + +RuleBasedCollator::NormalizerIterator::setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status) +{ + if (cursor != NULL) { + if (mode != Normalizer::NO_OP) { + cursor->setMode(mode); + cursor->setText(source, length, status); + } else { + delete cursor; + cursor = 0; + + text = (UChar*)source; + textLen = length; + currentOffset = 0; + } + } else { + if(mode == Normalizer::NO_OP) + { + text = (UChar*)source; + textLen = length; + currentOffset = 0; + + } + else + { + cursor = new Normalizer(source, length, mode); + } + } + + bufferAlias = 0; + swapOrder = 0; + expIndex = 0; +} + +inline +UChar +RuleBasedCollator::NormalizerIterator::current(void) const +{ + if (text != 0) { + if(currentOffset >= textLen) + { + return Normalizer::DONE; + } + else + { + return text[currentOffset]; + } + } + + return cursor->current(); +} + + +inline +UChar +RuleBasedCollator::NormalizerIterator::next(void) +{ + if (text != 0) { + return ((currentOffset < textLen) ? text[++currentOffset] : Normalizer::DONE); + } + return cursor->next(); +} + +inline +void +RuleBasedCollator::NormalizerIterator::reset(void) +{ + currentOffset = 0; + if(cursor) + { + cursor->reset(); + } +} inline bool_t RuleBasedCollator::operator!=(const Collator& other) const @@ -772,4 +984,7 @@ RuleBasedCollator::addContractOrder(const UnicodeString &groupChars, addContractOrder(groupChars, anOrder, TRUE, status); } + + + #endif diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index 03b28c0bd63..9bfd3269672 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -138,11 +138,7 @@ ucol_strcoll( const UCollator *coll, const UChar *target, int32_t targetLength) { - int32_t srcLen = (sourceLength == -1 ? u_strlen(source) : sourceLength); - const UnicodeString tempSource((UChar*)source, sourceLength, sourceLength); - int32_t targLen = (targetLength == -1 ? 
u_strlen(target) : targetLength); - const UnicodeString tempTarget((UChar*)target, targLen, targLen); - return (UCollationResult) ((Collator*)coll)->compare(tempSource, tempTarget); + return (UCollationResult) ((Collator*)coll)->compare(source,sourceLength,target,targetLength); } U_CAPI bool_t @@ -290,12 +286,12 @@ ucol_getSortKey(const UCollator *coll, const uint8_t* bytes = NULL; CollationKey key; int32_t copyLen; - int32_t len = (sourceLength == -1 ? u_strlen(source) + int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength); - UnicodeString string((UChar*)source, len, len); + // UnicodeString string((UChar*)source, len, len); UErrorCode status = U_ZERO_ERROR; - ((Collator*)coll)->getCollationKey(string, key, status); + ((Collator*)coll)->getCollationKey(source, len, key, status); if(U_FAILURE(status)) return 0;