diff --git a/icu4c/source/i18n/coll.cpp b/icu4c/source/i18n/coll.cpp
index e4d48ef9093..58a156af641 100644
--- a/icu4c/source/i18n/coll.cpp
+++ b/icu4c/source/i18n/coll.cpp
@@ -35,6 +35,7 @@
// 6/20/97 helena Java class name change.
// 04/23/99 stephen Removed EDecompositionMode, merged with
// Normalizer::EMode
+// 11/23/99 srl Inlining of some critical functions
//=============================================================================
#include "colcache.h"
@@ -140,11 +141,6 @@ Collator::greater(const UnicodeString& source,
return (compare(source, target) == Collator::GREATER);
}
-Collator::ECollationStrength
-Collator::getStrength() const
-{
- return strength;
-}
void
Collator::setStrength(Collator::ECollationStrength newStrength)
@@ -152,11 +148,6 @@ Collator::setStrength(Collator::ECollationStrength newStrength)
strength = newStrength;
}
-Normalizer::EMode
-Collator::getDecomposition() const
-{
- return decmp;
-}
void
Collator::setDecomposition(Normalizer::EMode decompositionMode)
{
diff --git a/icu4c/source/i18n/coll.h b/icu4c/source/i18n/coll.h
index 0fa313202fc..957ec7dcf55 100644
--- a/icu4c/source/i18n/coll.h
+++ b/icu4c/source/i18n/coll.h
@@ -34,6 +34,10 @@
// 02/10/98 damiba Added compare() with length as parameter.
// 04/23/99 stephen Removed EDecompositionMode, merged with
// Normalizer::EMode.
+// 11/02/99 helena Collator performance enhancements. Eliminates the
+// UnicodeString construction and special case for NO_OP.
+// 11/23/99 srl More performance enhancements. Inlining of
+// critical accessors.
//=============================================================================
#ifndef COLL_H
@@ -299,6 +303,38 @@ public:
int32_t length) const = 0;
+ /**
+ * The comparison function compares the character data stored in two
+ * different string arrays. Returns information about whether a string
+ * array is less than, greater than or equal to another string array.
+ *
+ * Example of use:
+ *
+ * . UErrorCode status = U_ZERO_ERROR;
+ * . Collator *myCollation = Collator::createInstance(Locale::US, status);
+ * . if (U_FAILURE(status)) return;
+ * . myCollation->setStrength(Collator::PRIMARY);
+ * . // result would be Collator::EQUAL ("abc" == "ABC")
+ * . // (no primary difference between "abc" and "ABC")
+ * . Collator::EComparisonResult result = myCollation->compare(L"abc", 3, L"ABC", 3);
+ * . myCollation->setStrength(Collator::TERTIARY);
+ * . // result would be Collator::LESS ("abc" <<< "ABC")
+ * . // (with tertiary difference between "abc" and "ABC")
+ * . result = myCollation->compare(L"abc", 3, L"ABC", 3);
+ *
+ * @param source the source string array to be compared with.
+ * @param sourceLength the length of the source string array. If this value
+ * is equal to -1, the string array is null-terminated.
+ * @param target the string that is to be compared with the source string.
+ * @param targetLength the length of the target string array. If this value
+ * is equal to -1, the string array is null-terminated.
+ * @return Returns an EComparisonResult. GREATER if source is greater
+ * than target; EQUAL if source is equal to target; LESS if source is less
+ * than target
+ **/
+ virtual EComparisonResult compare( const UChar* source,
+ int32_t sourceLength,
+ const UChar* target,
+ int32_t targetLength) const = 0;
/** Transforms the string into a series of characters that can be compared
* with CollationKey::compareTo. It is not possible to restore the original
@@ -339,6 +375,24 @@ public:
virtual CollationKey& getCollationKey(const UnicodeString& source,
CollationKey& key,
UErrorCode& status) const = 0;
+
+ /** Transforms the string into a series of characters that can be compared
+ * with CollationKey::compareTo. It is not possible to restore the original
+ * string from the chars in the sort key. The generated sort key handles
+ * only a limited number of ignorable characters.
+ * Use CollationKey::equals or CollationKey::compare to compare the
+ * generated sort keys.
+ *
+ * If the source string is null, a null collation key will be returned.
+ * @param source the source string to be transformed into a sort key.
+ * @param sourceLength the length of the source string
+ * @param key the collation key to be filled in
+ * @param status the error code status.
+ * @return the collation key of the string based on the collation rules.
+ * @see CollationKey#compare
+ */
+ virtual CollationKey& getCollationKey(const UChar *source,
+ int32_t sourceLength,
+ CollationKey& key,
+ UErrorCode& status) const = 0;
/**
* Generates the hash code for the collation object
*/
@@ -503,4 +557,17 @@ Collator::operator!=(const Collator& other) const
return result;
}
+// Inline accessor: reports the comparison strength currently stored on
+// this collator.
+inline Collator::ECollationStrength
+Collator::getStrength() const
+{
+    return this->strength;
+}
+
+// Inline accessor: reports the decomposition (normalization) mode currently
+// stored on this collator.
+inline Normalizer::EMode
+Collator::getDecomposition() const
+{
+    return this->decmp;
+}
+
+
#endif
diff --git a/icu4c/source/i18n/tblcoll.cpp b/icu4c/source/i18n/tblcoll.cpp
index 18bd7c68779..6ff8827635e 100644
--- a/icu4c/source/i18n/tblcoll.cpp
+++ b/icu4c/source/i18n/tblcoll.cpp
@@ -43,7 +43,10 @@
* Normalizer::EMode
* 06/14/99 stephen Removed kResourceBundleSuffix
* 06/22/99 stephen Fixed logic in constructFromFile() since .ctx
-* files are no longer used.
+* files are no longer used.
+* 11/02/99 helena Collator performance enhancements. Special case
+* for NO_OP situations.
+* 11/17/99 srl More performance enhancements. Inlined some internal functions.
*******************************************************************************
*/
@@ -69,6 +72,8 @@
#include
+#include
+
class RuleBasedCollatorStreamer
{
@@ -129,14 +134,124 @@ const int16_t RuleBasedCollator::FILEID = 0x5443; // unique f
const char* RuleBasedCollator::kFilenameSuffix = ".col"; // binary collation file extension
char RuleBasedCollator::fgClassID = 0; // Value is irrelevant // class id
+//================ Some inline definitions of implementation functions........ ========
+
+// Looks up the order recorded for a single UChar in this collator's
+// mapping table and returns it unchanged.
+inline int32_t
+RuleBasedCollator::getUnicodeOrder(UChar ch) const
+{
+    const int32_t mappedOrder = ucmp32_get(data->mapping, ch);
+    return mappedOrder;
+}
+
+// Masks a collation order down to the levels that matter at the collator's
+// current strength: PRIMARY and SECONDARY each apply their own mask, any
+// other strength returns the value untouched.
+inline int32_t
+RuleBasedCollator::strengthOrder(int32_t value) const
+{
+    switch (getStrength())
+    {
+    case PRIMARY:
+        return (value & PRIMARYDIFFERENCEONLY);
+
+    case SECONDARY:
+        return (value & SECONDARYDIFFERENCEONLY);
+
+    default:
+        return value;
+    }
+}
+
+
+// Returns the next collation order for the text behind `cursor`.
+//
+// Orders are produced from three sources, checked in this order:
+//   1. the rest of an expansion started by an earlier call
+//      (cursor->bufferAlias / cursor->expIndex);
+//   2. the second half of an unmapped character (cursor->swapOrder);
+//   3. the next character of the text itself.
+// Orders from an expansion or from the mapping table are passed through
+// strengthOrder(); the two halves of an unmapped character are returned
+// as they are.
+//
+// Returns CollationElementIterator::NULLORDER when `status` already
+// indicates failure on entry or when the text is exhausted.
+//
+// Note: `status` is received by value, so nothing written to it in here
+// (it is handed to nextContractChar() by reference) reaches the caller.
+inline int32_t
+RuleBasedCollator::getStrengthOrder(NormalizerIterator* cursor,
+                                    UErrorCode status) const
+{
+    if (U_FAILURE(status))
+    {
+        return CollationElementIterator::NULLORDER;
+    }
+
+    if (cursor->bufferAlias != NULL)
+    {
+        // bufferAlias needs a bit of an explanation.
+        // When we hit an expanding character in the text, we call the order's
+        // getExpandValues method to retrieve an array of the orderings for all
+        // of the characters in the expansion (see the end of this method).
+        // The first ordering is returned, and an alias to the orderings array
+        // is saved so that the remaining orderings can be returned on subsequent
+        // calls to next. So, if the expanding buffer is not exhausted,
+        // all we have to do here is return the next ordering in the buffer.
+        if (cursor->expIndex < cursor->bufferAlias->size())
+        {
+            return strengthOrder(cursor->bufferAlias->at(cursor->expIndex++));
+        }
+        else
+        {
+            // Expansion fully consumed: forget it and fall through to the
+            // next character of the text.
+            cursor->bufferAlias = NULL;
+            cursor->expIndex = 0;
+        }
+    }
+    else if (cursor->swapOrder != 0)
+    {
+        // If we find a character with no order, we return the marking
+        // flag, UNMAPPEDCHARVALUE, 0x7fff0000, and then the character
+        // itself shifted left 16 bits as orders. At this point, the
+        // UNMAPPEDCHARVALUE flag has already been returned by the code
+        // below, so just return the shifted character here.
+        int32_t order = cursor->swapOrder << 16;
+
+        cursor->swapOrder = 0;
+
+        return order;
+    }
+
+    // Fetch the character under the cursor, then step past it.
+    UChar ch = cursor->current();
+    cursor->next();
+
+    if (ch == Normalizer::DONE) {
+        return CollationElementIterator::NULLORDER;
+    }
+
+    // Ask the collator for this character's ordering.
+    int32_t value = getUnicodeOrder(ch);
+
+    if (value == UNMAPPED)
+    {
+        // Return the "unmapped" flag now and save the character so it can
+        // be returned next time this method is called. A zero character
+        // cannot be queued that way (swapOrder == 0 means "nothing
+        // pending"), so it is returned directly as its own order.
+        if (ch == 0x0000) return ch;
+        cursor->swapOrder = ch;
+        return CollationElementIterator::UNMAPPEDCHARVALUE;
+    }
+
+    if (value >= CONTRACTCHARINDEX)
+    {
+        // Start of a contracting sequence: may consume further characters.
+        value = nextContractChar(cursor, ch, status);
+    }
+
+    if (value >= EXPANDCHARINDEX)
+    {
+        // Expanding character: return its first order now and leave the
+        // rest to be drained by the bufferAlias branch above.
+        cursor->bufferAlias = getExpandValueList(value);
+        cursor->expIndex = 0;
+        value = cursor->bufferAlias->at(cursor->expIndex++);
+    }
+
+    return strengthOrder(value);
+}
+
+// ==================== End inlines ============================================
+
+
//===============================================================================
RuleBasedCollator::RuleBasedCollator()
: Collator(),
isOverIgnore(FALSE),
mPattern(0),
- sourceCursor(0),
- targetCursor(0),
+ // sourceCursor(0),
+ //targetCursor(0),
+ cursor1(0),
+ cursor2(0),
data(0),
dataIsOwned(FALSE)
{
@@ -146,8 +261,10 @@ RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that)
: Collator(that),
isOverIgnore(that.isOverIgnore),
mPattern(0),
- sourceCursor(0),
- targetCursor(0),
+ // sourceCursor(0),
+ //targetCursor(0),
+ cursor1(0),
+ cursor2(0),
dataIsOwned(FALSE),
data(that.data) // Alias the data pointer
{
@@ -214,8 +331,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
: Collator(),
isOverIgnore(FALSE),
mPattern(0),
- sourceCursor(0),
- targetCursor(0),
+ // sourceCursor(0),
+ /// targetCursor(0),
+ cursor1(0),
+ cursor2(0),
data(0),
dataIsOwned(FALSE)
{
@@ -233,8 +352,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
: Collator(collationStrength, Normalizer::NO_OP),
isOverIgnore(FALSE),
mPattern(0),
- sourceCursor(0),
- targetCursor(0),
+ // sourceCursor(0),
+ // targetCursor(0),
+ cursor1(0),
+ cursor2(0),
data(0),
dataIsOwned(FALSE)
{
@@ -242,7 +363,6 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
{
return;
}
-
constructFromRules(rules, status);
}
@@ -252,8 +372,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
: Collator(TERTIARY, decompositionMode),
isOverIgnore(FALSE),
mPattern(0),
- sourceCursor(0),
- targetCursor(0),
+ // sourceCursor(0),
+ // targetCursor(0),
+ cursor1(0),
+ cursor2(0),
data(0),
dataIsOwned(FALSE)
{
@@ -272,8 +394,10 @@ RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
: Collator(collationStrength, decompositionMode),
isOverIgnore(FALSE),
mPattern(0),
- sourceCursor(0),
- targetCursor(0),
+ // sourceCursor(0),
+ //targetCursor(0),
+ cursor1(0),
+ cursor2(0),
data(0),
dataIsOwned(FALSE)
{
@@ -392,10 +516,14 @@ RuleBasedCollator::RuleBasedCollator( const Locale& desiredLocale,
isOverIgnore(FALSE),
dataIsOwned(FALSE),
data(0),
- sourceCursor(0),
- targetCursor(0),
+ // sourceCursor(0),
+ //targetCursor(0),
+ cursor1(0),
+ cursor2(0),
mPattern(0)
{
+
+
if (U_FAILURE(status))
{
return;
@@ -447,6 +575,18 @@ RuleBasedCollator::RuleBasedCollator( const Locale& desiredLocale,
return;
}
+ // srl write out default.col
+ {
+ UnicodeString defLocaleName = ResourceBundle::kDefaultFilename;
+ char *binaryFilePath = createPathName(Locale::getDataDirectory(),
+ defLocaleName, kFilenameSuffix);
+ bool_t ok = writeToFile(binaryFilePath);
+ delete [] binaryFilePath;
+#ifdef COLLDEBUG
+ cerr << defLocaleName << " [default] binary write " << (ok? "OK" : "Failed") << endl;
+#endif
+ }
+
data->desiredLocale = desiredLocale;
desiredLocale.getName(localeName);
data->realLocaleName = localeName;
@@ -567,7 +707,7 @@ RuleBasedCollator::constructFromFile( const Locale& locale,
// Try to load up the collation from a binary file first
constructFromFile(binaryFilePath, status);
#ifdef COLLDEBUG
- cerr << localeFileName << " binary load " << errorName(status) << endl;
+ cerr << localeFileName << kFilenameSuffix << " binary load " << errorName(status) << endl;
#endif
if(U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR)
return;
@@ -629,7 +769,7 @@ RuleBasedCollator::constructFromFile( const Locale& locale,
}
#ifdef COLLDEBUG
- cerr << localeFileName << " ascii load " << (U_SUCCESS(status) ? "OK" : "Failed") << endl;
+ cerr << localeFileName << " ascii load " << (U_SUCCESS(status) ? "OK" : "Failed") << " - try= " << (tryBinaryFile?"true":"false") << endl;
#endif
if(U_SUCCESS(status) && tryBinaryFile) {
@@ -655,11 +795,20 @@ RuleBasedCollator::~RuleBasedCollator()
data = 0;
- delete sourceCursor;
- sourceCursor = 0;
+ // delete sourceCursor;
+ // sourceCursor = 0;
- delete targetCursor;
- targetCursor = 0;
+ // delete targetCursor;
+ // targetCursor = 0;
+
+ if (cursor1 != NULL) {
+ delete cursor1;
+ cursor1 = 0;
+ }
+ if (cursor2 != NULL) {
+ delete cursor2;
+ cursor2 = 0;
+ }
delete mPattern;
mPattern = 0;
@@ -742,13 +891,13 @@ RuleBasedCollator::getRules() const
data->isRuleTableLoaded = TRUE;
#ifdef _DEBUG
// the following is useful for specific debugging purposes
- // UnicodeString name;
- // cerr << "Table collation rules loaded dynamically for "
- // << data->desiredLocale.getName(name)
- // << " at "
- // << data->realLocaleName
- // << ", " << dec << data->ruleTable.size() << " characters"
- // << endl;
+ UnicodeString name;
+ cerr << "Table collation rules loaded dynamically for "
+ << data->desiredLocale.getName(name)
+ << " at "
+ << data->realLocaleName
+ << ", " << dec << data->ruleTable.size() << " characters"
+ << endl;
#endif
}
else
@@ -762,6 +911,16 @@ RuleBasedCollator::getRules() const
<< endl;
cerr << "Status " << errorName(status) << ", mPattern " << temp.mPattern << endl;
#endif
+ /* SRL have to add this because we now have the situation where
+ DEFAULT is loaded from a binary file w/ no rules. */
+ UErrorCode intStatus = U_ZERO_ERROR;
+ temp.constructFromRules(RuleBasedCollator::DEFAULTRULES, intStatus);
+
+ if(U_SUCCESS(intStatus) && (temp.mPattern != 0))
+ {
+ data->ruleTable = temp.getRules();
+ data->isRuleTableLoaded = TRUE;
+ }
}
}
@@ -783,14 +942,15 @@ RuleBasedCollator::compare( const UnicodeString& source,
return (RuleBasedCollator::compare(source_togo, target_togo));
}
-
-// Compare two strings using this collator
-Collator::EComparisonResult
-RuleBasedCollator::compare(const UnicodeString& source,
- const UnicodeString& target) const
+Collator::EComparisonResult
+RuleBasedCollator::compare(const UChar* source,
+ int32_t sourceLength,
+ const UChar* target,
+ int32_t targetLength) const
{
// check if source and target are valid strings
- if (source.isBogus() || target.isBogus())
+ if (((source == 0) && (target == 0)) ||
+ ((sourceLength == 0) && (targetLength == 0)))
{
return Collator::EQUAL;
}
@@ -798,55 +958,36 @@ RuleBasedCollator::compare(const UnicodeString& source,
Collator::EComparisonResult result = Collator::EQUAL;
UErrorCode status = U_ZERO_ERROR;
- // The basic algorithm here is that we use CollationElementIterators
- // to step through both the source and target strings. We compare each
- // collation element in the source string against the corresponding one
- // in the target, checking for differences.
- //
- // If a difference is found, we set to LESS or GREATER to
- // indicate whether the source string is less or greater than the target.
- //
- // However, it's not that simple. If we find a tertiary difference
- // (e.g. 'A' vs. 'a') near the beginning of a string, it can be
- // overridden by a primary difference (e.g. "A" vs. "B") later in
- // the string. For example, "AA" < "aB", even though 'A' > 'a'.
- //
- // To keep track of this, we use checkSecTer and checkTertiary to keep
- // track of the strength of the most significant difference that has been
- // found so far. When we find a difference whose strength is greater than
- // the previous ones, it overrides the last difference (if any) that
- // was found.
- //
-
- if (sourceCursor == NULL)
+ if (cursor1 == NULL)
{
- ((RuleBasedCollator *)this)->sourceCursor = createCollationElementIterator(source);
+ ((RuleBasedCollator *)this)->cursor1 = new NormalizerIterator(source, sourceLength, getDecomposition());
}
else
{
- sourceCursor->setText(source, status);
+ cursor1->setModeAndText(getDecomposition(), source, sourceLength, status);
}
- if (sourceCursor == NULL || U_FAILURE(status))
+ if ( /*cursor1->cursor == NULL ||*/ U_FAILURE(status))
{
return Collator::EQUAL;
}
- if (targetCursor == NULL)
+ if (cursor2 == NULL)
{
- ((RuleBasedCollator *)this)->targetCursor = createCollationElementIterator(target);
+ ((RuleBasedCollator *)this)->cursor2 = new NormalizerIterator(target, targetLength, getDecomposition());
}
else
{
- targetCursor->setText(target, status);
+ cursor2->setModeAndText(getDecomposition(), target, targetLength, status);
}
- if (targetCursor == NULL || U_FAILURE(status))
+ if (/*cursor2 == NULL ||*/ U_FAILURE(status))
{
return Collator::EQUAL;
}
int32_t sOrder, tOrder;
+ // int32_t sOrder = CollationElementIterator::NULLORDER, tOrder = CollationElementIterator::NULLORDER;
bool_t gets = TRUE, gett = TRUE;
bool_t initialCheckSecTer = getStrength() >= Collator::SECONDARY;
bool_t checkSecTer = initialCheckSecTer;
@@ -860,7 +1001,7 @@ RuleBasedCollator::compare(const UnicodeString& source,
// we've been requested to skip it.
if (gets)
{
- sOrder = sourceCursor->next(status);
+ sOrder = getStrengthOrder((NormalizerIterator*)cursor1, status);
if (U_FAILURE(status))
{
@@ -872,7 +1013,7 @@ RuleBasedCollator::compare(const UnicodeString& source,
if (gett)
{
- tOrder = targetCursor->next(status);
+ tOrder = getStrengthOrder((NormalizerIterator*)cursor2, status);
if (U_FAILURE(status))
{
@@ -1036,7 +1177,7 @@ RuleBasedCollator::compare(const UnicodeString& source,
}
}
}
- while ((sOrder = sourceCursor->next(status)) != CollationElementIterator::NULLORDER);
+ while ((sOrder = getStrengthOrder(cursor1, status)) != CollationElementIterator::NULLORDER);
}
else if (tOrder != CollationElementIterator::NULLORDER)
{
@@ -1060,7 +1201,7 @@ RuleBasedCollator::compare(const UnicodeString& source,
}
}
}
- while ((tOrder = targetCursor->next(status)) != CollationElementIterator::NULLORDER);
+ while ((tOrder = getStrengthOrder(cursor2, status)) != CollationElementIterator::NULLORDER);
}
@@ -1070,15 +1211,46 @@ RuleBasedCollator::compare(const UnicodeString& source,
// puts the result of the string comparison directly into result
if (result == Collator::EQUAL && getStrength() == IDENTICAL)
{
- UnicodeString sourceDecomp, targetDecomp;
+#if 0
+ // ******** for the UChar normalization interface.
+ // It doesn't work much faster, and the code was broken
+ // so it's commented out. --srl
+// UChar sourceDecomp[1024], targetDecomp[1024];
+// int32_t sourceDecompLength = 1024;
+// int32_t targetDecompLength = 1024;
+
+// int8_t comparison;
+// Normalizer::EMode decompMode = getDecomposition();
+
+// if (decompMode != Normalizer::NO_OP)
+// {
+// Normalizer::normalize(source, sourceLength, decompMode,
+// 0, sourceDecomp, sourceDecompLength, status);
+
+// Normalizer::normalize(target, targetLength, decompMode,
+// 0, targetDecomp, targetDecompLength, status);
+
+// comparison = u_strcmp(sourceDecomp,targetDecomp);
+// }
+// else
+// {
+// comparison = u_strcmp(source, target); /* ! */
+// }
+
+#else
+
+ UnicodeString sourceDecomp, targetDecomp;
+
int8_t comparison;
Normalizer::normalize(source, getDecomposition(),
- 0, sourceDecomp, status);
+ 0, sourceDecomp, status);
+
Normalizer::normalize(target, getDecomposition(),
- 0, targetDecomp, status);
+ 0, targetDecomp, status);
comparison = sourceDecomp.compare(targetDecomp);
+#endif
if (comparison < 0)
{
@@ -1097,6 +1269,49 @@ RuleBasedCollator::compare(const UnicodeString& source,
return result;
}
+
+// Resolves the order for a character `ch` that starts a contracting
+// sequence. Beginning with the order of `ch` on its own, characters are
+// appended from `cursor` one at a time for as long as the grown sequence is
+// still found in the contraction list; the order of the longest sequence
+// found is returned. Only characters that extend a match are consumed from
+// `cursor`. `status` is not touched by this method.
+int32_t
+RuleBasedCollator::nextContractChar(NormalizerIterator *cursor,
+                                    UChar ch,
+                                    UErrorCode& status) const
+{
+    // `key` is a member used as scratch space; this method is const, so its
+    // constness is cast away once here.
+    UnicodeString& candidate = (UnicodeString&)key;
+
+    VectorOfPToContractElement *contractions = getContractValues(ch);
+
+    // Entry 0 holds the ordering of the single character.
+    int32_t bestOrder = ((EntryPair *)contractions->at(0))->value;
+
+    candidate.remove();
+    candidate += ch;
+
+    for (;;)
+    {
+        ch = cursor->current();
+        if (ch == Normalizer::DONE)
+        {
+            break;
+        }
+
+        candidate += ch;
+
+        const int32_t index = getEntry(contractions, key, TRUE);
+        if (index == UNMAPPED)
+        {
+            // The longer sequence is unknown; leave this character unread.
+            break;
+        }
+
+        cursor->next();
+        bestOrder = ((EntryPair *)contractions->at(index))->value;
+    }
+
+    return bestOrder;
+}
+
+// Compare two strings using this collator.
+// Thin adapter: hands the character buffers and lengths of both strings to
+// the UChar-array overload of compare().
+Collator::EComparisonResult
+RuleBasedCollator::compare(const UnicodeString& source,
+                           const UnicodeString& target) const
+{
+    const UChar* const sourceChars = source.getUChars();
+    const int32_t sourceLen = source.length();
+    const UChar* const targetChars = target.getUChars();
+    const int32_t targetLen = target.length();
+
+    return compare(sourceChars, sourceLen, targetChars, targetLen);
+}
+
// Retrieve a collation key for the specified string
// The key can be compared with other collation keys using a bitwise comparison
// (e.g. memcmp) to find the ordering of their respective source strings.
@@ -1134,6 +1349,15 @@ CollationKey&
RuleBasedCollator::getCollationKey( const UnicodeString& source,
CollationKey& sortkey,
UErrorCode& status) const
+{
+ return RuleBasedCollator::getCollationKey(source.getUChars(), source.size(), sortkey, status);
+}
+
+CollationKey&
+RuleBasedCollator::getCollationKey( const UChar* source,
+ int32_t sourceLen,
+ CollationKey& sortkey,
+ UErrorCode& status) const
{
if (U_FAILURE(status))
{
@@ -1141,27 +1365,21 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
return sortkey.setToBogus();
}
- if (source.isBogus())
- {
- status = U_MEMORY_ALLOCATION_ERROR;
- return sortkey.setToBogus();
- }
-
- if (source.size() == 0)
+ if ((!source) || (sourceLen == 0))
{
return sortkey.reset();
}
- if (sourceCursor == NULL)
+ if (cursor1 == NULL)
{
- ((RuleBasedCollator *)this)->sourceCursor = createCollationElementIterator(source);
+ ((RuleBasedCollator *)this)->cursor1 = new NormalizerIterator(source, sourceLen, getDecomposition());
}
else
{
- sourceCursor->setText(source, status);
+ cursor1->setModeAndText(getDecomposition(), source,sourceLen, status);
}
- if (sourceCursor == NULL || U_FAILURE(status))
+ if (U_FAILURE(status))
{
return sortkey.setToBogus();
}
@@ -1177,7 +1395,8 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
UnicodeString decomp;
// iterate over the source, counting primary, secondary, and tertiary entries
- while((order = sourceCursor->next(status)) != CollationElementIterator::NULLORDER)
+ while((order = getStrengthOrder((NormalizerIterator*)cursor1, status)) !=
+ CollationElementIterator::NULLORDER)
{
int32_t secOrder = CollationElementIterator::secondaryOrder(order);
int32_t terOrder = CollationElementIterator::tertiaryOrder(order);
@@ -1230,7 +1449,7 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
if (compareIdent)
{
- Normalizer::normalize(source, getDecomposition(),
+ Normalizer::normalize(source, getDecomposition(), // SRL: ??
0, decomp, status);
if (U_SUCCESS(status))
@@ -1259,10 +1478,10 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
int32_t identCursor = terCursor + (2 * totalTer);
// reset source to the beginning
- sourceCursor->reset();
+ cursor1->reset();
// now iterate over the source computing the actual entries
- while((order = sourceCursor->next(status)) != CollationElementIterator::NULLORDER)
+ while((order = getStrengthOrder((NormalizerIterator*)cursor1, status)) != CollationElementIterator::NULLORDER)
{
if (U_FAILURE(status))
{
@@ -1336,6 +1555,14 @@ RuleBasedCollator::getCollationKey( const UnicodeString& source,
sortkey.storeUnicodeString(identCursor, decomp);
}
+ // Debugging - print out the sortkey [--srl]
+// {
+// const uint8_t *bytes;
+// int32_t xcount;
+// bytes = sortkey.getByteArray(xcount);
+// // fprintf(stderr, "\n\n- [%02X] [%02X]\n\n", (int)(bytes[0]&0xFF), (int)(bytes[1]&0xFF) );
+// }
+
return sortkey;
}
@@ -1615,6 +1842,8 @@ RuleBasedCollator::increment(Collator::ECollationStrength aStrength, int32_t las
data->maxTerOrder += 1;
}
break;
+
+ // case IDENTICAL?
}
return lastValue;
@@ -2017,12 +2246,6 @@ VectorOfInt *RuleBasedCollator::getExpandValueList(int32_t order) const
return data->expandTable->at(order - EXPANDCHARINDEX);
}
-// Get the character order in the mapping table
-int32_t
-RuleBasedCollator::getUnicodeOrder(UChar ch) const
-{
- return ucmp32_get(data->mapping, ch);
-}
void RuleBasedCollatorStreamer::streamIn(RuleBasedCollator* collator, FileStream* is)
@@ -2117,7 +2340,7 @@ bool_t RuleBasedCollator::writeToFile(const char* fileName) const
#ifdef COLLDEBUG
fprintf(stderr, "binary write %s size %d %s\n", fileName, T_FileStream_size(ofs),
- (!T_FileStream_error(ofs) ? ", OK" : ", FAIL");
+ (!T_FileStream_error(ofs) ? ", OK" : ", FAIL"));
#endif
bool_t err = T_FileStream_error(ofs) == 0;
diff --git a/icu4c/source/i18n/tblcoll.h b/icu4c/source/i18n/tblcoll.h
index b257ee77b15..f7be21fad6f 100644
--- a/icu4c/source/i18n/tblcoll.h
+++ b/icu4c/source/i18n/tblcoll.h
@@ -37,7 +37,10 @@
* 04/23/99 stephen Removed EDecompositionMode, merged with
* Normalizer::EMode
* 06/14/99 stephen Removed kResourceBundleSuffix
-*
+* 11/02/99 helena Collator performance enhancements. Eliminates the
+* UnicodeString construction and special case for NO_OP.
+* 11/23/99 srl More performance enhancements. Updates to NormalizerIterator
+* internal state management.
*******************************************************************************
*/
@@ -420,6 +423,39 @@ public:
const UnicodeString& target,
int32_t length) const;
+ /**
+ * The comparison function compares the character data stored in two
+ * different string arrays. Returns information about whether a string
+ * array is less than, greater than or equal to another string array.
+ * Example of use:
+ *
+ * . UErrorCode status = U_ZERO_ERROR;
+ * . Collator *myCollation = Collator::createInstance(Locale::US, status);
+ * . if (U_FAILURE(status)) return;
+ * . myCollation->setStrength(Collator::PRIMARY);
+ * . // result would be Collator::EQUAL ("abc" == "ABC")
+ * . // (no primary difference between "abc" and "ABC")
+ * . Collator::EComparisonResult result = myCollation->compare(L"abc", 3, L"ABC", 3);
+ * . myCollation->setStrength(Collator::TERTIARY);
+ * . // result would be Collator::LESS ("abc" <<< "ABC")
+ * . // (with tertiary difference between "abc" and "ABC")
+ * . result = myCollation->compare(L"abc", 3, L"ABC", 3);
+ *
+ * @param source the source string array to be compared with.
+ * @param sourceLength the length of the source string array. If this value
+ * is equal to -1, the string array is null-terminated.
+ * @param target the string that is to be compared with the source string.
+ * @param targetLength the length of the target string array. If this value
+ * is equal to -1, the string array is null-terminated.
+ * @return Returns an EComparisonResult. GREATER if source is greater
+ * than target; EQUAL if source is equal to target; LESS if source is less
+ * than target
+ **/
+ virtual EComparisonResult compare( const UChar* source,
+ int32_t sourceLength,
+ const UChar* target,
+ int32_t targetLength) const ;
+
/** Transforms a specified region of the string into a series of characters
* that can be compared with CollationKey.compare. Use a CollationKey when
* you need to do repeated comparisions on the same string. For a single comparison
@@ -433,6 +469,13 @@ public:
virtual CollationKey& getCollationKey( const UnicodeString& source,
CollationKey& key,
UErrorCode& status) const;
+ /** Transforms a UChar array into a collation key; the UChar-array
+ * counterpart of getCollationKey(const UnicodeString&, ...).
+ * @param source the source text to be transformed into a sort key.
+ * @param sourceLength the length of the source text.
+ * @param key the collation key to be filled in.
+ * @param status the error code status. If it already indicates failure
+ * on entry, the key is set to bogus.
+ * @return the key passed in. It is reset when source is null or
+ * sourceLength is 0.
+ */
+ virtual CollationKey& getCollationKey(const UChar *source,
+ int32_t sourceLength,
+ CollationKey& key,
+ UErrorCode& status) const;
+
+
/**
* Generates the hash code for the rule-based collation object.
* @return the hash code.
@@ -705,11 +748,41 @@ private:
const UnicodeString& name,
const UnicodeString& suffix);
- /**
- * Chops off the last portion of the locale name. For example, from "en_US_CA"
- * to "en_US" and "en_US" to "en".
- * @param localeName the locale name.
+ /* Internal class for quick iteration over the text.
+ 100% pure inline code
*/
+    /* Walks the text being collated one UChar at a time and carries the
+       per-string state needed by getStrengthOrder().
+
+       Two modes, selected by the normalization mode given to the
+       constructor or to setModeAndText():
+         - Normalizer::NO_OP: `text` aliases the caller's buffer and is read
+           directly; `cursor` is 0.
+         - any other mode: `cursor` is a Normalizer over the text and `text`
+           is 0.
+
+       NOTE(review): lengths are stored as given; in raw mode a textLen of
+       -1 makes current() report DONE at once. Confirm that callers resolve
+       a -1 ("null-terminated") length before it gets here.
+    */
+    class NormalizerIterator {
+    public:
+        Normalizer *cursor;        // normalizing iterator, deleted by the destructor; 0 in raw mode
+        VectorOfInt *bufferAlias;  // expansion orders still to be returned, or 0
+        int32_t swapOrder;         // unmapped character still to be returned, or 0
+        UChar* text;               // raw-mode text (caller's buffer, never freed here), or 0
+        int32_t expIndex;          // index of the next entry to read from bufferAlias
+        int32_t textLen;           // number of UChars in text
+        UTextOffset currentOffset; // raw-mode read position within text
+
+        NormalizerIterator(void);
+        NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode);
+        ~NormalizerIterator(void);
+        void setText(const UChar* source, int32_t length, UErrorCode& status);
+        void setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status);
+
+        UChar current(void) const;
+        UChar next(void);
+        void reset(void);
+
+    private:
+        // Not copyable: the destructor deletes `cursor`, so a member-wise
+        // copy would delete it twice. Declared but intentionally not defined.
+        NormalizerIterator(const NormalizerIterator& other);
+        NormalizerIterator& operator=(const NormalizerIterator& other);
+    };
+ /* Helpers that produce collation orders from a NormalizerIterator.
+  *
+  * getStrengthOrder: returns the next order for the text behind cursor,
+  *   or CollationElementIterator::NULLORDER when status indicates failure
+  *   or the text is exhausted. status is taken by value.
+  * strengthOrder: masks an order according to the current strength
+  *   (PRIMARY and SECONDARY apply a mask, anything else returns value).
+  * nextContractChar: returns the order of the longest contracting
+  *   sequence that starts with ch, reading further characters from cursor.
+  */
+ int32_t getStrengthOrder(NormalizerIterator* cursor,
+ UErrorCode status) const;
+ int32_t strengthOrder(int32_t value) const ;
+ int32_t nextContractChar(NormalizerIterator *cursor,
+ UChar ch,
+ UErrorCode& status) const;
+ /**
+ * Chops off the last portion of the locale name. For example, from "en_US_CA"
+ * to "en_US" and "en_US" to "en".
+ * @param localeName the locale name.
+ */
static void chopLocale(UnicodeString& localeName);
//--------------------------------------------------------------------------
@@ -751,12 +824,151 @@ private:
UnicodeString sbuffer;
UnicodeString tbuffer;
UnicodeString key;
- CollationElementIterator *sourceCursor;
- CollationElementIterator *targetCursor;
+ NormalizerIterator *cursor1;
+ NormalizerIterator *cursor2;
bool_t dataIsOwned;
TableCollationData* data;
};
+// Creates an iterator with no text attached: every member is zero.
+// Initializers are listed in member declaration order, which is the order
+// in which they actually run.
+inline
+RuleBasedCollator::NormalizerIterator::NormalizerIterator() :
+    cursor(0),
+    bufferAlias(0),
+    swapOrder(0),
+    text(0),
+    expIndex(0),
+    textLen(0),
+    currentOffset(0)
+{
+}
+
+// Creates an iterator over `length` UChars starting at `source`.
+// With Normalizer::NO_OP the buffer is aliased (not copied) and read
+// directly; with any other mode a Normalizer is allocated over the text and
+// `text` stays 0.
+// Initializers are listed in member declaration order, which is the order
+// in which they actually run.
+inline
+RuleBasedCollator::NormalizerIterator::NormalizerIterator(const UChar* source, int32_t length, Normalizer::EMode mode) :
+    cursor(0),
+    bufferAlias(0),
+    swapOrder(0),
+    text(0),
+    expIndex(0),
+    textLen(0),
+    currentOffset(0)
+{
+    if (mode == Normalizer::NO_OP) {
+        // Raw mode: the constness of the caller's buffer is cast away only
+        // so it can be stored in the UChar* member.
+        text = (UChar*)source;
+        textLen = length;
+    } else {
+        cursor = new Normalizer(source, length, mode);
+    }
+}
+
+// Releases the Normalizer, if one was allocated. `text` is not freed here.
+inline
+RuleBasedCollator::NormalizerIterator::~NormalizerIterator()
+{
+    // Deleting a null pointer is a no-op, so no guard is required.
+    delete cursor;
+    cursor = 0;
+}
+
+// Points the iterator at new text without changing its mode: when a
+// Normalizer exists the text is handed to it, otherwise the buffer is
+// aliased for raw reading. Any pending expansion or unmapped-character
+// state is discarded and the read position returns to the start.
+inline
+void
+RuleBasedCollator::NormalizerIterator::setText(const UChar* source, int32_t length, UErrorCode& status)
+{
+    if (cursor != 0) {
+        text = 0;
+        cursor->setText(source, length, status);
+    } else {
+        text = (UChar*)source;
+        textLen = length;
+    }
+
+    currentOffset = 0;
+    bufferAlias = 0;
+    swapOrder = 0;
+    expIndex = 0;
+}
+
+/* Points the iterator at new text and selects how it is read.
+
+   mode == Normalizer::NO_OP: any Normalizer is released and the caller's
+   buffer is aliased for raw reading.
+   any other mode: the raw buffer is dropped and a Normalizer (reused when
+   one already exists) is set up over the text.
+
+   Any pending expansion or unmapped-character state is discarded.
+
+   You can only set mode after the comparison of two strings is completed.
+   Setting the mode in the middle of a comparison is not allowed.
+ */
+inline
+void
+RuleBasedCollator::NormalizerIterator::setModeAndText(Normalizer::EMode mode, const UChar* source, int32_t length, UErrorCode& status)
+{
+    if (mode == Normalizer::NO_OP) {
+        // Raw mode needs no Normalizer (deleting a null pointer is a no-op).
+        delete cursor;
+        cursor = 0;
+
+        text = (UChar*)source;
+        textLen = length;
+    } else {
+        // current() and next() read the raw buffer whenever `text` is
+        // non-null, so a buffer left over from an earlier NO_OP call must be
+        // dropped here or the Normalizer would never be consulted.
+        text = 0;
+        textLen = 0;
+
+        if (cursor != 0) {
+            cursor->setMode(mode);
+            cursor->setText(source, length, status);
+        } else {
+            cursor = new Normalizer(source, length, mode);
+        }
+    }
+
+    currentOffset = 0;
+    bufferAlias = 0;
+    swapOrder = 0;
+    expIndex = 0;
+}
+
+// Returns the character at the current position without advancing, or
+// Normalizer::DONE when the position is at or past the end of the text.
+// An iterator with neither raw text nor a Normalizer attached (the
+// default-constructed state) also reports Normalizer::DONE.
+inline
+UChar
+RuleBasedCollator::NormalizerIterator::current(void) const
+{
+    if (text != 0) {
+        // Raw mode: read straight from the aliased buffer.
+        if (currentOffset >= textLen)
+        {
+            return Normalizer::DONE;
+        }
+        return text[currentOffset];
+    }
+
+    if (cursor == 0) {
+        // Nothing attached: there is nothing to read.
+        return Normalizer::DONE;
+    }
+
+    return cursor->current();
+}
+
+
+// Advances one position and returns the character now under the cursor,
+// or Normalizer::DONE once the end of the text has been reached.
+// In raw mode the position never moves past textLen and the buffer is never
+// read at index textLen. An iterator with neither raw text nor a Normalizer
+// attached reports Normalizer::DONE.
+inline
+UChar
+RuleBasedCollator::NormalizerIterator::next(void)
+{
+    if (text != 0) {
+        if (currentOffset < textLen) {
+            ++currentOffset;
+        }
+        // Re-check after stepping: the step may have landed on the end.
+        if (currentOffset < textLen) {
+            return text[currentOffset];
+        }
+        return Normalizer::DONE;
+    }
+
+    if (cursor == 0) {
+        return Normalizer::DONE;
+    }
+
+    return cursor->next();
+}
+
+// Rewinds the iterator to the start of its current text.
+// Pending expansion and unmapped-character state is discarded as well, so
+// that the first order produced after a reset comes from the first
+// character of the text rather than from a leftover of the old position.
+inline
+void
+RuleBasedCollator::NormalizerIterator::reset(void)
+{
+    currentOffset = 0;
+    bufferAlias = 0;
+    swapOrder = 0;
+    expIndex = 0;
+
+    if (cursor != 0)
+    {
+        cursor->reset();
+    }
+}
inline bool_t
RuleBasedCollator::operator!=(const Collator& other) const
@@ -772,4 +984,7 @@ RuleBasedCollator::addContractOrder(const UnicodeString &groupChars,
addContractOrder(groupChars, anOrder, TRUE, status);
}
+
+
+
#endif
diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp
index 03b28c0bd63..9bfd3269672 100644
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@@ -138,11 +138,7 @@ ucol_strcoll( const UCollator *coll,
const UChar *target,
int32_t targetLength)
{
- int32_t srcLen = (sourceLength == -1 ? u_strlen(source) : sourceLength);
- const UnicodeString tempSource((UChar*)source, sourceLength, sourceLength);
- int32_t targLen = (targetLength == -1 ? u_strlen(target) : targetLength);
- const UnicodeString tempTarget((UChar*)target, targLen, targLen);
- return (UCollationResult) ((Collator*)coll)->compare(tempSource, tempTarget);
+ return (UCollationResult) ((Collator*)coll)->compare(source,sourceLength,target,targetLength);
}
U_CAPI bool_t
@@ -290,12 +286,12 @@ ucol_getSortKey(const UCollator *coll,
const uint8_t* bytes = NULL;
CollationKey key;
int32_t copyLen;
- int32_t len = (sourceLength == -1 ? u_strlen(source)
+ int32_t len = (sourceLength == -1 ? u_strlen(source)
: sourceLength);
- UnicodeString string((UChar*)source, len, len);
+ // UnicodeString string((UChar*)source, len, len);
UErrorCode status = U_ZERO_ERROR;
- ((Collator*)coll)->getCollationKey(string, key, status);
+ ((Collator*)coll)->getCollationKey(source, len, key, status);
if(U_FAILURE(status))
return 0;