ICU-96 collation data generated so that collation elements never have 00 or 01 bytes in them

X-SVN-Rev: 3179
This commit is contained in:
Vladimir Weinstein 2000-12-08 22:03:01 +00:00
parent 25fbf90920
commit 7cdd2c8ecb
4 changed files with 98 additions and 59 deletions

View file

@ -139,6 +139,16 @@ const int32_t RuleBasedCollator::SECONDARYORDERSHIFT = 8; // secondar
// Definitions of RuleBasedCollator's static constants.
// Per the change description, collation elements are generated so that no byte of an
// element is 0x00 or 0x01; the "ignorable" markers below are therefore 0x02-based
// rather than zero.
const int32_t RuleBasedCollator::SORTKEYOFFSET = 1; // minimum sort key offset
const int32_t RuleBasedCollator::CONTRACTCHAROVERFLOW = 0x7FFFFFFF; // Indicates the char is a contract char
// build() initializes its running `order` value with COLELEMENTSTART.
const int32_t RuleBasedCollator::COLELEMENTSTART = 0x02020202; // starting value for collation elements
// increment() tests (lastValue & PRIMARYLOWZEROMASK) == 0 after a primary increment and,
// when that holds, adds PRIMARYORDERINCREMENT two more times.
const int32_t RuleBasedCollator::PRIMARYLOWZEROMASK = 0x00FF0000; // testing mask for primary low element
// OR-ed into the value by increment() after a primary increment ("Start all values from 02").
const int32_t RuleBasedCollator::RESETSECONDARYTERTIARY = 0x00000202;// resetting value for secondaries and tertiaries
// OR-ed into the value by increment() after a secondary increment.
const int32_t RuleBasedCollator::RESETTERTIARY = 0x00000002; // resetting value for tertiaries
// Whole collation element value that compareEx() treats as entirely ignorable.
const int32_t RuleBasedCollator::IGNORABLE = 0x02020202;
// Value compared against CollationElementIterator::primaryOrder() results to detect an ignorable primary.
const int32_t RuleBasedCollator::PRIMIGNORABLE = 0x0202;
// Value compared against secondary orders to detect an ignorable secondary.
const int32_t RuleBasedCollator::SECIGNORABLE = 0x02;
// Value compared against tertiary orders to detect an ignorable tertiary.
const int32_t RuleBasedCollator::TERIGNORABLE = 0x02;
const int16_t RuleBasedCollator::FILEID = 0x5443; // unique file id for parity check
const char* RuleBasedCollator::kFilenameSuffix = ".col"; // binary collation file extension
char RuleBasedCollator::fgClassID = 0; // Value is irrelevant // class id
@ -1316,7 +1326,7 @@ RuleBasedCollator::compareEx(const UChar* source,
pTOrder = CollationElementIterator::primaryOrder(tOrder);
if (sOrder == tOrder)
{
if (isFrenchSec && pSOrder != 0)
if (isFrenchSec && pSOrder != SECIGNORABLE)
{
if (!checkSecTer)
{
@ -1336,7 +1346,7 @@ RuleBasedCollator::compareEx(const UChar* source,
// Compare primary differences first.
if (pSOrder != pTOrder)
{
if (sOrder == 0)
if (sOrder == IGNORABLE)
{
// The entire source element is ignorable.
// Skip to the next source element, but don't fetch another target element.
@ -1344,7 +1354,7 @@ RuleBasedCollator::compareEx(const UChar* source,
continue;
}
if (tOrder == 0)
if (tOrder == IGNORABLE)
{
gets = FALSE;
continue;
@ -1352,7 +1362,7 @@ RuleBasedCollator::compareEx(const UChar* source,
// The source and target elements aren't ignorable, but it's still possible
// for the primary component of one of the elements to be ignorable....
if (pSOrder == 0) // primary order in source is ignorable
if (pSOrder == PRIMIGNORABLE) // primary order in source is ignorable
{
// The source's primary is ignorable, but the target's isn't. We treat ignorables
// as a secondary difference, so remember that we found one.
@ -1365,7 +1375,7 @@ RuleBasedCollator::compareEx(const UChar* source,
// Skip to the next source element, but don't fetch another target element.
gett = FALSE;
}
else if (pTOrder == 0)
else if (pTOrder == PRIMIGNORABLE)
{
// record differences - see the comment above.
if (checkSecTer)
@ -1440,14 +1450,14 @@ RuleBasedCollator::compareEx(const UChar* source,
// The source string has more elements, but the target string hasn't.
do
{
if (CollationElementIterator::primaryOrder(sOrder) != 0)
if (CollationElementIterator::primaryOrder(sOrder) != PRIMIGNORABLE)
{
// We found an additional non-ignorable base character in the source string.
// This is a primary difference, so the source is greater
return Collator::GREATER; // (strength is PRIMARY)
}
if (CollationElementIterator::secondaryOrder(sOrder) != 0)
if (CollationElementIterator::secondaryOrder(sOrder) != SECIGNORABLE)
{
// Additional secondary elements mean the source string is greater
if (checkSecTer)
@ -1464,14 +1474,14 @@ RuleBasedCollator::compareEx(const UChar* source,
// The target string has more elements, but the source string hasn't.
do
{
if (CollationElementIterator::primaryOrder(tOrder) != 0)
if (CollationElementIterator::primaryOrder(tOrder) != PRIMIGNORABLE)
{
// We found an additional non-ignorable base character in the target string.
// This is a primary difference, so the source is less
return Collator::LESS; // (strength is PRIMARY)
}
if (CollationElementIterator::secondaryOrder(tOrder) != 0)
if (CollationElementIterator::secondaryOrder(tOrder) != SECIGNORABLE)
{
// Additional secondary elements in the target mean the source string is less
if (checkSecTer)
@ -1770,12 +1780,12 @@ RuleBasedCollator::getCollationKeyEx( const UChar* source,
}
else
{
if (compareSec && secOrder != 0)
if (compareSec && secOrder != SECIGNORABLE)
{
totalSec += 1;
}
if (compareTer && terOrder != 0)
if (compareTer && terOrder != TERIGNORABLE)
{
totalTer += 1;
}
@ -1863,12 +1873,12 @@ RuleBasedCollator::getCollationKeyEx( const UChar* source,
}
else
{
if (compareSec && secOrder != 0)
if (compareSec && secOrder != SECIGNORABLE)
{
secCursor = sortkey.storeBytes(secCursor, secOrder + data->maxSecOrder + SORTKEYOFFSET);
}
if (compareTer && terOrder != 0)
if (compareTer && terOrder != TERIGNORABLE)
{
terCursor = sortkey.storeBytes(terCursor, terOrder + data->maxTerOrder + SORTKEYOFFSET);
}
@ -1964,7 +1974,7 @@ RuleBasedCollator::build(const UnicodeString& pattern,
return;
}
int32_t order = 0;
int32_t order = COLELEMENTSTART;
// Walk through each entry
for (i = 0; i < mPattern->getCount(); ++i)
@ -2162,7 +2172,13 @@ RuleBasedCollator::increment(Collator::ECollationStrength aStrength, int32_t las
case Collator::PRIMARY:
// increment primary order and mask off secondary and tertiary difference
lastValue += PRIMARYORDERINCREMENT;
if((lastValue & PRIMARYLOWZEROMASK) == 0) {
lastValue += PRIMARYORDERINCREMENT;
lastValue += PRIMARYORDERINCREMENT;
}
lastValue &= PRIMARYORDERMASK;
lastValue |= RESETSECONDARYTERTIARY; // Start all values from 02
isOverIgnore = TRUE;
break;
@ -2170,6 +2186,7 @@ RuleBasedCollator::increment(Collator::ECollationStrength aStrength, int32_t las
// increment secondary order and mask off tertiary difference
lastValue += SECONDARYORDERINCREMENT;
lastValue &= SECONDARYDIFFERENCEONLY;
lastValue |= RESETTERTIARY; // Start all values from 02
// record max # of ignorable chars with secondary difference
if (isOverIgnore == FALSE)

View file

@ -56,7 +56,6 @@ struct collIterate {
#define UCOL_UNMAPPEDCHARVALUE 0x7fff0000 // from coleiterator
#define UCOL_LEVELTERMINATOR 1
// NOTE(review): UCOL_IGNORABLE is defined again further down in this file as 0x02020202.
// The hunk header here (-56,7 +56,6) drops one line, so presumably this 0x0000 definition
// is the one being removed -- confirm before relying on either value.
#define UCOL_IGNORABLE 0x0000
#define UCOL_CHARINDEX 0x70000000 // need look up in .commit()
#define UCOL_EXPANDCHARINDEX 0x7E000000 // Expand index follows
#define UCOL_CONTRACTCHARINDEX 0x7F000000 // contract indexes follows
@ -77,6 +76,18 @@ struct collIterate {
// Constants and field-extraction macros for 32-bit collation elements.
// Per the change description, elements are generated so that none of their bytes is
// 0x00 or 0x01, which is why the ignorable markers below are 0x02-based.
#define UCOL_SORTKEYOFFSET 2 // minimum sort key offset
#define UCOL_CONTRACTCHAROVERFLOW 0x7FFFFFFF // Indicates the char is a contract char
#define UCOL_COLELEMENTSTART 0x02020202 // starting value for collation elements
#define UCOL_PRIMARYLOWZEROMASK 0x00FF0000 // testing mask for primary low element
#define UCOL_RESETSECONDARYTERTIARY 0x00000202// resetting value for secondaries and tertiaries
#define UCOL_RESETTERTIARY 0x00000002 // resetting value for tertiaries
// Whole collation element value that the comparison loops treat as entirely ignorable.
#define UCOL_IGNORABLE 0x02020202
// Value compared against UCOL_PRIMARYORDER() results to detect an ignorable primary.
#define UCOL_PRIMIGNORABLE 0x0202
// Value compared against UCOL_SECONDARYORDER() results to detect an ignorable secondary.
#define UCOL_SECIGNORABLE 0x02
// Value compared against the tertiary field to detect an ignorable tertiary.
#define UCOL_TERIGNORABLE 0x02
// Extract the primary field: mask with UCOL_PRIMARYORDERMASK, then shift right by UCOL_PRIMARYORDERSHIFT.
#define UCOL_PRIMARYORDER(order) (((order) & UCOL_PRIMARYORDERMASK)>> UCOL_PRIMARYORDERSHIFT)
// Extract the secondary field: mask with UCOL_SECONDARYORDERMASK, then shift right by UCOL_SECONDARYORDERSHIFT.
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
// Extract the tertiary field: mask with UCOL_TERTIARYORDERMASK (no shift).
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
@ -723,7 +734,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
pTOrder = UCOL_PRIMARYORDER(tOrder);
if (pSOrder != pTOrder)
{
if (sOrder == 0)
if (sOrder == UCOL_IGNORABLE)
{
// The entire source element is ignorable.
// Skip to the next source element, but don't fetch another target element.
@ -731,7 +742,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
continue;
}
if (tOrder == 0)
if (tOrder == UCOL_IGNORABLE)
{
gets = FALSE;
continue;
@ -739,7 +750,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
// The source and target elements aren't ignorable, but it's still possible
// for the primary component of one of the elements to be ignorable....
if (pSOrder == 0) // primary order in source is ignorable
if (pSOrder == UCOL_PRIMIGNORABLE) // primary order in source is ignorable
{
// The source's primary is ignorable, but the target's isn't. We treat ignorables
// as a secondary difference, so remember that we found one.
@ -751,7 +762,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
// Skip to the next source element, but don't fetch another target element.
gett = FALSE;
}
else if (pTOrder == 0)
else if (pTOrder == UCOL_PRIMIGNORABLE)
{
// record differences - see the comment above.
if (checkSecTer)
@ -823,14 +834,14 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
// The source string has more elements, but the target string hasn't.
do
{
if (UCOL_PRIMARYORDER(sOrder) != 0)
if (UCOL_PRIMARYORDER(sOrder) != UCOL_PRIMIGNORABLE)
{
// We found an additional non-ignorable base character in the source string.
// This is a primary difference, so the source is greater
return UCOL_GREATER; // (strength is PRIMARY)
}
if (UCOL_SECONDARYORDER(sOrder) != 0)
if (UCOL_SECONDARYORDER(sOrder) != UCOL_SECIGNORABLE)
{
// Additional secondary elements mean the source string is greater
if (checkSecTer)
@ -849,14 +860,14 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
// The target string has more elements, but the source string hasn't.
do
{
if (UCOL_PRIMARYORDER(tOrder) != 0)
if (UCOL_PRIMARYORDER(tOrder) != UCOL_PRIMIGNORABLE)
{
// We found an additional non-ignorable base character in the target string.
// This is a primary difference, so the source is less
return UCOL_LESS; // (strength is PRIMARY)
}
if (UCOL_SECONDARYORDER(tOrder) != 0)
if (UCOL_SECONDARYORDER(tOrder) != UCOL_SECIGNORABLE)
{
// Additional secondary elements in the target mean the source string is less
if (checkSecTer)
@ -925,7 +936,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
pTOrder = UCOL_PRIMARYORDER(tOrder);
if (pSOrder != pTOrder)
{
if (sOrder == 0)
if (sOrder == UCOL_IGNORABLE)
{
// The entire source element is ignorable.
// Skip to the next source element, but don't fetch another target element.
@ -933,7 +944,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
continue;
}
if (tOrder == 0)
if (tOrder == UCOL_IGNORABLE)
{
gets = FALSE;
continue;
@ -941,7 +952,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
// The source and target elements aren't ignorable, but it's still possible
// for the primary component of one of the elements to be ignorable....
if (pSOrder == 0) // primary order in source is ignorable
if (pSOrder == UCOL_PRIMIGNORABLE) // primary order in source is ignorable
{
// The source's primary is ignorable, but the target's isn't. We treat ignorables
// as a secondary difference, so remember that we found one.
@ -952,7 +963,7 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
// Skip to the next source element, but don't fetch another target element.
gett = FALSE;
}
else if (pTOrder == 0)
else if (pTOrder == UCOL_PRIMIGNORABLE)
{
// record differences - see the comment above.
if (checkSecTer)
@ -1024,14 +1035,14 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
// The source string has more elements, but the target string hasn't.
do
{
if (UCOL_PRIMARYORDER(sOrder) != 0)
if (UCOL_PRIMARYORDER(sOrder) != UCOL_PRIMIGNORABLE)
{
// We found an additional non-ignorable base character in the source string.
// This is a primary difference, so the source is greater
return UCOL_GREATER; // (strength is PRIMARY)
}
if (UCOL_SECONDARYORDER(sOrder) != 0)
if (UCOL_SECONDARYORDER(sOrder) != UCOL_SECIGNORABLE)
{
// Additional secondary elements mean the source string is greater
if (checkSecTer)
@ -1053,14 +1064,14 @@ U_CAPI UCollationResult ucol_strcollinc(const UCollator *coll,
// The target string has more elements, but the source string hasn't.
do
{
if (UCOL_PRIMARYORDER(tOrder) != 0)
if (UCOL_PRIMARYORDER(tOrder) != UCOL_PRIMIGNORABLE)
{
// We found an additional non-ignorable base character in the target string.
// This is a primary difference, so the source is less
return UCOL_LESS; // (strength is PRIMARY)
}
if (UCOL_SECONDARYORDER(tOrder) != 0)
if (UCOL_SECONDARYORDER(tOrder) != UCOL_SECIGNORABLE)
{
// Additional secondary elements in the target mean the source string is less
if (checkSecTer)
@ -1249,7 +1260,7 @@ ucol_strcoll( const UCollator *coll,
pTOrder = UCOL_PRIMARYORDER(tOrder);
if (pSOrder != pTOrder)
{
if (sOrder == 0)
if (sOrder == UCOL_IGNORABLE)
{
// The entire source element is ignorable.
// Skip to the next source element, but don't fetch another target element.
@ -1257,7 +1268,7 @@ ucol_strcoll( const UCollator *coll,
continue;
}
if (tOrder == 0)
if (tOrder == UCOL_IGNORABLE)
{
gets = FALSE;
continue;
@ -1265,7 +1276,7 @@ ucol_strcoll( const UCollator *coll,
// The source and target elements aren't ignorable, but it's still possible
// for the primary component of one of the elements to be ignorable....
if (pSOrder == 0) // primary order in source is ignorable
if (pSOrder == UCOL_PRIMIGNORABLE) // primary order in source is ignorable
{
// The source's primary is ignorable, but the target's isn't. We treat ignorables
// as a secondary difference, so remember that we found one.
@ -1277,7 +1288,7 @@ ucol_strcoll( const UCollator *coll,
// Skip to the next source element, but don't fetch another target element.
gett = FALSE;
}
else if (pTOrder == 0)
else if (pTOrder == UCOL_PRIMIGNORABLE)
{
// record differences - see the comment above.
if (checkSecTer)
@ -1349,14 +1360,14 @@ ucol_strcoll( const UCollator *coll,
// The source string has more elements, but the target string hasn't.
do
{
if (UCOL_PRIMARYORDER(sOrder) != 0)
if (UCOL_PRIMARYORDER(sOrder) != UCOL_PRIMIGNORABLE)
{
// We found an additional non-ignorable base character in the source string.
// This is a primary difference, so the source is greater
return UCOL_GREATER; // (strength is PRIMARY)
}
if (UCOL_SECONDARYORDER(sOrder) != 0)
if (UCOL_SECONDARYORDER(sOrder) != UCOL_SECIGNORABLE)
{
// Additional secondary elements mean the source string is greater
if (checkSecTer)
@ -1375,14 +1386,14 @@ ucol_strcoll( const UCollator *coll,
// The target string has more elements, but the source string hasn't.
do
{
if (UCOL_PRIMARYORDER(tOrder) != 0)
if (UCOL_PRIMARYORDER(tOrder) != UCOL_PRIMIGNORABLE)
{
// We found an additional non-ignorable base character in the target string.
// This is a primary difference, so the source is less
return UCOL_LESS; // (strength is PRIMARY)
}
if (UCOL_SECONDARYORDER(tOrder) != 0)
if (UCOL_SECONDARYORDER(tOrder) != UCOL_SECIGNORABLE)
{
// Additional secondary elements in the target mean the source string is less
if (checkSecTer)
@ -1451,7 +1462,7 @@ ucol_strcoll( const UCollator *coll,
pTOrder = UCOL_PRIMARYORDER(tOrder);
if (pSOrder != pTOrder)
{
if (sOrder == 0)
if (sOrder == UCOL_IGNORABLE)
{
// The entire source element is ignorable.
// Skip to the next source element, but don't fetch another target element.
@ -1459,7 +1470,7 @@ ucol_strcoll( const UCollator *coll,
continue;
}
if (tOrder == 0)
if (tOrder == UCOL_IGNORABLE)
{
gets = FALSE;
continue;
@ -1467,7 +1478,7 @@ ucol_strcoll( const UCollator *coll,
// The source and target elements aren't ignorable, but it's still possible
// for the primary component of one of the elements to be ignorable....
if (pSOrder == 0) // primary order in source is ignorable
if (pSOrder == UCOL_PRIMIGNORABLE) // primary order in source is ignorable
{
// The source's primary is ignorable, but the target's isn't. We treat ignorables
// as a secondary difference, so remember that we found one.
@ -1478,7 +1489,7 @@ ucol_strcoll( const UCollator *coll,
// Skip to the next source element, but don't fetch another target element.
gett = FALSE;
}
else if (pTOrder == 0)
else if (pTOrder == UCOL_PRIMIGNORABLE)
{
// record differences - see the comment above.
if (checkSecTer)
@ -1550,14 +1561,14 @@ ucol_strcoll( const UCollator *coll,
// The source string has more elements, but the target string hasn't.
do
{
if (UCOL_PRIMARYORDER(sOrder) != 0)
if (UCOL_PRIMARYORDER(sOrder) != UCOL_PRIMIGNORABLE)
{
// We found an additional non-ignorable base character in the source string.
// This is a primary difference, so the source is greater
return UCOL_GREATER; // (strength is PRIMARY)
}
if (UCOL_SECONDARYORDER(sOrder) != 0)
if (UCOL_SECONDARYORDER(sOrder) != UCOL_SECIGNORABLE)
{
// Additional secondary elements mean the source string is greater
if (checkSecTer)
@ -1579,14 +1590,14 @@ ucol_strcoll( const UCollator *coll,
// The target string has more elements, but the source string hasn't.
do
{
if (UCOL_PRIMARYORDER(tOrder) != 0)
if (UCOL_PRIMARYORDER(tOrder) != UCOL_PRIMIGNORABLE)
{
// We found an additional non-ignorable base character in the target string.
// This is a primary difference, so the source is less
return UCOL_LESS; // (strength is PRIMARY)
}
if (UCOL_SECONDARYORDER(tOrder) != 0)
if (UCOL_SECONDARYORDER(tOrder) != UCOL_SECIGNORABLE)
{
// Additional secondary elements in the target mean the source string is less
if (checkSecTer)
@ -1706,7 +1717,7 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
secondary = ((order & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT);
tertiary = (order & UCOL_TERTIARYORDERMASK);
if(primary != UCOL_IGNORABLE) {
if(primary != UCOL_PRIMIGNORABLE) {
currentSize += 2;
if(compareSec) {
currentSize++;
@ -1839,30 +1850,30 @@ ucol_getSortKey(const UCollator *coll,
secondary = ((order & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT);
tertiary = (order & UCOL_TERTIARYORDERMASK);
if(primary != UCOL_IGNORABLE) {
*(primaries++) = (primary>>8)+UCOL_SORTKEYOFFSET;
*(primaries++) = (primary&0xFF)+UCOL_SORTKEYOFFSET;
if(primary != UCOL_PRIMIGNORABLE) {
*(primaries++) = (primary>>8);
*(primaries++) = (primary&0xFF);
sortKeySize += 2;
if(compareSec) {
*(secondaries++) = secondary+UCOL_SORTKEYOFFSET;
*(secondaries++) = secondary;
sortKeySize++;
}
if(compareTer) {
*(tertiaries++) = tertiary+UCOL_SORTKEYOFFSET;
*(tertiaries++) = tertiary;
sortKeySize++;
}
} else if(secondary != 0) {
} else if(secondary != UCOL_SECIGNORABLE) {
if(compareSec) {
*(secondaries++) = secondary+UCOL_SORTKEYOFFSET;
*(secondaries++) = secondary;
sortKeySize++;
}
if(compareTer) {
*(tertiaries++) = tertiary+UCOL_SORTKEYOFFSET;
*(tertiaries++) = tertiary;
sortKeySize++;
}
} else if(tertiary != 0) {
} else if(tertiary != UCOL_TERIGNORABLE) {
if(compareTer) {
*(tertiaries++) = tertiary+UCOL_SORTKEYOFFSET;
*(tertiaries++) = tertiary;
sortKeySize++;
}
}

View file

@ -385,7 +385,7 @@ CollationElementIterator::getMaxExpansion(int32_t order) const
// Reports whether the primary order extracted from `order` marks the element as ignorable.
// @param order a collation element value; its primary part is obtained via primaryOrder().
// @return the result of comparing that primary part against the ignorable marker.
inline UBool
CollationElementIterator::isIgnorable(int32_t order)
{
// NOTE(review): two consecutive returns appear here. They look like the pre-change
// (== 0) and post-change (== RuleBasedCollator::PRIMIGNORABLE) lines of this hunk shown
// without +/- markers; as written only the first one is reachable -- confirm which applies.
return (primaryOrder(order) == 0);
return (primaryOrder(order) == RuleBasedCollator::PRIMIGNORABLE);
}
/**

View file

@ -978,6 +978,17 @@ private:
// Static constants of the collator; their values are supplied by the out-of-class
// definitions in the implementation file.
static const int32_t SORTKEYOFFSET;
static const int32_t CONTRACTCHAROVERFLOW;
// Starting value for collation elements.
static const int32_t COLELEMENTSTART;
// Mask used to test the low byte of the primary weight.
static const int32_t PRIMARYLOWZEROMASK;
// Values OR-ed in to restart secondary+tertiary, or tertiary only, weights.
static const int32_t RESETSECONDARYTERTIARY;
static const int32_t RESETTERTIARY;
// Markers for an ignorable whole element / primary / secondary / tertiary weight.
static const int32_t IGNORABLE;
static const int32_t PRIMIGNORABLE;
static const int32_t SECIGNORABLE;
static const int32_t TERIGNORABLE;
static const int16_t FILEID;
static UnicodeString DEFAULTRULES;