From f6fb60ff6f7863f66743079ee649fb17235881ce Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Mon, 25 Oct 2010 23:02:08 +0000 Subject: [PATCH] ICU-7264 fix weight allocation and align sort key constants with C++ X-SVN-Rev: 28908 --- .../icu/text/CollationParsedRuleBuilder.java | 19 ++--- .../com/ibm/icu/text/RuleBasedCollator.java | 73 ++++++++----------- .../collator/CollationCreationMethodTest.java | 20 ++--- .../dev/test/collator/CollationMiscTest.java | 3 +- 4 files changed, 48 insertions(+), 67 deletions(-) diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java index 1b79807aff3..14d8c23a974 100644 --- a/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java +++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/CollationParsedRuleBuilder.java @@ -727,17 +727,7 @@ final class CollationParsedRuleBuilder { * @return 0 if equals, 1 if this is > target, -1 otherwise */ public int compareTo(WeightRange target) { - if (this == target) { - return 0; - } - int tstart = target.m_start_; - if (m_start_ == tstart) { - return 0; - } - if (m_start_ > tstart) { - return 1; - } - return -1; + return Utility.compareUnsigned(m_start_, target.m_start_); } /** @@ -1584,9 +1574,12 @@ final class CollationParsedRuleBuilder { } } } - if (low == 0) { - low = 0x01000000; + if(0 <= low && low < 0x02000000) { // unsigned comparison < 0x02000000 + // We must not use CE weight byte 02, so we set it as the minimum lower bound. + // See http://site.icu-project.org/design/collation/bytes + low = 0x02000000; } + if (strength == Collator.SECONDARY) { // similar as simple if (Utility.compareUnsigned(low, RuleBasedCollator.COMMON_BOTTOM_2_ << 24) >= 0 diff --git a/icu4j/main/classes/collate/src/com/ibm/icu/text/RuleBasedCollator.java b/icu4j/main/classes/collate/src/com/ibm/icu/text/RuleBasedCollator.java index 98868e811aa..593e130a534 100644 --- a/icu4j/main/classes/collate/src/com/ibm/icu/text/RuleBasedCollator.java +++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/RuleBasedCollator.java @@ -1697,32 +1697,14 @@ public final class RuleBasedCollator extends Collator * Implicit generator */ static final ImplicitCEGenerator impCEGen_; -// /** -// * Implicit constants -// */ -// static final int IMPLICIT_BASE_BYTE_; -// static final int IMPLICIT_LIMIT_BYTE_; -// static final int IMPLICIT_4BYTE_BOUNDARY_; -// static final int LAST_MULTIPLIER_; -// static final int LAST2_MULTIPLIER_; -// static final int IMPLICIT_BASE_3BYTE_; -// static final int IMPLICIT_BASE_4BYTE_; -// static final int BYTES_TO_AVOID_ = 3; -// static final int OTHER_COUNT_ = 256 - BYTES_TO_AVOID_; -// static final int LAST_COUNT_ = OTHER_COUNT_ / 2; -// /** -// * Room for intervening, without expanding to 5 bytes -// */ -// static final int LAST_COUNT2_ = OTHER_COUNT_ / 21; -// static final int IMPLICIT_3BYTE_COUNT_ = 1; -// + static final byte SORT_LEVEL_TERMINATOR_ = 1; // These are values from UCA required for // implicit generation and supressing sort key compression // they should regularly be in the UCA, but if one // is running without UCA, it could be a problem - static final int maxRegularPrimary = 0xA0; + static final int maxRegularPrimary = 0x7A; static final int minImplicitPrimary = 0xE0; static final int maxImplicitPrimary = 0xE4; @@ -2108,9 +2090,10 @@ public final class RuleBasedCollator extends Collator private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03; /*private*/ static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_; //private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_; - static final byte CODAN_PLACEHOLDER = 0x27; - //private static final byte BYTE_LAST_LATIN_PRIMARY_ = (byte)0x4C; - private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x4D; + // TODO: Make the following values dynamic since they change with almost every UCA version. + static final byte CODAN_PLACEHOLDER = 0x12; + private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x5B; + private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF; private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1; private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80; @@ -2341,6 +2324,14 @@ public final class RuleBasedCollator extends Collator return 0; } + // Is this primary weight compressible? + // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit). + // TODO: This should use per-lead-byte flags from FractionalUCA.txt. + static boolean + isCompressible(int primary1) { + return BYTE_FIRST_NON_LATIN_PRIMARY_ <= primary1 && primary1 <= maxRegularPrimary; + } + /** * Gets the 2 bytes of primary order and adds it to the primary byte array * @param ce current ce @@ -2417,33 +2408,27 @@ public final class RuleBasedCollator extends Collator m_utilBytesCount1_ ++; leadPrimary = 0; } - else if (p1 < BYTE_FIRST_NON_LATIN_PRIMARY_ - || (p1 > maxRegularPrimary - //> (RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_[0] - // >>> 24) - && p1 < minImplicitPrimary - //< (RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_[0] - // >>> 24) - )) { - // not compressible - leadPrimary = 0; - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - (byte)p1); - m_utilBytesCount1_ ++; - m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, - (byte)p2); - m_utilBytesCount1_ ++; - } - else { // compress + else if (isCompressible(p1)) { + // compress leadPrimary = p1; m_utilBytes1_ = append(m_utilBytes1_, m_utilBytesCount1_, (byte)p1); m_utilBytesCount1_ ++; m_utilBytes1_ = append(m_utilBytes1_, - m_utilBytesCount1_, (byte)p2); + m_utilBytesCount1_, + (byte)p2); + m_utilBytesCount1_ ++; + } + else { + leadPrimary = 0; + m_utilBytes1_ = append(m_utilBytes1_, + m_utilBytesCount1_, + (byte)p1); + m_utilBytesCount1_ ++; + m_utilBytes1_ = append(m_utilBytes1_, + m_utilBytesCount1_, + (byte)p2); m_utilBytesCount1_ ++; } } diff --git a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationCreationMethodTest.java b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationCreationMethodTest.java index 5b72f7082ea..ae48275c75a 100644 --- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationCreationMethodTest.java +++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationCreationMethodTest.java @@ -53,17 +53,18 @@ public class CollationCreationMethodTest extends TestFmwk for(z = 0; z < 60; z++) { x = r.nextInt(locales.length); + Locale locale = locales[x]; try { //this is making the assumption that the only type of collator that will be made is RBC - localeCollator = (RuleBasedCollator)Collator.getInstance(locales[x]); + localeCollator = (RuleBasedCollator)Collator.getInstance(locale); + logln("Rules for " + locale + " are: " + localeCollator.getRules()); ruleCollator = new RuleBasedCollator(localeCollator.getRules()); - logln("Rules are: " + localeCollator.getRules()); } catch (Exception e) { - warnln("ERROR: in creation of collator of " + locales[x].getDisplayName() + " locale"); + warnln("ERROR: in creation of collator of locale " + locale.getDisplayName() + ": " + e); return; } @@ -77,7 +78,7 @@ public class CollationCreationMethodTest extends TestFmwk key1 = localeCollator.getCollationKey(randString1); key2 = ruleCollator.getCollationKey(randString1); - report(locales[x].getDisplayName(), randString1, key1, key2); + report(locale.getDisplayName(), randString1, key1, key2); } } } @@ -112,11 +113,12 @@ public class CollationCreationMethodTest extends TestFmwk { if (!k1.equals(k2)) { - String msg = ""; - msg += "With " + localeName + "Collator: "; - msg += string1; - msg += " failed to produce identical keys on both collators"; - errln(msg); + StringBuilder msg = new StringBuilder(); + msg.append("With ").append(localeName).append(" collator\n and input string: ").append(string1).append('\n'); + msg.append(" failed to produce identical keys on both collators\n"); + msg.append(" localeCollator key: ").append(CollationMiscTest.prettify(k1)).append('\n'); + msg.append(" ruleCollator key: ").append(CollationMiscTest.prettify(k2)).append('\n'); + errln(msg.toString()); } } } diff --git a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationMiscTest.java b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationMiscTest.java index f7ff479a88e..345275674e2 100644 --- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationMiscTest.java +++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationMiscTest.java @@ -213,6 +213,7 @@ public class CollationMiscTest extends TestFmwk { }; for (int i = 0; i< rules.length; i++) { + logln(String.format("rules[%d] = \"%s\"", i, rules[i])); genericRulesStarter(rules[i], data[i]); } } @@ -321,7 +322,7 @@ public class CollationMiscTest extends TestFmwk { return target; } - String prettify(CollationKey sourceKey) { + static String prettify(CollationKey sourceKey) { int i; byte[] bytes= sourceKey.toByteArray(); StringBuilder target = new StringBuilder("[");