ICU-3184 CODAN java port

X-SVN-Rev: 12968
This commit is contained in:
Syn Wee Quek 2003-08-27 22:28:45 +00:00
parent 0b1267d260
commit c8a4b87a90
7 changed files with 697 additions and 22 deletions

View file

@ -20,8 +20,9 @@ package com.ibm.icu.dev.test.collator;
import com.ibm.icu.dev.test.*;
import com.ibm.icu.text.*;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.impl.ICULocaleData;
import java.util.Locale;
import java.util.ResourceBundle;
public class CollationMiscTest extends TestFmwk{
@ -30,6 +31,95 @@ public class CollationMiscTest extends TestFmwk{
// new CollationMiscTest().TestLocaleRuleBasedCollators();
}
private static final int NORM_BUFFER_TEST_LEN_ = 32;
// Record for one code point examined by TestComposeDecompose, together with
// the two strings that test hands to the collator for comparison.
private static final class Tester
{
// code point this entry was built from
int u;
// Normalizer NFC output for u; TestComposeDecompose overwrites it with the
// raw code point string when that string differs from the NFD form
String NFC;
// Normalizer NFD output for u
String NFD;
};
/**
 * Reports whether the locale data bundle for the given locale supplies a
 * "CollationElements" entry.
 * @param locale locale whose bundle is inspected
 * @return true if the bundle exists and yields a non-null
 *         "CollationElements" object; false if the bundle is missing, the
 *         entry is null, or the lookup throws
 */
private static final boolean hasCollationElements(Locale locale)
{
    ResourceBundle bundle = ICULocaleData.getLocaleElements(locale);
    if (bundle == null) {
        return false;
    }
    try {
        return bundle.getObject("CollationElements") != null;
    } catch (Exception e) {
        // best effort: any lookup failure counts as "no collation elements"
        return false;
    }
}
/**
 * Collects every code point below 0x30000 whose raw string, NFC form and
 * NFD form are not all identical, then checks that the collator reports
 * the composed and decomposed strings of each collected entry as equal.
 * The check runs first with the English collator and afterwards, at
 * IDENTICAL strength, with the collator of every available locale for
 * which hasCollationElements() returns true.
 */
public void TestComposeDecompose()
{
    final int limit = 0x30000;
    Tester cases[] = new Tester[limit];
    logln("Testing UCA extensively\n");
    RuleBasedCollator collator;
    try {
        collator = (RuleBasedCollator)Collator.getInstance(Locale.ENGLISH);
    }
    catch (Exception e) {
        errln("Error opening collator\n");
        return;
    }

    // gather the interesting code points
    int caseCount = 0;
    for (int cp = 0; cp < limit; cp ++) {
        String raw = UTF16.valueOf(cp);
        String nfc = Normalizer.normalize(cp, Normalizer.NFC);
        String nfd = Normalizer.normalize(cp, Normalizer.NFD);
        boolean rawMatchesNFD = raw.equals(nfd);
        if (rawMatchesNFD && nfc.equals(nfd)) {
            // nothing changes under normalization, not a test case
            continue;
        }
        Tester entry = new Tester();
        entry.u = cp;
        // when the raw string is not already the NFD form, it is the raw
        // string that gets compared against NFD
        entry.NFC = rawMatchesNFD ? nfc : raw;
        entry.NFD = nfd;
        cases[caseCount ++] = entry;
    }

    // first pass: English collator
    for (int k = 0; k < caseCount; k ++) {
        Tester entry = cases[k];
        if (collator.equals(entry.NFC, entry.NFD)) {
            continue;
        }
        errln("Failure: codePoint \\u" + Integer.toHexString(entry.u)
              + " fails TestComposeDecompose in the UCA");
        CollationTest.doTest(this, collator, entry.NFC, entry.NFD, 0);
    }

    // second pass: every locale that carries collation elements
    logln("Testing locales, number of cases = " + caseCount);
    Locale locales[] = Collator.getAvailableLocales();
    for (int i = 0; i < locales.length; i ++) {
        Locale locale = locales[i];
        if (!hasCollationElements(locale)) {
            continue;
        }
        logln("Testing locale " + locale.getDisplayName());
        collator = (RuleBasedCollator)Collator.getInstance(locale);
        collator.setStrength(Collator.IDENTICAL);
        for (int k = 0; k < caseCount; k ++) {
            Tester entry = cases[k];
            if (collator.equals(entry.NFC, entry.NFD)) {
                continue;
            }
            errln("Failure: codePoint \\u"
                  + Integer.toHexString(entry.u)
                  + " fails TestComposeDecompose for locale "
                  + locale.getDisplayName());
            // this tests for the iterators too
            CollationTest.doTest(this, collator, entry.NFC, entry.NFD, 0);
        }
    }
}
public void TestRuleOptions() {
// values here are hardcoded and are correct for the current UCA; when
// the UCA changes, one might be forced to change these values.
@ -426,6 +516,9 @@ public class CollationMiscTest extends TestFmwk{
coll.setAlternateHandlingShifted(((Boolean)values[i]
).booleanValue());
}
else if (attrs[i].equals("NumericCollation")) {
coll.setNumericCollation(((Boolean)values[i]).booleanValue());
}
}
genericOrderingTest(coll, s);
@ -1698,4 +1791,73 @@ public class CollationMiscTest extends TestFmwk{
CollationTest.doTest(this, collator, "a", "a ", 0); // inconsistent results
}
/**
 * Test for numeric collation (collate digits as numbers).
 * Each ordered string array is handed to genericLocaleStarterWithOptions
 * for the English locale with the "NumericCollation" attribute set to
 * true. Afterwards an English collator with numeric collation switched on
 * is used to check that strings differing only in the number of zeros
 * prepended to the digits compare as equal.
 */
public void TestNumericCollation()
{
    String basicTestStrings[] = {"hello1", "hello2", "hello123456"};
    String preZeroTestStrings[] = {"avery1",
                                   "avery01",
                                   "avery001",
                                   "avery0001"};
    // numbers that do not fit into 32 bits
    String thirtyTwoBitNumericStrings[] = {"avery42949672960",
                                           "avery42949672961",
                                           "avery42949672962",
                                           "avery429496729610"};
    // digits encoded as surrogate pairs
    String supplementaryDigits[] = {"\uD835\uDFCE", // 0
                                    "\uD835\uDFCF", // 1
                                    "\uD835\uDFD0", // 2
                                    "\uD835\uDFD1", // 3
                                    "\uD835\uDFCF\uD835\uDFCE", // 10
                                    "\uD835\uDFCF\uD835\uDFCF", // 11
                                    "\uD835\uDFCF\uD835\uDFD0", // 12
                                    "\uD835\uDFD0\uD835\uDFCE", // 20
                                    "\uD835\uDFD0\uD835\uDFCF", // 21
                                    "\uD835\uDFD0\uD835\uDFD0"  // 22
                                    };
    // digits from the U+0660 block
    String foreignDigits[] = {"\u0661",
                              "\u0662",
                              "\u0663",
                              "\u0661\u0660",
                              "\u0661\u0662",
                              "\u0661\u0663",
                              "\u0662\u0660",
                              "\u0662\u0662",
                              "\u0662\u0663",
                              "\u0663\u0660",
                              "\u0663\u0662",
                              "\u0663\u0663"
                              };

    String att[] = {"NumericCollation"};
    // shared constant instead of allocating a fresh wrapper object
    Boolean val[] = {Boolean.TRUE};

    genericLocaleStarterWithOptions(Locale.ENGLISH, basicTestStrings, att,
                                    val);
    genericLocaleStarterWithOptions(Locale.ENGLISH,
                                    thirtyTwoBitNumericStrings, att, val);
    genericLocaleStarterWithOptions(Locale.ENGLISH, foreignDigits, att,
                                    val);
    genericLocaleStarterWithOptions(Locale.ENGLISH, supplementaryDigits,
                                    att, val);

    // Open our collator and set it up to do digits.
    RuleBasedCollator coll
        = (RuleBasedCollator)Collator.getInstance(Locale.ENGLISH);
    coll.setNumericCollation(true);

    // Testing that prepended zeroes still yield the correct collation
    // behavior.
    // We expect that every element in our strings array will be equal,
    // so every distinct pair is compared with an expected result of 0.
    for (int i = 0; i < preZeroTestStrings.length - 1; i ++) {
        for (int j = i + 1; j < preZeroTestStrings.length; j ++) {
            CollationTest.doTest(this, coll, preZeroTestStrings[i],
                                 preZeroTestStrings[j], 0);
        }
    }
}
}

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2e8932a08fe0f724eba842b654f00fd88ae65f631f3ef4a5896eeaba34200e7d
size 1138811
oid sha256:4c81d588b4c428cd79002dde1e44c7f8f5b9583f43bf85027aa6996f28615b78
size 1223293

View file

@ -652,6 +652,13 @@ public final class CollationElementIterator
* will cause this value to be reset to 0.
*/
int m_CEBufferSize_;
// The constants below carry no access modifier, so they are visible to the
// other classes of this package.
static final int CE_NOT_FOUND_ = 0xF0000000;
static final int CE_EXPANSION_TAG_ = 1;
static final int CE_CONTRACTION_TAG_ = 2;
/**
* Collate Digits As Numbers (CODAN) implementation.
* Tag value of the special collation element that marks a digit.
* CollationParsedRuleBuilder builds such an element by shifting this tag
* with RuleBasedCollator.CE_TAG_SHIFT_ and combining it with
* RuleBasedCollator.CE_SPECIAL_FLAG_.
*/
static final int CE_DIGIT_TAG_ = 13;
// package private methods ----------------------------------------------
@ -862,7 +869,7 @@ public final class CollationElementIterator
private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
// special ce values and tags -------------------------------------------
/*private*/ static final int CE_NOT_FOUND_ = 0xF0000000;
private static final int CE_EXPANSION_ = 0xF1000000;
private static final int CE_CONTRACTION_ = 0xF2000000;
private static final int CE_THAI_ = 0xF3000000;
@ -876,8 +883,6 @@ public final class CollationElementIterator
private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;
private static final int CE_NOT_FOUND_TAG_ = 0;
/*private*/ static final int CE_EXPANSION_TAG_ = 1;
/*private*/ static final int CE_CONTRACTION_TAG_ = 2;
private static final int CE_THAI_TAG_ = 3;
/**
* Charset processing, not yet implemented
@ -907,7 +912,8 @@ public final class CollationElementIterator
* space without affecting the performance (hopefully).
*/
private static final int CE_LONG_PRIMARY_TAG_ = 12;
private static final int CE_CE_TAGS_COUNT = 13;
private static final int CE_CE_TAGS_COUNT = 14;
private static final int CE_BYTE_COMMON_ = 0x05;
// end special ce values and tags ---------------------------------------
@ -2005,6 +2011,193 @@ public final class CollationElementIterator
}
return m_CEBuffer_[0];
}
/**
* Gets the next digit ce.
* When the collator has numeric collation switched on, the whole run of
* digits starting at cp is consumed from the source and converted into a
* sequence of generated collation elements that is left in m_CEBuffer_;
* the first of them is returned and m_CEBufferOffset_ is left pointing at
* the second. When numeric collation is off, the collation element stored
* in the collator's expansion table for ce is returned instead.
* @param collator current collator
* @param ce current collation element, carrying the digit tag
* @param cp current codepoint, the first digit of the run
* @return next digit ce
*/
private int nextDigit(RuleBasedCollator collator, int ce, int cp)
{
// We do a check to see if we want to collate digits as numbers;
// if so we generate a custom collation key. Otherwise we pull out
// the value stored in the expansion table.
if (collator.m_isNumericCollation_){
// base 100 value being assembled from a pair of digits
int collateVal = 0;
// buffer index of the first slot of a run of all-zero slots at the end
// of the number, 0 if the number does not currently end in such a run
int trailingZeroIndex = 0;
// false while only zero digits have been seen
boolean nonZeroValReached = false;
// I just need a temporary place to store my generated CEs.
// icu4c uses a unsigned byte array, i'll use a stringbuffer here
// to avoid dealing with the sign problems and array allocation.
// Set the initial string buffer length: slots 0 and 1 are filled in
// at the end (header and exponent), digit pairs start at slot 2.
// Note that setLength only truncates or pads, it does not reset
// characters that are already inside the new length.
m_utilStringBuffer_.setLength(3);
// We parse the source string until we hit a char that's NOT a
// digit.
// Use this u_charDigitValue. This might be slow because we have
// to handle surrogates...
int digVal = UCharacter.digit(cp);
// if we have arrived here, we have already processed possible
// supplementaries that triggered the digit tag -
// all supplementaries are marked in the UCA.
// We pad a zero in front of the first element anyways.
// This takes care of the (probably) most common case where
// people are sorting things followed by a single digit
int digIndx = 1;
for (;;) {
// Make sure we have enough space: each slot from index 2 on holds
// two digits, double the buffer when digIndx reaches its capacity.
if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
<< 1);
}
// Skipping over leading zeroes.
if (digVal != 0 || nonZeroValReached) {
if (digVal != 0 && !nonZeroValReached) {
nonZeroValReached = true;
}
// We parse the digit string into base 100 numbers
// (this fits into a byte).
// We only add to the buffer in twos: when digIndx is even
// the digit serves as the 'tens' digit, when digIndx is odd
// it is the 'ones' digit. We dump the parsed base 100 value
// (collateVal) into the buffer.
// We multiply each collateVal by 2 (to give us room)
// and add 6 (to avoid overlapping magic CE byte
// values). The last byte we subtract 1 to ensure it is
// less than all the other bytes.
if (digIndx % 2 == 1) {
collateVal += digVal;
// This removes trailing zeroes: remember where a run of
// zero-valued slots starts, forget it as soon as a
// non-zero slot follows.
if (collateVal == 0 && trailingZeroIndex == 0) {
trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
}
else if (trailingZeroIndex != 0) {
trailingZeroIndex = 0;
}
m_utilStringBuffer_.setCharAt(
((digIndx - 1) >>> 1) + 2,
(char)((collateVal << 1) + 6));
collateVal = 0;
}
else {
// We drop the collation value into the buffer so if
// we need to do a "front patch" we don't have to
// check to see if we're hitting the last element.
collateVal = digVal * 10;
m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
(char)((collateVal << 1) + 6));
}
digIndx ++;
}
// Get next character.
if (!isEnd()){
// remember the position so that a non-digit can be pushed back
backupInternalState(m_utilSpecialBackUp_);
char ch = nextChar();
int char32 = ch;
if (UTF16.isLeadSurrogate(ch)){
if (!isEnd()) {
char trail = nextChar();
if (UTF16.isTrailSurrogate(trail)) {
char32 = UCharacterProperty.getRawSupplementary(
ch, trail);
}
else {
// unpaired lead surrogate, un-read the following char
goBackOne();
}
}
}
digVal = UCharacter.digit(char32);
if (digVal == -1) {
// Resetting position to point to the next unprocessed
// char. We overshot it when doing our test/set for
// numbers.
updateInternalState(m_utilSpecialBackUp_);
break;
}
}
else {
break;
}
}
// the run consisted of zeros only: encode it as the single value 0
if (nonZeroValReached == false){
digIndx = 2;
m_utilStringBuffer_.setCharAt(2, (char)6);
}
// one past the last buffer slot that goes into the key
int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex
: (digIndx >>> 1) + 2;
if (digIndx % 2 != 0){
// We missed a value. Since digIndx isn't even, stuck too many
// values into the buffer (this is what we get for padding the
// first byte with a zero). "Front-patch" now by pushing all
// nybbles forward: each slot takes its own ones digit as the
// new tens digit and the tens digit of the following slot as
// the new ones digit.
// Doing it this way ensures that at least 50% of the time
// (statistically speaking) we'll only be doing a single pass
// and optimizes for strings with single digits. I'm just
// assuming that's the more common case.
for (int i = 2; i < endIndex; i ++){
m_utilStringBuffer_.setCharAt(i,
(char)((((((m_utilStringBuffer_.charAt(i) - 6) >>> 1)
% 10) * 10)
+ (((m_utilStringBuffer_.charAt(i + 1) - 6)
>>> 1) / 10) << 1) + 6));
}
-- digIndx;
}
// Subtract one off of the last byte.
m_utilStringBuffer_.setCharAt(endIndex - 1,
(char)(m_utilStringBuffer_.charAt(endIndex - 1) - 1));
// We want to skip over the first two slots in the buffer.
// The first slot is reserved for the header byte 0x1B.
// The second slot is for the sign/exponent byte:
// 0x80 + (decimalPos/2) & 7f.
m_utilStringBuffer_.setCharAt(0, (char)0x1B);
m_utilStringBuffer_.setCharAt(1,
(char)(0x80 + ((digIndx >>> 1) & 0x7F)));
// Now transfer the collation key to our collIterate struct.
// The total size for our collation key is endIndx bumped up to the
// next largest even value divided by two.
// The first CE carries header and exponent as primary weight plus
// common secondary and tertiary weights.
ce = (((m_utilStringBuffer_.charAt(0) << 8)
// Primary weight
| m_utilStringBuffer_.charAt(1))
<< RuleBasedCollator.CE_PRIMARY_SHIFT_)
// Secondary weight
| (RuleBasedCollator.BYTE_COMMON_
<< RuleBasedCollator.CE_SECONDARY_SHIFT_)
| RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
int i = 2; // Reset the index into the buffer.
m_CEBuffer_[0] = ce;
m_CEBufferSize_ = 1;
// the first CE is returned directly, iteration continues with the second
m_CEBufferOffset_ = 1;
// every following CE is a continuation holding up to two digit-pair
// slots as its primary weight
// NOTE(review): no capacity check on m_CEBuffer_ is visible here -
// confirm it cannot overflow for very long digit runs
while (i < endIndex)
{
int primWeight = m_utilStringBuffer_.charAt(i ++) << 8;
if (i < endIndex) {
primWeight |= m_utilStringBuffer_.charAt(i ++);
}
m_CEBuffer_[m_CEBufferSize_ ++]
= (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
}
return ce;
}
// no numeric mode, we'll just switch to whatever we stashed and
// continue
// find the offset to expansion table
return collator.m_expansion_[getExpansionOffset(collator, ce)];
}
/**
* Gets the next implicit ce for codepoints
@ -2157,6 +2350,9 @@ public final class CollationElementIterator
return nextLongPrimary(ce);
case CE_EXPANSION_TAG_:
return nextExpansion(collator, ce);
case CE_DIGIT_TAG_:
ce = nextDigit(collator, ce, codepoint);
break;
// various implicits optimization
case CE_CJK_IMPLICIT_TAG_:
// 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
@ -2180,7 +2376,8 @@ public final class CollationElementIterator
break;
}
}
} finally {
}
finally {
m_utilSpecialEntryBackUp_ = entrybackup;
}
return ce;
@ -2469,6 +2666,185 @@ public final class CollationElementIterator
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
return m_CEBuffer_[m_CEBufferOffset_];
}
/**
* Getting the digit collation elements while iterating backwards.
* When the collator has numeric collation switched on, the run of digits
* ending at ch is consumed backwards from the source and converted into a
* sequence of generated collation elements that is left in m_CEBuffer_;
* the last element of that sequence is returned and m_CEBufferOffset_ is
* left pointing at it. When numeric collation is off, the collation
* element stored in the collator's expansion table for ce is returned.
* @param collator current collator
* @param ce current collation element, carrying the digit tag
* @param ch current code unit, the last digit of the run (or the trail
*           surrogate of a supplementary digit)
* @return digit collation element
*/
private int previousDigit(RuleBasedCollator collator, int ce, char ch)
{
// We do a check to see if we want to collate digits as numbers; if so we generate
// a custom collation key. Otherwise we pull out the value stored in the expansion table.
if (collator.m_isNumericCollation_){
// buffer index of the first slot of a run of all-zero slots at the
// front of the number, 0 if there currently is no such run
int leadingZeroIndex = 0;
// base 100 value being assembled from a pair of digits
int collateVal = 0;
// false while only zero digits have been seen
boolean nonZeroValReached = false;
// Set the initial string buffer length: slots 0 and 1 are filled in
// at the end (header and exponent), digit pairs start at slot 2.
// Note that setLength only truncates or pads, it does not reset
// characters that are already inside the new length.
m_utilStringBuffer_.setLength(3);
// We parse the source string until we hit a char that's NOT a digit
// Use this u_charDigitValue. This might be slow because we have to
// handle surrogates...
int char32 = ch;
if (UTF16.isTrailSurrogate(ch)) {
if (!isBackwardsStart()){
char lead = previousChar();
if (UTF16.isLeadSurrogate(lead)) {
char32 = UCharacterProperty.getRawSupplementary(lead,
ch);
}
else {
// unpaired trail surrogate, un-read the preceding char
goForwardOne();
}
}
}
int digVal = UCharacter.digit(char32);
int digIndx = 0;
for (;;) {
// Make sure we have enough space: each slot from index 2 on holds
// two digits, double the buffer when digIndx reaches its capacity.
if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
<< 1);
}
// Skipping over "trailing" zeroes but we still add to digIndx.
// NOTE(review): when the run ends in two or more zero digits, slot 2
// does not appear to be written by this loop although it is
// decremented after the loop - confirm it holds a defined value then.
if (digVal != 0 || nonZeroValReached) {
if (digVal != 0 && !nonZeroValReached) {
nonZeroValReached = true;
}
// We parse the digit string into base 100 numbers (this
// fits into a byte).
// We only add to the buffer in twos. We dump the parsed base
// 100 value (collateVal) into the buffer. We multiply each
// collateVal by 2 (to give us room) and add 6 (to avoid
// overlapping magic CE byte values). The last byte we
// subtract 1 to ensure it is less than all the other bytes.
// Since we're doing in this reverse we want to put the
// first digit encountered into the ones place and the
// second digit encountered into the tens place.
if (digIndx % 2 == 1){
collateVal += digVal * 10;
// This removes leading zeroes: remember where a run of
// zero-valued slots starts, forget it as soon as a
// non-zero slot follows.
if (collateVal == 0 && leadingZeroIndex == 0) {
leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
}
else if (leadingZeroIndex != 0) {
leadingZeroIndex = 0;
}
m_utilStringBuffer_.setCharAt(((digIndx - 1) >>> 1) + 2,
(char)((collateVal << 1) + 6));
collateVal = 0;
}
else {
// ones digit, kept until its tens digit arrives
collateVal = digVal;
}
}
digIndx ++;
if (!isBackwardsStart()){
// remember the position so that a non-digit can be pushed back
backupInternalState(m_utilSpecialBackUp_);
ch = previousChar();
char32 = ch;
if (UTF16.isTrailSurrogate(ch)){
if (!isBackwardsStart()) {
char lead = previousChar();
if (UTF16.isLeadSurrogate(lead)) {
char32
= UCharacterProperty.getRawSupplementary(
lead, ch);
}
else {
// unpaired trail surrogate: go back to the saved position
updateInternalState(m_utilSpecialBackUp_);
}
}
}
digVal = UCharacter.digit(char32);
if (digVal == -1) {
// not a digit, restore the position saved before reading it
updateInternalState(m_utilSpecialBackUp_);
break;
}
}
else {
break;
}
}
// the run consisted of zeros only: encode it as the single value 0
if (nonZeroValReached == false) {
digIndx = 2;
m_utilStringBuffer_.setCharAt(2, (char)6);
}
// an odd digIndx means the last digit read has not been stored yet
if (digIndx % 2 != 0) {
if (collateVal == 0 && leadingZeroIndex == 0) {
// This removes the leading 0 in a odd number sequence of
// numbers e.g. avery001
leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
}
else {
// this is not a leading 0, we add it in
m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
(char)((collateVal << 1) + 6));
digIndx ++;
}
}
// one past the last buffer slot that goes into the key
int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex
: ((digIndx >>> 1) + 2) ;
digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros
// Subtract one off of the last byte.
// Really the first byte here, but it's reversed...
m_utilStringBuffer_.setCharAt(2,
(char)(m_utilStringBuffer_.charAt(2) - 1));
// We want to skip over the first two slots in the buffer.
// The first slot is reserved for the header byte 0x1B.
// The second slot is for the sign/exponent byte:
// 0x80 + (decimalPos/2) & 7f.
m_utilStringBuffer_.setCharAt(0, (char)0x1B);
m_utilStringBuffer_.setCharAt(1,
(char)(0x80 + ((digIndx >>> 1) & 0x7F)));
// Now transfer the collation key to our collIterate struct.
// The total size for our collation key is endIndx bumped up to the
// next largest even value divided by two.
// The first CE carries header and exponent as primary weight plus
// common secondary and tertiary weights.
m_CEBufferSize_ = 0;
m_CEBuffer_[m_CEBufferSize_ ++]
= (((m_utilStringBuffer_.charAt(0) << 8)
// Primary weight
| m_utilStringBuffer_.charAt(1))
<< RuleBasedCollator.CE_PRIMARY_SHIFT_)
// Secondary weight
| (RuleBasedCollator.BYTE_COMMON_
<< RuleBasedCollator.CE_SECONDARY_SHIFT_)
// Tertiary weight.
| RuleBasedCollator.BYTE_COMMON_;
int i = endIndex - 1; // Reset the index into the buffer.
// the slots were filled in reverse, so walk them from the top down;
// every following CE is a continuation holding up to two slots
// NOTE(review): no capacity check on m_CEBuffer_ is visible here -
// confirm it cannot overflow for very long digit runs
while (i >= 2) {
int primWeight = m_utilStringBuffer_.charAt(i --) << 8;
if (i >= 2) {
primWeight |= m_utilStringBuffer_.charAt(i --);
}
m_CEBuffer_[m_CEBufferSize_ ++]
= (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
}
// backwards iteration hands out the buffered CEs starting from the last
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
return m_CEBuffer_[m_CEBufferOffset_];
}
else {
// no numeric mode, use the CE stashed in the expansion table
return collator.m_expansion_[getExpansionOffset(collator, ce)];
}
}
/**
* Returns previous hangul ces
@ -2600,6 +2976,9 @@ public final class CollationElementIterator
return previousLongPrimary(ce);
case CE_EXPANSION_TAG_: // always returns
return previousExpansion(collator, ce);
case CE_DIGIT_TAG_:
ce = previousDigit(collator, ce, ch);
break;
case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
return previousHangul(collator, ch);
case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
@ -2728,4 +3107,45 @@ public final class CollationElementIterator
return m_source_.current();
}
}
/**
 * Steps one position backwards without normalizing, which makes it a
 * lighter operation than previousChar. A non-negative buffer offset is
 * decremented; otherwise the index of the source is moved back by one.
 * No boundary checks are performed, so the caller has to be sure that the
 * step stays inside the source limits.
 * Use only to undo a nextChar(), and never call it twice in a row without
 * a nextChar() in between.
 */
private void goBackOne()
{
    if (m_bufferOffset_ < 0) {
        // not in the buffer: step back in the source itself
        m_source_.setIndex(m_source_.getIndex() - 1);
        return;
    }
    m_bufferOffset_ --;
}
/**
 * Steps one position forwards without normalizing, which makes it a
 * lighter operation than nextChar. A non-negative buffer offset is
 * incremented; otherwise the index of the source is moved forward by one.
 * No boundary checks are performed, so the caller has to be sure that the
 * step stays inside the source limits.
 * Use only to undo a previousChar(), and never call it twice in a row
 * without a previousChar() in between.
 */
private void goForwardOne()
{
    if (m_bufferOffset_ >= 0) {
        // we are in the buffer (Thai pre-vowel reordering uses it too)
        m_bufferOffset_ ++;
        return;
    }
    // working on the source and not normalizing: fast path
    m_source_.setIndex(m_source_.getIndex() + 1);
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java,v $
* $Date: 2003/08/20 00:20:37 $
* $Revision: 1.23 $
* $Date: 2003/08/27 22:28:45 $
* $Revision: 1.24 $
*
*******************************************************************************
*/
@ -24,6 +24,7 @@ import com.ibm.icu.impl.TrieBuilder;
import com.ibm.icu.impl.IntTrieBuilder;
import com.ibm.icu.impl.TrieIterator;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.impl.NormalizerImpl;
@ -1865,6 +1866,7 @@ final class CollationParsedRuleBuilder
private int addAnElement(BuildTable t, Elements element)
{
Vector expansions = t.m_expansions_;
element.m_mapCE_ = 0;
if (element.m_CELength_ == 1) {
if (element.m_isThai_ == false) {
element.m_mapCE_ = element.m_CEs_[0];
@ -1941,6 +1943,41 @@ final class CollationParsedRuleBuilder
}
}
}
// We treat digits differently - they are "uber special" and should be
// processed differently if numeric collation is on.
int uniChar = 0;
if ((element.m_uchars_.length() == 2)
&& UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {
uniChar = UCharacterProperty.getRawSupplementary(
element.m_uchars_.charAt(0),
element.m_uchars_.charAt(1));
}
else if (element.m_uchars_.length() == 1) {
uniChar = element.m_uchars_.charAt(0);
}
// Here, we either have one normal CE OR mapCE is set. Therefore, we
// stuff only one element to the expansion buffer. When we encounter a
// digit and we don't do numeric collation, we will just pick the CE
// we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE
// && ucol_prv_getSpecialPrevCE). If we picked a special, further
// processing will occur. If it's a simple CE, we'll return due
// to how the loop is constructed.
if (uniChar != 0 && UCharacter.isDigit(uniChar)) {
// prepare the element
int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
| (CollationElementIterator.CE_DIGIT_TAG_
<< RuleBasedCollator.CE_TAG_SHIFT_) | 1;
if (element.m_mapCE_ != 0) {
// if there is an expansion, we'll pick it here
expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);
}
else {
expansion |= (addExpansion(expansions, element.m_CEs_[0]) << 4);
}
element.m_mapCE_ = expansion;
}
// here we want to add the prefix structure.
// I will try to process it as a reverse contraction, if possible.

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollatorReader.java,v $
* $Date: 2003/06/03 18:49:34 $
* $Revision: 1.13 $
* $Date: 2003/08/27 22:28:45 $
* $Revision: 1.14 $
*
*******************************************************************************
*/
@ -91,8 +91,8 @@ final class CollatorReader
*/
protected void readHeader(RuleBasedCollator rbc) throws IOException
{
int size = m_dataInputStream_.readInt();
// all the offsets are in bytes
int size = m_dataInputStream_.readInt();
// all the offsets are in bytes
// to get the address add to the header address and cast properly
// Default options int options
m_dataInputStream_.skip(4); // options
@ -166,7 +166,7 @@ final class CollatorReader
*/
protected void readOptions(RuleBasedCollator rbc) throws IOException
{
rbc.m_defaultVariableTopValue_ = m_dataInputStream_.readInt();
rbc.m_defaultVariableTopValue_ = m_dataInputStream_.readInt();
rbc.m_defaultIsFrenchCollation_ = (m_dataInputStream_.readInt()
== RuleBasedCollator.AttributeValue.ON_);
rbc.m_defaultIsAlternateHandlingShifted_
@ -186,6 +186,8 @@ final class CollatorReader
rbc.m_defaultStrength_ = m_dataInputStream_.readInt();
rbc.m_defaultIsHiragana4_ = (m_dataInputStream_.readInt()
== RuleBasedCollator.AttributeValue.ON_);
rbc.m_defaultIsNumericCollation_ = (m_dataInputStream_.readInt()
== RuleBasedCollator.AttributeValue.ON_);
m_dataInputStream_.skip(64); // reserved for future use
}
@ -206,7 +208,7 @@ final class CollatorReader
{
readHeader(rbc);
readOptions(rbc);
m_expansionSize_ >>= 2;
m_expansionSize_ >>= 2;
rbc.m_expansion_ = new int[m_expansionSize_];
for (int i = 0; i < m_expansionSize_; i ++) {
rbc.m_expansion_[i] = m_dataInputStream_.readInt();

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java,v $
* $Date: 2003/08/25 23:23:12 $
* $Revision: 1.44 $
* $Date: 2003/08/27 22:28:45 $
* $Revision: 1.45 $
*
*******************************************************************************
*/
@ -445,6 +445,20 @@ public final class RuleBasedCollator extends Collator
setStrength(m_defaultStrength_);
}
/**
 * Restores numeric collation to the default value of this Collator.
 * With numeric collation on, the Collator generates a collation key for
 * the numeric value of substrings of digits, so that for instance '100'
 * sorts AFTER '2'.
 * @see #getNumericCollation
 * @see #setNumericCollation
 * @draft ICU 2.8
 */
public void setNumericCollationDefault()
{
    final boolean defaultFlag = m_defaultIsNumericCollation_;
    setNumericCollation(defaultFlag);
}
/**
* Sets the mode for the direction of SECONDARY weights to be used in
* French collation.
@ -625,6 +639,21 @@ public final class RuleBasedCollator extends Collator
m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
}
/**
 * Switches numeric collation on or off.
 * With numeric collation on, the Collator generates a collation key for
 * the numeric value of substrings of digits, so that for instance '100'
 * sorts AFTER '2'.
 * @param flag true to turn numeric collation on and false to turn it off
 * @see #getNumericCollation
 * @see #setNumericCollationDefault
 * @draft ICU 2.8
 */
public void setNumericCollation(boolean flag)
{
    // when set, substrings of digits are sorted as numbers
    this.m_isNumericCollation_ = flag;
}
// public getters --------------------------------------------------------
/**
@ -863,6 +892,21 @@ public final class RuleBasedCollator extends Collator
return m_variableTopValue_ << 16;
}
/**
 * Tells whether numeric collation is currently switched on.
 * With numeric collation on, the Collator generates a collation key for
 * the numeric value of substrings of digits, so that for instance '100'
 * sorts AFTER '2'.
 * @see #setNumericCollation
 * @see #setNumericCollationDefault
 * @return true if numeric collation is turned on, false otherwise
 * @draft ICU 2.8
 */
public boolean getNumericCollation()
{
    return this.m_isNumericCollation_;
}
// public other methods -------------------------------------------------
/**
@ -1324,6 +1368,7 @@ public final class RuleBasedCollator extends Collator
boolean m_isJamoSpecial_;
// Collator options ------------------------------------------------------
int m_defaultVariableTopValue_;
boolean m_defaultIsFrenchCollation_;
boolean m_defaultIsAlternateHandlingShifted_;
@ -1332,6 +1377,8 @@ public final class RuleBasedCollator extends Collator
int m_defaultDecomposition_;
int m_defaultStrength_;
boolean m_defaultIsHiragana4_;
boolean m_defaultIsNumericCollation_;
/**
* Value of the variable top
*/
@ -1344,6 +1391,10 @@ public final class RuleBasedCollator extends Collator
* Case sorting customization
*/
int m_caseFirst_;
/**
* Numeric collation option
*/
boolean m_isNumericCollation_;
// end Collator options --------------------------------------------------
@ -1515,10 +1566,9 @@ public final class RuleBasedCollator extends Collator
Object elements = rb.getObject("CollationElements");
if (elements != null) {
Object[][] rules = (Object[][])elements;
m_rules_ = (String)rules[1][1];
// %%CollationBin
if(rules[0][1] instanceof byte[]){
m_rules_ = (String)rules[1][1];
byte map[] = (byte [])rules[0][1];
BufferedInputStream input =
new BufferedInputStream(
@ -1547,7 +1597,8 @@ public final class RuleBasedCollator extends Collator
// due to resource redirection ICUListResourceBundle does not
// raise missing resource error
//throw new MissingResourceException("Could not get resource for constructing RuleBasedCollator","com.ibm.icu.impl.data.LocaleElements_"+locale.toString(), "%%CollationBin");
init((String)rules[1][1]);
m_rules_ = (String)rules[0][1];
init(m_rules_);
return;
}
}
@ -1606,6 +1657,7 @@ public final class RuleBasedCollator extends Collator
m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
m_defaultStrength_ = UCA_.m_defaultStrength_;
m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
m_expansionOffset_ = UCA_.m_expansionOffset_;
m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
m_isCaseLevel_ = UCA_.m_isCaseLevel_;
@ -1621,6 +1673,7 @@ public final class RuleBasedCollator extends Collator
m_top3_ = UCA_.m_top3_;
m_topCount3_ = UCA_.m_topCount3_;
m_variableTopValue_ = UCA_.m_variableTopValue_;
m_isNumericCollation_ = UCA_.m_isNumericCollation_;
setWithUCATables();
latinOneFailed_ = false;
}
@ -1818,7 +1871,7 @@ public final class RuleBasedCollator extends Collator
* Minimum size required for the binary collation data in bytes.
* Size of UCA header + size of options to 4 bytes
*/
private static final int MIN_BINARY_DATA_SIZE_ = (42 + 24) << 2;
private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
/**
* If this collator is to generate only simple tertiaries for fast path
@ -3679,6 +3732,7 @@ public final class RuleBasedCollator extends Collator
m_isCaseLevel_ = m_defaultIsCaseLevel_;
m_caseFirst_ = m_defaultCaseFirst_;
m_isHiragana4_ = m_defaultIsHiragana4_;
m_isNumericCollation_ = m_defaultIsNumericCollation_;
latinOneFailed_ = false;
updateInternalState();
}