mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-21 04:29:31 +00:00
ICU-3184 CODAN java port
X-SVN-Rev: 12968
This commit is contained in:
parent
0b1267d260
commit
c8a4b87a90
7 changed files with 697 additions and 22 deletions
|
@ -20,8 +20,9 @@ package com.ibm.icu.dev.test.collator;
|
|||
import com.ibm.icu.dev.test.*;
|
||||
import com.ibm.icu.text.*;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.impl.ICULocaleData;
|
||||
import java.util.Locale;
|
||||
|
||||
import java.util.ResourceBundle;
|
||||
|
||||
public class CollationMiscTest extends TestFmwk{
|
||||
|
||||
|
@ -30,6 +31,95 @@ public class CollationMiscTest extends TestFmwk{
|
|||
// new CollationMiscTest().TestLocaleRuleBasedCollators();
|
||||
}
|
||||
|
||||
// Length constant for normalization buffer tests.
// NOTE(review): not referenced in the visible portion of this class - confirm it is still needed.
private static final int NORM_BUFFER_TEST_LEN_ = 32;
|
||||
private static final class Tester
|
||||
{
|
||||
int u;
|
||||
String NFC;
|
||||
String NFD;
|
||||
};
|
||||
|
||||
private static final boolean hasCollationElements(Locale locale)
|
||||
{
|
||||
ResourceBundle rb = ICULocaleData.getLocaleElements(locale);
|
||||
if (rb != null) {
|
||||
try {
|
||||
Object elements = rb.getObject("CollationElements");
|
||||
if (elements != null) {
|
||||
return true;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public void TestComposeDecompose()
|
||||
{
|
||||
Tester t[] = new Tester[0x30000];
|
||||
t[0] = new Tester();
|
||||
logln("Testing UCA extensively\n");
|
||||
RuleBasedCollator coll;
|
||||
try {
|
||||
coll = (RuleBasedCollator)Collator.getInstance(Locale.ENGLISH);
|
||||
}
|
||||
catch (Exception e) {
|
||||
errln("Error opening collator\n");
|
||||
return;
|
||||
}
|
||||
|
||||
int noCases = 0;
|
||||
for (int u = 0; u < 0x30000; u ++) {
|
||||
String comp = UTF16.valueOf(u);
|
||||
int len = comp.length();
|
||||
t[noCases].NFC = Normalizer.normalize(u, Normalizer.NFC);
|
||||
t[noCases].NFD = Normalizer.normalize(u, Normalizer.NFD);
|
||||
|
||||
if (t[noCases].NFC.length() != t[noCases].NFD.length()
|
||||
|| (t[noCases].NFC.compareTo(t[noCases].NFD) != 0)
|
||||
|| (len != t[noCases].NFD.length())
|
||||
|| (comp.compareTo(t[noCases].NFD) != 0)) {
|
||||
t[noCases].u = u;
|
||||
if (len != t[noCases].NFD.length()
|
||||
|| (comp.compareTo(t[noCases].NFD) != 0)) {
|
||||
t[noCases].NFC = comp;
|
||||
}
|
||||
noCases ++;
|
||||
t[noCases] = new Tester();
|
||||
}
|
||||
}
|
||||
|
||||
for (int u = 0; u < noCases; u ++) {
|
||||
if (!coll.equals(t[u].NFC, t[u].NFD)) {
|
||||
errln("Failure: codePoint \\u" + Integer.toHexString(t[u].u)
|
||||
+ " fails TestComposeDecompose in the UCA");
|
||||
CollationTest.doTest(this, coll, t[u].NFC, t[u].NFD, 0);
|
||||
}
|
||||
}
|
||||
|
||||
logln("Testing locales, number of cases = " + noCases);
|
||||
Locale loc[] = Collator.getAvailableLocales();
|
||||
for (int i = 0; i < loc.length; i ++) {
|
||||
if (hasCollationElements(loc[i])) {
|
||||
logln("Testing locale " + loc[i].getDisplayName());
|
||||
coll = (RuleBasedCollator)Collator.getInstance(loc[i]);
|
||||
coll.setStrength(Collator.IDENTICAL);
|
||||
|
||||
for (int u = 0; u < noCases; u ++) {
|
||||
if (!coll.equals(t[u].NFC, t[u].NFD)) {
|
||||
errln("Failure: codePoint \\u"
|
||||
+ Integer.toHexString(t[u].u)
|
||||
+ " fails TestComposeDecompose for locale "
|
||||
+ loc[i].getDisplayName());
|
||||
// this tests for the iterators too
|
||||
CollationTest.doTest(this, coll, t[u].NFC, t[u].NFD,
|
||||
0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void TestRuleOptions() {
|
||||
// values here are hardcoded and are correct for the current UCA when
|
||||
// the UCA changes, one might be forced to change these values.
|
||||
|
@ -426,6 +516,9 @@ public class CollationMiscTest extends TestFmwk{
|
|||
coll.setAlternateHandlingShifted(((Boolean)values[i]
|
||||
).booleanValue());
|
||||
}
|
||||
else if (attrs[i].equals("NumericCollation")) {
|
||||
coll.setNumericCollation(((Boolean)values[i]).booleanValue());
|
||||
}
|
||||
}
|
||||
|
||||
genericOrderingTest(coll, s);
|
||||
|
@ -1698,4 +1791,73 @@ public class CollationMiscTest extends TestFmwk{
|
|||
CollationTest.doTest(this, collator, "a", "a ", 0); // inconsistent results
|
||||
}
|
||||
|
||||
/**
|
||||
* Test for CollationElementIterator previous and next for the whole set of
|
||||
* unicode characters with normalization on.
|
||||
*/
|
||||
public void TestNumericCollation()
|
||||
{
|
||||
String basicTestStrings[] = {"hello1", "hello2", "hello123456"};
|
||||
String preZeroTestStrings[] = {"avery1",
|
||||
"avery01",
|
||||
"avery001",
|
||||
"avery0001"};
|
||||
String thirtyTwoBitNumericStrings[] = {"avery42949672960",
|
||||
"avery42949672961",
|
||||
"avery42949672962",
|
||||
"avery429496729610"};
|
||||
|
||||
String supplementaryDigits[] = {"\uD835\uDFCE", // 0
|
||||
"\uD835\uDFCF", // 1
|
||||
"\uD835\uDFD0", // 2
|
||||
"\uD835\uDFD1", // 3
|
||||
"\uD835\uDFCF\uD835\uDFCE", // 10
|
||||
"\uD835\uDFCF\uD835\uDFCF", // 11
|
||||
"\uD835\uDFCF\uD835\uDFD0", // 12
|
||||
"\uD835\uDFD0\uD835\uDFCE", // 20
|
||||
"\uD835\uDFD0\uD835\uDFCF", // 21
|
||||
"\uD835\uDFD0\uD835\uDFD0" // 22
|
||||
};
|
||||
|
||||
String foreignDigits[] = {"\u0661",
|
||||
"\u0662",
|
||||
"\u0663",
|
||||
"\u0661\u0660",
|
||||
"\u0661\u0662",
|
||||
"\u0661\u0663",
|
||||
"\u0662\u0660",
|
||||
"\u0662\u0662",
|
||||
"\u0662\u0663",
|
||||
"\u0663\u0660",
|
||||
"\u0663\u0662",
|
||||
"\u0663\u0663"
|
||||
};
|
||||
|
||||
// Open our collator.
|
||||
RuleBasedCollator coll
|
||||
= (RuleBasedCollator)Collator.getInstance(Locale.ENGLISH);
|
||||
String att[] = {"NumericCollation"};
|
||||
Boolean val[] = {new Boolean(true)};
|
||||
genericLocaleStarterWithOptions(Locale.ENGLISH, basicTestStrings, att,
|
||||
val);
|
||||
genericLocaleStarterWithOptions(Locale.ENGLISH,
|
||||
thirtyTwoBitNumericStrings, att, val);
|
||||
genericLocaleStarterWithOptions(Locale.ENGLISH, foreignDigits, att,
|
||||
val);
|
||||
genericLocaleStarterWithOptions(Locale.ENGLISH, supplementaryDigits,
|
||||
att, val);
|
||||
|
||||
// Setting up our collator to do digits.
|
||||
coll.setNumericCollation(true);
|
||||
|
||||
// Testing that prepended zeroes still yield the correct collation
|
||||
// behavior.
|
||||
// We expect that every element in our strings array will be equal.
|
||||
for (int i = 0; i < preZeroTestStrings.length - 1; i ++) {
|
||||
for (int j = i + 1; j < preZeroTestStrings.length; j ++) {
|
||||
CollationTest.doTest(this, coll, preZeroTestStrings[i],
|
||||
preZeroTestStrings[j],0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:2e8932a08fe0f724eba842b654f00fd88ae65f631f3ef4a5896eeaba34200e7d
|
||||
size 1138811
|
||||
oid sha256:4c81d588b4c428cd79002dde1e44c7f8f5b9583f43bf85027aa6996f28615b78
|
||||
size 1223293
|
||||
|
|
Binary file not shown.
|
@ -652,6 +652,13 @@ public final class CollationElementIterator
|
|||
* will cause this value to be reset to 0.
|
||||
*/
|
||||
int m_CEBufferSize_;
|
||||
static final int CE_NOT_FOUND_ = 0xF0000000;
|
||||
static final int CE_EXPANSION_TAG_ = 1;
|
||||
static final int CE_CONTRACTION_TAG_ = 2;
|
||||
/**
|
||||
* Collate Digits As Numbers (CODAN) implementation
|
||||
*/
|
||||
static final int CE_DIGIT_TAG_ = 13;
|
||||
|
||||
// package private methods ----------------------------------------------
|
||||
|
||||
|
@ -862,7 +869,7 @@ public final class CollationElementIterator
|
|||
private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
|
||||
|
||||
// special ce values and tags -------------------------------------------
|
||||
/*private*/ static final int CE_NOT_FOUND_ = 0xF0000000;
|
||||
|
||||
private static final int CE_EXPANSION_ = 0xF1000000;
|
||||
private static final int CE_CONTRACTION_ = 0xF2000000;
|
||||
private static final int CE_THAI_ = 0xF3000000;
|
||||
|
@ -876,8 +883,6 @@ public final class CollationElementIterator
|
|||
private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;
|
||||
|
||||
private static final int CE_NOT_FOUND_TAG_ = 0;
|
||||
/*private*/ static final int CE_EXPANSION_TAG_ = 1;
|
||||
/*private*/ static final int CE_CONTRACTION_TAG_ = 2;
|
||||
private static final int CE_THAI_TAG_ = 3;
|
||||
/**
|
||||
* Charset processing, not yet implemented
|
||||
|
@ -907,7 +912,8 @@ public final class CollationElementIterator
|
|||
* space without affecting the performance (hopefully).
|
||||
*/
|
||||
private static final int CE_LONG_PRIMARY_TAG_ = 12;
|
||||
private static final int CE_CE_TAGS_COUNT = 13;
|
||||
|
||||
private static final int CE_CE_TAGS_COUNT = 14;
|
||||
private static final int CE_BYTE_COMMON_ = 0x05;
|
||||
|
||||
// end special ce values and tags ---------------------------------------
|
||||
|
@ -2005,6 +2011,193 @@ public final class CollationElementIterator
|
|||
}
|
||||
return m_CEBuffer_[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next digit ce.
* When collator.m_isNumericCollation_ is set, the run of digits that
* starts with cp is read from the source, packed two decimal digits per
* char into m_utilStringBuffer_, and turned into one header ce followed by
* continuation ces in m_CEBuffer_. Otherwise the ce stored in the
* collator's expansion table is returned.
|
||||
* @param collator current collator
|
||||
* @param ce current collation element
|
||||
* @param cp current codepoint
|
||||
* @return next digit ce
|
||||
*/
|
||||
private int nextDigit(RuleBasedCollator collator, int ce, int cp)
|
||||
{
|
||||
// We do a check to see if we want to collate digits as numbers;
|
||||
// if so we generate a custom collation key. Otherwise we pull out
|
||||
// the value stored in the expansion table.
|
||||
|
||||
if (collator.m_isNumericCollation_){
|
||||
int collateVal = 0;
|
||||
int trailingZeroIndex = 0;
|
||||
boolean nonZeroValReached = false;
|
||||
|
||||
// I just need a temporary place to store my generated CEs.
|
||||
// icu4c uses a unsigned byte array, i'll use a stringbuffer here
|
||||
// to avoid dealing with the sign problems and array allocation
|
||||
// clear and set initial string buffer length
|
||||
m_utilStringBuffer_.setLength(3);
|
||||
|
||||
// We parse the source string until we hit a char that's NOT a
|
||||
// digit.
|
||||
// Use this u_charDigitValue. This might be slow because we have
|
||||
// to handle surrogates...
|
||||
int digVal = UCharacter.digit(cp);
|
||||
// if we have arrived here, we have already processed possible
|
||||
// supplementaries that triggered the digit tag -
|
||||
// all supplementaries are marked in the UCA.
|
||||
// We pad a zero in front of the first element anyways.
|
||||
// This takes care of the (probably) most common case where
|
||||
// people are sorting things followed by a single digit
|
||||
int digIndx = 1;
|
||||
for (;;) {
|
||||
// Make sure we have enough space.
|
||||
if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
|
||||
m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
|
||||
<< 1);
|
||||
}
|
||||
// Skipping over leading zeroes.
|
||||
if (digVal != 0 || nonZeroValReached) {
|
||||
if (digVal != 0 && !nonZeroValReached) {
|
||||
nonZeroValReached = true;
|
||||
}
|
||||
// We parse the digit string into base 100 numbers
|
||||
// (this fits into a byte).
|
||||
// We only add to the buffer in twos, thus if we are
|
||||
// parsing an odd character, that serves as the
|
||||
// 'tens' digit while the if we are parsing an even
|
||||
// one, that is the 'ones' digit. We dumped the
|
||||
// parsed base 100 value (collateVal) into a buffer.
|
||||
// We multiply each collateVal by 2 (to give us room)
|
||||
// and add 6, as the code below does (to avoid overlapping
|
||||
// magic CE byte values). The last byte we subtract 1 to ensure it is
|
||||
// less than all the other bytes.
|
||||
// In this method an odd digIndx completes a pair (digVal is added as
// the ones digit) and an even digIndx starts one (digVal * 10).
|
||||
if (digIndx % 2 == 1) {
|
||||
collateVal += digVal;
|
||||
// This removes trailing zeroes.
|
||||
// NOTE(review): when collateVal is 0 and trailingZeroIndex is
// already set, the else-if below clears trailingZeroIndex -
// confirm this is intended for several consecutive zero pairs.
if (collateVal == 0 && trailingZeroIndex == 0) {
|
||||
trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
|
||||
}
|
||||
else if (trailingZeroIndex != 0) {
|
||||
trailingZeroIndex = 0;
|
||||
}
|
||||
m_utilStringBuffer_.setCharAt(
|
||||
((digIndx - 1) >>> 1) + 2,
|
||||
(char)((collateVal << 1) + 6));
|
||||
collateVal = 0;
|
||||
}
|
||||
else {
|
||||
// We drop the collation value into the buffer so if
|
||||
// we need to do a "front patch" we don't have to
|
||||
// check to see if we're hitting the last element.
|
||||
collateVal = digVal * 10;
|
||||
m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
|
||||
(char)((collateVal << 1) + 6));
|
||||
}
|
||||
digIndx ++;
|
||||
}
|
||||
|
||||
// Get next character.
|
||||
if (!isEnd()){
|
||||
// Remember the position so it can be restored if the next
// character turns out not to be a digit.
backupInternalState(m_utilSpecialBackUp_);
|
||||
char ch = nextChar();
|
||||
int char32 = ch;
|
||||
if (UTF16.isLeadSurrogate(ch)){
|
||||
if (!isEnd()) {
|
||||
char trail = nextChar();
|
||||
if (UTF16.isTrailSurrogate(trail)) {
|
||||
char32 = UCharacterProperty.getRawSupplementary(
|
||||
ch, trail);
|
||||
}
|
||||
else {
|
||||
// Unpaired lead surrogate: step back over the char just read.
goBackOne();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
digVal = UCharacter.digit(char32);
|
||||
if (digVal == -1) {
|
||||
// Resetting position to point to the next unprocessed
|
||||
// char. We overshot it when doing our test/set for
|
||||
// numbers.
|
||||
updateInternalState(m_utilSpecialBackUp_);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Only zeroes were seen: store a single value of 0 (encoded as 6).
if (nonZeroValReached == false){
|
||||
digIndx = 2;
|
||||
m_utilStringBuffer_.setCharAt(2, (char)6);
|
||||
}
|
||||
|
||||
int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex
|
||||
: (digIndx >>> 1) + 2;
|
||||
if (digIndx % 2 != 0){
|
||||
// We missed a value. Since digIndx isn't even, stuck too many
|
||||
// values into the buffer (this is what we get for padding the
|
||||
// first byte with a zero). "Front-patch" now by pushing all
|
||||
// nybbles forward.
|
||||
// Doing it this way ensures that at least 50% of the time
|
||||
// (statistically speaking) we'll only be doing a single pass
|
||||
// and optimizes for strings with single digits. I'm just
|
||||
// assuming that's the more common case.
|
||||
// Each slot becomes: ones digit of slot i as the new tens digit,
// plus tens digit of slot i + 1 as the new ones digit, re-encoded.
for (int i = 2; i < endIndex; i ++){
|
||||
m_utilStringBuffer_.setCharAt(i,
|
||||
(char)((((((m_utilStringBuffer_.charAt(i) - 6) >>> 1)
|
||||
% 10) * 10)
|
||||
+ (((m_utilStringBuffer_.charAt(i + 1) - 6)
|
||||
>>> 1) / 10) << 1) + 6));
|
||||
}
|
||||
-- digIndx;
|
||||
}
|
||||
|
||||
// Subtract one off of the last byte.
|
||||
m_utilStringBuffer_.setCharAt(endIndex - 1,
|
||||
(char)(m_utilStringBuffer_.charAt(endIndex - 1) - 1));
|
||||
|
||||
// We want to skip over the first two slots in the buffer.
|
||||
// The first slot is reserved for the header byte 0x1B.
|
||||
// The second slot is for the sign/exponent byte:
|
||||
// 0x80 + (decimalPos/2) & 7f.
|
||||
m_utilStringBuffer_.setCharAt(0, (char)0x1B);
|
||||
m_utilStringBuffer_.setCharAt(1,
|
||||
(char)(0x80 + ((digIndx >>> 1) & 0x7F)));
|
||||
|
||||
// Now transfer the collation key to our collIterate struct.
|
||||
// The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
|
||||
ce = (((m_utilStringBuffer_.charAt(0) << 8)
|
||||
// Primary weight
|
||||
| m_utilStringBuffer_.charAt(1))
|
||||
<< RuleBasedCollator.CE_PRIMARY_SHIFT_)
|
||||
// Secondary weight
|
||||
| (RuleBasedCollator.BYTE_COMMON_
|
||||
<< RuleBasedCollator.CE_SECONDARY_SHIFT_)
|
||||
| RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
|
||||
int i = 2; // Reset the index into the buffer.
|
||||
|
||||
// Slot 0 holds the header ce that is returned now; the offset is set
// to 1 so that the continuation ces stored below come next.
// NOTE(review): no capacity check on m_CEBuffer_ is visible here -
// confirm very long digit runs cannot overflow it.
m_CEBuffer_[0] = ce;
|
||||
m_CEBufferSize_ = 1;
|
||||
m_CEBufferOffset_ = 1;
|
||||
while (i < endIndex)
|
||||
{
|
||||
// Two buffer chars are packed into each continuation ce; the last
// ce carries only one when the count is odd.
int primWeight = m_utilStringBuffer_.charAt(i ++) << 8;
|
||||
if (i < endIndex) {
|
||||
primWeight |= m_utilStringBuffer_.charAt(i ++);
|
||||
}
|
||||
m_CEBuffer_[m_CEBufferSize_ ++]
|
||||
= (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
|
||||
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
|
||||
}
|
||||
return ce;
|
||||
}
|
||||
|
||||
// no numeric mode, we'll just switch to whatever we stashed and
|
||||
// continue
|
||||
// find the offset to expansion table
|
||||
return collator.m_expansion_[getExpansionOffset(collator, ce)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next implicit ce for codepoints
|
||||
|
@ -2157,6 +2350,9 @@ public final class CollationElementIterator
|
|||
return nextLongPrimary(ce);
|
||||
case CE_EXPANSION_TAG_:
|
||||
return nextExpansion(collator, ce);
|
||||
case CE_DIGIT_TAG_:
|
||||
ce = nextDigit(collator, ce, codepoint);
|
||||
break;
|
||||
// various implicits optimization
|
||||
case CE_CJK_IMPLICIT_TAG_:
|
||||
// 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
|
||||
|
@ -2180,7 +2376,8 @@ public final class CollationElementIterator
|
|||
break;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
}
|
||||
finally {
|
||||
m_utilSpecialEntryBackUp_ = entrybackup;
|
||||
}
|
||||
return ce;
|
||||
|
@ -2469,6 +2666,185 @@ public final class CollationElementIterator
|
|||
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
|
||||
return m_CEBuffer_[m_CEBufferOffset_];
|
||||
}
|
||||
|
||||
/**
|
||||
* Getting the digit collation elements while iterating backwards.
* When collator.m_isNumericCollation_ is set, the run of digits ending
* with ch is read backwards from the source, packed two decimal digits per
* char into m_utilStringBuffer_, and turned into a header ce plus
* continuation ces in m_CEBuffer_; the last ce in that buffer is
* returned. Otherwise the ce stored in the collator's expansion table is
* returned.
|
||||
* @param collator current collator
|
||||
* @param ce current collation element
|
||||
* @param ch current code point
|
||||
* @return digit collation element
|
||||
*/
|
||||
private int previousDigit(RuleBasedCollator collator, int ce, char ch)
|
||||
{
|
||||
// We do a check to see if we want to collate digits as numbers; if so we generate
|
||||
// a custom collation key. Otherwise we pull out the value stored in the expansion table.
|
||||
if (collator.m_isNumericCollation_){
|
||||
int leadingZeroIndex = 0;
|
||||
int collateVal = 0;
|
||||
boolean nonZeroValReached = false;
|
||||
|
||||
// clear and set initial string buffer length
|
||||
m_utilStringBuffer_.setLength(3);
|
||||
|
||||
// We parse the source string until we hit a char that's NOT a digit
|
||||
// Use this u_charDigitValue. This might be slow because we have to
|
||||
// handle surrogates...
|
||||
int char32 = ch;
|
||||
if (UTF16.isTrailSurrogate(ch)) {
|
||||
if (!isBackwardsStart()){
|
||||
char lead = previousChar();
|
||||
if (UTF16.isLeadSurrogate(lead)) {
|
||||
char32 = UCharacterProperty.getRawSupplementary(lead,
|
||||
ch);
|
||||
}
|
||||
else {
|
||||
// Unpaired trail surrogate: step forward over the char just read.
goForwardOne();
|
||||
}
|
||||
}
|
||||
}
|
||||
int digVal = UCharacter.digit(char32);
|
||||
int digIndx = 0;
|
||||
for (;;) {
|
||||
// Make sure we have enough space.
|
||||
if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
|
||||
m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
|
||||
<< 1);
|
||||
}
|
||||
// Skipping over "trailing" zeroes but we still add to digIndx.
|
||||
// NOTE(review): nothing is stored in the buffer for the zeroes that
// are skipped here even though digIndx advances - confirm the
// corresponding buffer slots are meant to stay unset.
if (digVal != 0 || nonZeroValReached) {
|
||||
if (digVal != 0 && !nonZeroValReached) {
|
||||
nonZeroValReached = true;
|
||||
}
|
||||
|
||||
// We parse the digit string into base 100 numbers (this
|
||||
// fits into a byte).
|
||||
// We only add to the buffer in twos, thus if we are
|
||||
// parsing an odd character, that serves as the 'tens'
|
||||
// digit while the if we are parsing an even one, that is
|
||||
// the 'ones' digit. We dumped the parsed base 100 value
|
||||
// (collateVal) into a buffer. We multiply each collateVal
|
||||
// by 2 (to give us room) and add 6, as the code below does
|
||||
// (to avoid overlapping magic CE byte values). The first
// stored byte has 1 subtracted further below.
|
||||
// Since we're doing in this reverse we want to put the
|
||||
// first digit encountered into the ones place and the
|
||||
// second digit encountered into the tens place.
|
||||
|
||||
if (digIndx % 2 == 1){
|
||||
collateVal += digVal * 10;
|
||||
|
||||
// This removes leading zeroes.
|
||||
// NOTE(review): when collateVal is 0 and leadingZeroIndex is
// already set, the else-if below clears leadingZeroIndex -
// confirm this is intended for several consecutive zero pairs.
if (collateVal == 0 && leadingZeroIndex == 0) {
|
||||
leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
|
||||
}
|
||||
else if (leadingZeroIndex != 0) {
|
||||
leadingZeroIndex = 0;
|
||||
}
|
||||
|
||||
m_utilStringBuffer_.setCharAt(((digIndx - 1) >>> 1) + 2,
|
||||
(char)((collateVal << 1) + 6));
|
||||
collateVal = 0;
|
||||
}
|
||||
else {
|
||||
// Even position: remember the ones digit until its tens digit arrives.
collateVal = digVal;
|
||||
}
|
||||
}
|
||||
digIndx ++;
|
||||
|
||||
if (!isBackwardsStart()){
|
||||
// Remember the position so it can be restored if the previous
// character turns out not to be a digit.
backupInternalState(m_utilSpecialBackUp_);
|
||||
ch = previousChar();
|
||||
char32 = ch;
|
||||
if (UTF16.isTrailSurrogate(ch)){
|
||||
if (!isBackwardsStart()) {
|
||||
char lead = previousChar();
|
||||
if (UTF16.isLeadSurrogate(lead)) {
|
||||
char32
|
||||
= UCharacterProperty.getRawSupplementary(
|
||||
lead, ch);
|
||||
}
|
||||
else {
|
||||
updateInternalState(m_utilSpecialBackUp_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
digVal = UCharacter.digit(char32);
|
||||
if (digVal == -1) {
|
||||
// Not a digit: restore the saved position and stop scanning.
updateInternalState(m_utilSpecialBackUp_);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Only zeroes were seen: store a single value of 0 (encoded as 6).
if (nonZeroValReached == false) {
|
||||
digIndx = 2;
|
||||
m_utilStringBuffer_.setCharAt(2, (char)6);
|
||||
}
|
||||
|
||||
// An odd digIndx means the last (most significant) digit is still
// pending in collateVal and has not been written to the buffer.
if (digIndx % 2 != 0) {
|
||||
if (collateVal == 0 && leadingZeroIndex == 0) {
|
||||
// This removes the leading 0 in a odd number sequence of
|
||||
// numbers e.g. avery001
|
||||
leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
|
||||
}
|
||||
else {
|
||||
// this is not a leading 0, we add it in
|
||||
m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
|
||||
(char)((collateVal << 1) + 6));
|
||||
digIndx ++;
|
||||
}
|
||||
}
|
||||
|
||||
int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex
|
||||
: ((digIndx >>> 1) + 2) ;
|
||||
digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros
|
||||
// Subtract one off of the last byte.
|
||||
// Really the first byte here, but it's reversed...
|
||||
m_utilStringBuffer_.setCharAt(2,
|
||||
(char)(m_utilStringBuffer_.charAt(2) - 1));
|
||||
// We want to skip over the first two slots in the buffer.
|
||||
// The first slot is reserved for the header byte 0x1B.
|
||||
// The second slot is for the sign/exponent byte:
|
||||
// 0x80 + (decimalPos/2) & 7f.
|
||||
m_utilStringBuffer_.setCharAt(0, (char)0x1B);
|
||||
m_utilStringBuffer_.setCharAt(1,
|
||||
(char)(0x80 + ((digIndx >>> 1) & 0x7F)));
|
||||
|
||||
// Now transfer the collation key to our collIterate struct.
|
||||
// The total size for our collation key is endIndx bumped up to the
|
||||
// next largest even value divided by two.
|
||||
// NOTE(review): no capacity check on m_CEBuffer_ is visible here -
// confirm very long digit runs cannot overflow it.
m_CEBufferSize_ = 0;
|
||||
m_CEBuffer_[m_CEBufferSize_ ++]
|
||||
= (((m_utilStringBuffer_.charAt(0) << 8)
|
||||
// Primary weight
|
||||
| m_utilStringBuffer_.charAt(1))
|
||||
<< RuleBasedCollator.CE_PRIMARY_SHIFT_)
|
||||
// Secondary weight
|
||||
| (RuleBasedCollator.BYTE_COMMON_
|
||||
<< RuleBasedCollator.CE_SECONDARY_SHIFT_)
|
||||
// Tertiary weight.
|
||||
| RuleBasedCollator.BYTE_COMMON_;
|
||||
int i = endIndex - 1; // Reset the index into the buffer.
|
||||
// The buffer was filled starting from the last digit of the number,
// so it is walked from the end down to slot 2.
while (i >= 2) {
|
||||
int primWeight = m_utilStringBuffer_.charAt(i --) << 8;
|
||||
if (i >= 2) {
|
||||
primWeight |= m_utilStringBuffer_.charAt(i --);
|
||||
}
|
||||
m_CEBuffer_[m_CEBufferSize_ ++]
|
||||
= (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
|
||||
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
|
||||
}
|
||||
// The offset is left on the last ce in the buffer, which is returned.
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
|
||||
return m_CEBuffer_[m_CEBufferOffset_];
|
||||
}
|
||||
else {
|
||||
// Numeric collation is off: use the ce stored in the expansion table.
return collator.m_expansion_[getExpansionOffset(collator, ce)];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns previous hangul ces
|
||||
|
@ -2600,6 +2976,9 @@ public final class CollationElementIterator
|
|||
return previousLongPrimary(ce);
|
||||
case CE_EXPANSION_TAG_: // always returns
|
||||
return previousExpansion(collator, ce);
|
||||
case CE_DIGIT_TAG_:
|
||||
ce = previousDigit(collator, ce, ch);
|
||||
break;
|
||||
case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
|
||||
return previousHangul(collator, ch);
|
||||
case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
|
||||
|
@ -2728,4 +3107,45 @@ public final class CollationElementIterator
|
|||
return m_source_.current();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves back 1 position in the source string. This is slightly less
|
||||
* complicated than previousChar in that it doesn't normalize while
|
||||
* moving back. Boundary checks are not performed.
|
||||
* This method is to be used with caution, with the assumption that
|
||||
* moving back one position will not exceed the source limits.
|
||||
* Use only with nextChar() and never call this API twice in a row without
|
||||
* nextChar() in the middle.
|
||||
*/
|
||||
private void goBackOne()
|
||||
{
|
||||
if (m_bufferOffset_ >= 0) {
|
||||
m_bufferOffset_ --;
|
||||
}
|
||||
else {
|
||||
m_source_.setIndex(m_source_.getIndex() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves forward 1 position in the source string. This is slightly less
|
||||
* complicated than nextChar in that it doesn't normalize while
|
||||
* moving back. Boundary checks are not performed.
|
||||
* This method is to be used with caution, with the assumption that
|
||||
* moving back one position will not exceed the source limits.
|
||||
* Use only with previousChar() and never call this API twice in a row
|
||||
* without previousChar() in the middle.
|
||||
*/
|
||||
private void goForwardOne()
|
||||
{
|
||||
if (m_bufferOffset_ < 0) {
|
||||
// we're working on the source and not normalizing. fast path.
|
||||
// note Thai pre-vowel reordering uses buffer too
|
||||
m_source_.setIndex(m_source_.getIndex() + 1);
|
||||
}
|
||||
else {
|
||||
// we are in the buffer, buffer offset will never be 0 here
|
||||
m_bufferOffset_ ++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java,v $
|
||||
* $Date: 2003/08/20 00:20:37 $
|
||||
* $Revision: 1.23 $
|
||||
* $Date: 2003/08/27 22:28:45 $
|
||||
* $Revision: 1.24 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -24,6 +24,7 @@ import com.ibm.icu.impl.TrieBuilder;
|
|||
import com.ibm.icu.impl.IntTrieBuilder;
|
||||
import com.ibm.icu.impl.TrieIterator;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UCharacterCategory;
|
||||
import com.ibm.icu.impl.NormalizerImpl;
|
||||
|
@ -1865,6 +1866,7 @@ final class CollationParsedRuleBuilder
|
|||
private int addAnElement(BuildTable t, Elements element)
|
||||
{
|
||||
Vector expansions = t.m_expansions_;
|
||||
element.m_mapCE_ = 0;
|
||||
if (element.m_CELength_ == 1) {
|
||||
if (element.m_isThai_ == false) {
|
||||
element.m_mapCE_ = element.m_CEs_[0];
|
||||
|
@ -1941,6 +1943,41 @@ final class CollationParsedRuleBuilder
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We treat digits differently - they are "uber special" and should be
|
||||
// processed differently if numeric collation is on.
|
||||
int uniChar = 0;
|
||||
if ((element.m_uchars_.length() == 2)
|
||||
&& UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {
|
||||
uniChar = UCharacterProperty.getRawSupplementary(
|
||||
element.m_uchars_.charAt(0),
|
||||
element.m_uchars_.charAt(1));
|
||||
}
|
||||
else if (element.m_uchars_.length() == 1) {
|
||||
uniChar = element.m_uchars_.charAt(0);
|
||||
}
|
||||
|
||||
// Here, we either have one normal CE OR mapCE is set. Therefore, we
|
||||
// stuff only one element to the expansion buffer. When we encounter a
|
||||
// digit and we don't do numeric collation, we will just pick the CE
|
||||
// we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE
|
||||
// && ucol_prv_getSpecialPrevCE). If we picked a special, further
|
||||
// processing will occur. If it's a simple CE, we'll return due
|
||||
// to how the loop is constructed.
|
||||
if (uniChar != 0 && UCharacter.isDigit(uniChar)) {
|
||||
// prepare the element
|
||||
int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_
|
||||
| (CollationElementIterator.CE_DIGIT_TAG_
|
||||
<< RuleBasedCollator.CE_TAG_SHIFT_) | 1;
|
||||
if (element.m_mapCE_ != 0) {
|
||||
// if there is an expansion, we'll pick it here
|
||||
expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);
|
||||
}
|
||||
else {
|
||||
expansion |= (addExpansion(expansions, element.m_CEs_[0]) << 4);
|
||||
}
|
||||
element.m_mapCE_ = expansion;
|
||||
}
|
||||
|
||||
// here we want to add the prefix structure.
|
||||
// I will try to process it as a reverse contraction, if possible.
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollatorReader.java,v $
|
||||
* $Date: 2003/06/03 18:49:34 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2003/08/27 22:28:45 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -91,8 +91,8 @@ final class CollatorReader
|
|||
*/
|
||||
protected void readHeader(RuleBasedCollator rbc) throws IOException
|
||||
{
|
||||
int size = m_dataInputStream_.readInt();
|
||||
// all the offsets are in bytes
|
||||
int size = m_dataInputStream_.readInt();
|
||||
// all the offsets are in bytes
|
||||
// to get the address add to the header address and cast properly
|
||||
// Default options int options
|
||||
m_dataInputStream_.skip(4); // options
|
||||
|
@ -166,7 +166,7 @@ final class CollatorReader
|
|||
*/
|
||||
protected void readOptions(RuleBasedCollator rbc) throws IOException
|
||||
{
|
||||
rbc.m_defaultVariableTopValue_ = m_dataInputStream_.readInt();
|
||||
rbc.m_defaultVariableTopValue_ = m_dataInputStream_.readInt();
|
||||
rbc.m_defaultIsFrenchCollation_ = (m_dataInputStream_.readInt()
|
||||
== RuleBasedCollator.AttributeValue.ON_);
|
||||
rbc.m_defaultIsAlternateHandlingShifted_
|
||||
|
@ -186,6 +186,8 @@ final class CollatorReader
|
|||
rbc.m_defaultStrength_ = m_dataInputStream_.readInt();
|
||||
rbc.m_defaultIsHiragana4_ = (m_dataInputStream_.readInt()
|
||||
== RuleBasedCollator.AttributeValue.ON_);
|
||||
rbc.m_defaultIsNumericCollation_ = (m_dataInputStream_.readInt()
|
||||
== RuleBasedCollator.AttributeValue.ON_);
|
||||
m_dataInputStream_.skip(64); // reserved for future use
|
||||
}
|
||||
|
||||
|
@ -206,7 +208,7 @@ final class CollatorReader
|
|||
{
|
||||
readHeader(rbc);
|
||||
readOptions(rbc);
|
||||
m_expansionSize_ >>= 2;
|
||||
m_expansionSize_ >>= 2;
|
||||
rbc.m_expansion_ = new int[m_expansionSize_];
|
||||
for (int i = 0; i < m_expansionSize_; i ++) {
|
||||
rbc.m_expansion_[i] = m_dataInputStream_.readInt();
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java,v $
|
||||
* $Date: 2003/08/25 23:23:12 $
|
||||
* $Revision: 1.44 $
|
||||
* $Date: 2003/08/27 22:28:45 $
|
||||
* $Revision: 1.45 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -445,6 +445,20 @@ public final class RuleBasedCollator extends Collator
|
|||
setStrength(m_defaultStrength_);
|
||||
}
|
||||
|
||||
/**
|
||||
* Method to set numeric collation to its default value.
|
||||
* When numeric collation is turned on, this Collator generates a collation
|
||||
* key for the numeric value of substrings of digits. This is a way to get
|
||||
* '100' to sort AFTER '2'
|
||||
* @see #getNumericCollation
|
||||
* @see #setNumericCollation
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public void setNumericCollationDefault()
|
||||
{
|
||||
setNumericCollation(m_defaultIsNumericCollation_);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the mode for the direction of SECONDARY weights to be used in
|
||||
* French collation.
|
||||
|
@ -625,6 +639,21 @@ public final class RuleBasedCollator extends Collator
|
|||
m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
|
||||
}
|
||||
|
||||
/**
|
||||
* Turns numeric collation on or off. When it is on, this Collator generates a collation
|
||||
* key for the numeric value of substrings of digits. This is a way to get
|
||||
* '100' to sort AFTER '2'.
|
||||
* @param flag true to turn numeric collation on and false to turn it off
|
||||
* @see #getNumericCollation
|
||||
* @see #setNumericCollationDefault
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public void setNumericCollation(boolean flag)
|
||||
{
|
||||
// true = sort substrings of digits as numbers. NOTE(review): only the flag is stored; no updateInternalState() call here, unlike the code that restores all defaults -- confirm that is intended
|
||||
m_isNumericCollation_ = flag;
|
||||
}
|
||||
|
||||
// public getters --------------------------------------------------------
|
||||
|
||||
/**
|
||||
|
@ -863,6 +892,21 @@ public final class RuleBasedCollator extends Collator
|
|||
return m_variableTopValue_ << 16;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reports whether numeric collation is currently turned on.
|
||||
* When numeric collation is turned on, this Collator generates a collation
|
||||
* key for the numeric value of substrings of digits. This is a way to get
|
||||
* '100' to sort AFTER '2'.
|
||||
* @see #setNumericCollation
|
||||
* @see #setNumericCollationDefault
|
||||
* @return true if numeric collation is turned on, false otherwise
|
||||
* @draft ICU 2.8
|
||||
*/
|
||||
public boolean getNumericCollation()
|
||||
{
|
||||
return m_isNumericCollation_;
|
||||
}
|
||||
|
||||
// public other methods -------------------------------------------------
|
||||
|
||||
/**
|
||||
|
@ -1324,6 +1368,7 @@ public final class RuleBasedCollator extends Collator
|
|||
boolean m_isJamoSpecial_;
|
||||
|
||||
// Collator options ------------------------------------------------------
|
||||
|
||||
int m_defaultVariableTopValue_;
|
||||
boolean m_defaultIsFrenchCollation_;
|
||||
boolean m_defaultIsAlternateHandlingShifted_;
|
||||
|
@ -1332,6 +1377,8 @@ public final class RuleBasedCollator extends Collator
|
|||
int m_defaultDecomposition_;
|
||||
int m_defaultStrength_;
|
||||
boolean m_defaultIsHiragana4_;
|
||||
boolean m_defaultIsNumericCollation_;
|
||||
|
||||
/**
|
||||
* Value of the variable top
|
||||
*/
|
||||
|
@ -1344,6 +1391,10 @@ public final class RuleBasedCollator extends Collator
|
|||
* Case sorting customization
|
||||
*/
|
||||
int m_caseFirst_;
|
||||
/**
|
||||
* Numeric collation option
|
||||
*/
|
||||
boolean m_isNumericCollation_;
|
||||
|
||||
// end Collator options --------------------------------------------------
|
||||
|
||||
|
@ -1515,10 +1566,9 @@ public final class RuleBasedCollator extends Collator
|
|||
Object elements = rb.getObject("CollationElements");
|
||||
if (elements != null) {
|
||||
Object[][] rules = (Object[][])elements;
|
||||
m_rules_ = (String)rules[1][1];
|
||||
// %%CollationBin
|
||||
if(rules[0][1] instanceof byte[]){
|
||||
|
||||
m_rules_ = (String)rules[1][1];
|
||||
byte map[] = (byte [])rules[0][1];
|
||||
BufferedInputStream input =
|
||||
new BufferedInputStream(
|
||||
|
@ -1547,7 +1597,8 @@ public final class RuleBasedCollator extends Collator
|
|||
// due to resource redirection ICUListResourceBundle does not
|
||||
// raise missing resource error
|
||||
//throw new MissingResourceException("Could not get resource for constructing RuleBasedCollator","com.ibm.icu.impl.data.LocaleElements_"+locale.toString(), "%%CollationBin");
|
||||
init((String)rules[1][1]);
|
||||
m_rules_ = (String)rules[0][1];
|
||||
init(m_rules_);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -1606,6 +1657,7 @@ public final class RuleBasedCollator extends Collator
|
|||
m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
|
||||
m_defaultStrength_ = UCA_.m_defaultStrength_;
|
||||
m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
|
||||
m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
|
||||
m_expansionOffset_ = UCA_.m_expansionOffset_;
|
||||
m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
|
||||
m_isCaseLevel_ = UCA_.m_isCaseLevel_;
|
||||
|
@ -1621,6 +1673,7 @@ public final class RuleBasedCollator extends Collator
|
|||
m_top3_ = UCA_.m_top3_;
|
||||
m_topCount3_ = UCA_.m_topCount3_;
|
||||
m_variableTopValue_ = UCA_.m_variableTopValue_;
|
||||
m_isNumericCollation_ = UCA_.m_isNumericCollation_;
|
||||
setWithUCATables();
|
||||
latinOneFailed_ = false;
|
||||
}
|
||||
|
@ -1818,7 +1871,7 @@ public final class RuleBasedCollator extends Collator
|
|||
* Minimum size required for the binary collation data in bytes.
|
||||
* Size of UCA header + size of options to 4 bytes
|
||||
*/
|
||||
private static final int MIN_BINARY_DATA_SIZE_ = (42 + 24) << 2;
|
||||
private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;
|
||||
|
||||
/**
|
||||
* If this collator is to generate only simple tertiaries for fast path
|
||||
|
@ -3679,6 +3732,7 @@ public final class RuleBasedCollator extends Collator
|
|||
m_isCaseLevel_ = m_defaultIsCaseLevel_;
|
||||
m_caseFirst_ = m_defaultCaseFirst_;
|
||||
m_isHiragana4_ = m_defaultIsHiragana4_;
|
||||
m_isNumericCollation_ = m_defaultIsNumericCollation_;
|
||||
latinOneFailed_ = false;
|
||||
updateInternalState();
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue