diff --git a/icu4j/src/com/ibm/icu/impl/UCharacterIterator.java b/icu4j/src/com/ibm/icu/impl/UCharacterIterator.java index fac65c3690e..c699bffac20 100644 --- a/icu4j/src/com/ibm/icu/impl/UCharacterIterator.java +++ b/icu4j/src/com/ibm/icu/impl/UCharacterIterator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/UCharacterIterator.java,v $ - * $Date: 2002/04/03 00:00:00 $ - * $Revision: 1.4 $ + * $Date: 2002/05/14 16:48:49 $ + * $Revision: 1.5 $ * ******************************************************************************* */ @@ -41,27 +41,62 @@ public final class UCharacterIterator implements CharacterIterator // public constructor ------------------------------------------------------ /** - * Public constructor + * Public constructor. + * By default the iteration range will be from 0 to the end of the text. * @param replacable text which the iterator will be based on */ public UCharacterIterator(Replaceable replaceable) { m_replaceable_ = replaceable; m_index_ = 0; - m_length_ = replaceable.length(); + m_start_ = 0; + m_limit_ = replaceable.length(); } /** * Public constructor + * By default the iteration range will be from 0 to the end of the text. * @param str text which the iterator will be based on */ public UCharacterIterator(String str) { m_replaceable_ = new ReplaceableString(str); m_index_ = 0; - m_length_ = m_replaceable_.length(); + m_start_ = 0; + m_limit_ = m_replaceable_.length(); } + /** + * Constructs an iterator over the given range of the given string. 
+ * @param text text to be iterated over + * @param start offset of the first character to iterate + * @param limit offset of the character following the last character to + * iterate + */ + public UCharacterIterator(String str, int start, int limit) + { + m_replaceable_ = new ReplaceableString(str); + m_start_ = start; + m_limit_ = limit; + m_index_ = m_start_; + } + + /** + * Constructs an iterator over the given range of the given replaceable + * string. + * @param text text to be iterated over + * @param start offset of the first character to iterate + * @param limit offset of the character following the last character to + * iterate + */ + public UCharacterIterator(Replaceable replaceable, int start, int limit) + { + m_replaceable_ = replaceable; + m_start_ = start; + m_limit_ = limit; + m_index_ = m_start_; + } + // public methods ---------------------------------------------------------- /** @@ -87,7 +122,7 @@ public final class UCharacterIterator implements CharacterIterator */ public char current() { - if (m_index_ >= 0 && m_index_ < m_length_) { + if (m_index_ >= m_start_ && m_index_ < m_limit_) { return m_replaceable_.charAt(m_index_); } return DONE; @@ -99,7 +134,7 @@ public final class UCharacterIterator implements CharacterIterator */ public int currentCodePoint() { - if (m_index_ >= 0 && m_index_ < m_length_) { + if (m_index_ >= m_start_ && m_index_ < m_limit_) { return m_replaceable_.char32At(m_index_); } return DONE_CODEPOINT; @@ -111,26 +146,28 @@ public final class UCharacterIterator implements CharacterIterator */ public char first() { - m_index_ = 0; + m_index_ = m_start_; return current(); } /** - * Returns the start of the text. - * @return 0 + * Returns the start of the text to iterate. + * @return by default this method will return 0, unless a range for + * iteration had been specified during construction. 
*/ public int getBeginIndex() { - return 0; + return m_start_; } /** - * Returns the length of the text - * @return length of the text + * Returns the limit offset of the text to iterate + * @return by default this method returns the length of the text, unless a + * range for iteration had been specified during construction. */ public int getEndIndex() { - return m_length_; + return m_limit_; } /** @@ -143,31 +180,31 @@ public final class UCharacterIterator implements CharacterIterator } /** - * Gets the last UTF16 character from the text and shifts the index to the - * end of the text accordingly. - * @return the last UTF16 character + * Gets the last UTF16 iterateable character from the text and shifts the + * index to the end of the text accordingly. + * @return the last UTF16 iterateable character */ public char last() { - if (m_length_ != 0) { - m_index_ = m_length_ - 1; + if (m_limit_ != m_start_) { + m_index_ = m_limit_ - 1; return m_replaceable_.charAt(m_index_); } - m_index_ = m_length_; + m_index_ = m_limit_; return DONE; } /** * Returns next UTF16 character and increments the iterator's index by 1. - * If the resulting index is greater or equal to the text length, the - * index is reset to the text length and a value of DONE_CODEPOINT is + * If the resulting index is greater or equal to the iteration limit, the + * index is reset to the text iteration limit and a value of DONE_CODEPOINT is * returned. * @return next UTF16 character in text or DONE if the new index is off the - * end of the text range. + * end of the text iteration limit. */ public char next() { - if (m_index_ < m_length_) { + if (m_index_ < m_limit_) { char result = m_replaceable_.charAt(m_index_); m_index_ ++; return result; @@ -182,20 +219,20 @@ public final class UCharacterIterator implements CharacterIterator * with surrogate pairs intermixed. If the index of a leading or trailing * code unit of a surrogate pair is given, return the code point after the * surrogate pair. 
- * If the resulting index is greater or equal to the text length, the - * current index is reset to the text length and a value of DONE_CODEPOINT - * is returned. + * If the resulting index is greater or equal to the text iterateable limit, + * the current index is reset to the text iterateable limit and a value of + * DONE_CODEPOINT is returned. * @return next codepoint in text or DONE_CODEPOINT if the new index is off the - * end of the text range. + * end of the text iterateable limit. */ public int nextCodePoint() { - if (m_index_ < m_length_) { + if (m_index_ < m_limit_) { char ch = m_replaceable_.charAt(m_index_); m_index_ ++; if (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE && - m_index_ < m_length_) { + m_index_ < m_limit_) { char trail = m_replaceable_.charAt(m_index_); if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { @@ -212,14 +249,15 @@ public final class UCharacterIterator implements CharacterIterator /** * Returns previous UTF16 character and decrements the iterator's index by * 1. - * If the resulting index is less than 0, the index is reset to 0 and a - * value of DONE_CODEPOINT is returned. + * If the resulting index is less than the text iterateable limit, the + * index is reset to the start of the text iteration and a value of + * DONE_CODEPOINT is returned. * @return next UTF16 character in text or DONE if the new index is off the - * start of the text range. + * start of the text iteration range. */ public char previous() { - if (m_index_ > 0) { + if (m_index_ > m_start_) { m_index_ --; return m_replaceable_.charAt(m_index_); } @@ -233,19 +271,20 @@ public final class UCharacterIterator implements CharacterIterator * with surrogate pairs intermixed. If the index of a leading or trailing * code unit of a surrogate pair is given, return the code point before the * surrogate pair. 
- * If the resulting index is less than 0, the current index is reset to 0 - * and a value of DONE_CODEPOINT is returned. + * If the resulting index is less than the text iterateable range, the + * current index is reset to the start of the range and a value of + * DONE_CODEPOINT is returned. * @return previous codepoint in text or DONE_CODEPOINT if the new index is - * off the start of the text range. + * off the start of the text iteration range. */ public int previousCodePoint() { - if (m_index_ > 0) { + if (m_index_ > m_start_) { m_index_ --; char ch = m_replaceable_.charAt(m_index_); if (ch >= UTF16.TRAIL_SURROGATE_MIN_VALUE && ch <= UTF16.TRAIL_SURROGATE_MAX_VALUE && - m_index_ > 0) { + m_index_ > m_start_) { char lead = m_replaceable_.charAt(m_index_); if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { @@ -267,12 +306,11 @@ public final class UCharacterIterator implements CharacterIterator * @exception IllegalArgumentException is thrown if an invalid index is * supplied. i.e. index is out of bounds. * @return the character at the specified index or DONE if the specified - * index is equal to the end of the text. + * index is equal to the limit of the text iteration range. 
*/ public char setIndex(int index) { - int length = m_replaceable_.length(); - if (index < 0 || index > length) { + if (index < m_start_ || index > m_limit_) { throw new IllegalArgumentException("Index index out of bounds"); } m_index_ = index; @@ -290,7 +328,12 @@ public final class UCharacterIterator implements CharacterIterator */ private int m_index_; /** - * Replaceable text length + * Start offset of iterateable range, by default this is 0 */ - private int m_length_; + private int m_start_; + /** + * Limit offset of iterateable range, by default this is the length of the + * string + */ + private int m_limit_; } diff --git a/icu4j/src/com/ibm/icu/text/BOSCU.java b/icu4j/src/com/ibm/icu/text/BOSCU.java new file mode 100644 index 00000000000..b7c0bf38099 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/BOSCU.java @@ -0,0 +1,382 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2002, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/BOSCU.java,v $ +* $Date: 2002/05/14 16:48:48 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ +package com.ibm.icu.text; + +import com.ibm.icu.impl.UCharacterIterator; + +/** + *

/**
 * <p>Binary Ordered Compression Scheme for Unicode.</p>
 *
 * <p>Specific application: encode a Unicode string for the identical level
 * of a sort key.</p>
 *
 * <p>Method: slope detection. Remember the previous code point (initially 0).
 * For each code point in the string, encode the difference to the previous
 * one. With a compact encoding of differences, this yields good results for
 * small scripts and UTF-like results otherwise.</p>
 *
 * <p>Encoding of differences: we encode differences with few bytes if their
 * absolute values are small. For correct ordering, we must treat the entire
 * value range -10ffff..+10ffff in ascending order, which forbids encoding
 * the sign and the absolute value separately. Instead, we split the lead
 * byte range in the middle and encode non-negative values going up and
 * negative values going down.</p>
 *
 * <p>For very small absolute values, the difference is added to a middle
 * byte value for single-byte encoded differences. For somewhat larger
 * absolute values, the difference is divided by the number of byte values
 * available, the modulo is used for one trail byte, and the remainder is
 * added to a lead byte avoiding the single-byte range. For large absolute
 * values, the difference is similarly encoded in three or four bytes.</p>
 *
 * <p>This encoding does not use byte values 0, 1, 2, but uses all other
 * byte values for lead/single bytes so that the middle range of single
 * bytes is as large as possible.</p>
 *
 * <p>Note that the lead byte ranges overlap some, but that the sequences as
 * a whole are well ordered. I.e., even if the lead byte is the same for
 * sequences of different lengths, the trail bytes establish correct order.
 * It would be possible to encode slightly larger ranges for each length
 * (&gt;1) by subtracting the lower bound of the range, but that would also
 * slow down the calculation.</p>
 *
 * <p>For the actual string encoding, an optimization moves the previous
 * code point value to the middle of its Unicode script block to minimize
 * the differences in same-script text runs.</p>
 *
 * @author Syn Wee Quek
 * @since release 2.2, May 3rd 2002
 * @draft 2.2
 */
public class BOSCU
{
    // public methods --------------------------------------------------------

    /**
     * Encodes the code points of a string as a sequence of byte-encoded
     * differences (slope detection), preserving lexical order.
     *
     * <p>Optimizes the difference-taking for runs of Unicode text within
     * small scripts: most small scripts are allocated within aligned
     * 128-blocks of Unicode code points, and lexical order is preserved if
     * "prev" is always moved into the middle of such a block. Additionally,
     * "prev" is moved from anywhere in the Unihan area towards the upper
     * end of that area.</p>
     *
     * <p>Note that the identical-level run in a sort key is generated from
     * NFD text - there are never Hangul characters included.</p>
     *
     * @param source text source
     * @param buffer output buffer receiving the encoded bytes
     * @param offset index in buffer at which to start writing
     * @return end offset just past the last byte written
     */
    public static int writeIdenticalLevelRun(String source, byte buffer[],
                                             int offset)
    {
        int prev = 0;
        int i = 0;
        while (i < source.length()) {
            // iterate by code point; an unpaired surrogate is taken as-is
            int codepoint = source.codePointAt(i);
            i += Character.charCount(codepoint);
            offset = writeDiff(codepoint - middleOfBlock(prev), buffer,
                               offset);
            prev = codepoint;
        }
        return offset;
    }

    /**
     * Computes how many bytes writeIdenticalLevelRun() would write for the
     * same string, without writing them.
     * @param source text source string
     * @return the length in bytes of the encoded result
     */
    public static int lengthOfIdenticalLevelRun(String source)
    {
        int prev = 0;
        int result = 0;
        int i = 0;
        while (i < source.length()) {
            int codepoint = source.codePointAt(i);
            i += Character.charCount(codepoint);
            // measure exactly the difference that writeIdenticalLevelRun
            // encodes for this code point (the previous revision fetched
            // the NEXT code point before taking the difference, skipping
            // the first one and including the end-of-iteration sentinel)
            result += lengthOfDiff(codepoint - middleOfBlock(prev));
            prev = codepoint;
        }
        return result;
    }

    // private data members ---------------------------------------------------

    /**
     * Do not use byte values 0, 1, 2 because they are separators in sort
     * keys.
     */
    private static final int SLOPE_MIN_ = 3;
    private static final int SLOPE_MAX_ = 0xff;
    private static final int SLOPE_MIDDLE_ = 0x81;
    private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_ - SLOPE_MIN_ + 1;
    private static final int SLOPE_MAX_BYTES_ = 4;

    /**
     * Number of lead bytes:
     * 1 middle byte for 0
     * 2*80=160 single bytes for !=0
     * 2*42=84 for double-byte values
     * 2*3=6 for 3-byte values
     * 2*1=2 for 4-byte values
     *
     * The sum must be <=SLOPE_TAIL_COUNT.
     *
     * Why these numbers?
     * - There should be >=128 single-byte values to cover 128-blocks
     *   with small scripts.
     * - There should be >=20902 single/double-byte values to cover Unihan.
     * - It helps CJK Extension B some if there are 3-byte values that cover
     *   the distance between them and Unihan.
     *   This also helps to jump among distant places in the BMP.
     * - Four-byte values are necessary to cover the rest of Unicode.
     *
     * Symmetrical lead byte counts are for convenience.
     * With an equal distribution of even and odd differences there is also
     * no advantage to asymmetrical lead byte counts.
     */
    private static final int SLOPE_SINGLE_ = 80;
    private static final int SLOPE_LEAD_2_ = 42;
    private static final int SLOPE_LEAD_3_ = 3;
    private static final int SLOPE_LEAD_4_ = 1;

    /**
     * The difference value range for single-byters.
     */
    private static final int SLOPE_REACH_POS_1_ = SLOPE_SINGLE_;
    private static final int SLOPE_REACH_NEG_1_ = (-SLOPE_SINGLE_);

    /**
     * The difference value range for double-byters.
     */
    private static final int SLOPE_REACH_POS_2_ =
        SLOPE_LEAD_2_ * SLOPE_TAIL_COUNT_ + SLOPE_LEAD_2_ - 1;
    private static final int SLOPE_REACH_NEG_2_ = (-SLOPE_REACH_POS_2_ - 1);

    /**
     * The difference value range for 3-byters.
     */
    private static final int SLOPE_REACH_POS_3_ =
        SLOPE_LEAD_3_ * SLOPE_TAIL_COUNT_ * SLOPE_TAIL_COUNT_
        + (SLOPE_LEAD_3_ - 1) * SLOPE_TAIL_COUNT_
        + (SLOPE_TAIL_COUNT_ - 1);
    private static final int SLOPE_REACH_NEG_3_ = (-SLOPE_REACH_POS_3_ - 1);

    /**
     * The lead byte start values.
     */
    private static final int SLOPE_START_POS_2_ = SLOPE_MIDDLE_
                                                  + SLOPE_SINGLE_ + 1;
    private static final int SLOPE_START_POS_3_ = SLOPE_START_POS_2_
                                                  + SLOPE_LEAD_2_;
    private static final int SLOPE_START_NEG_2_ = SLOPE_MIDDLE_
                                                  + SLOPE_REACH_NEG_1_;
    private static final int SLOPE_START_NEG_3_ = SLOPE_START_NEG_2_
                                                  - SLOPE_LEAD_2_;

    // private constructor ----------------------------------------------------

    /**
     * Private to prevent instantiation; this class has only static members.
     */
    private BOSCU()
    {
    }

    // private methods --------------------------------------------------------

    /**
     * Moves a previous code point to the "middle" of its block so that
     * differences within same-script runs stay small while lexical order is
     * preserved.
     * @param prev previous code point
     * @return adjusted value used as the base for the next difference
     */
    private static int middleOfBlock(int prev)
    {
        if (prev < 0x4e00 || prev >= 0xa000) {
            // small scripts: middle of the aligned 128-block
            return (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
        }
        // Unihan U+4e00..U+9fa5: double-bytes down from the upper end
        return 0x9fff - SLOPE_REACH_POS_2_;
    }

    /**
     * Integer division and modulo with negative numerators yields negative
     * modulo results and quotients that are one more than what we need
     * here; this returns the floored quotient and a non-negative modulo.
     * @param number the number on which the operations are to be performed
     * @param factor the factor to use for division, must be positive
     * @return (floored result of division) << 32 | (non-negative modulo)
     */
    private static final long getNegDivMod(int number, int factor)
    {
        int modulo = number % factor;
        long result = number / factor;
        if (modulo < 0) {
            -- result;
            modulo += factor;
        }
        return (result << 32) | modulo;
    }

    /**
     * Encodes one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
     * preserving lexical order.
     * @param diff difference value to encode
     * @param buffer byte buffer to append to
     * @param offset index in the byte buffer at which to start appending
     * @return end offset where the appending stops
     */
    private static final int writeDiff(int diff, byte buffer[], int offset)
    {
        if (diff >= SLOPE_REACH_NEG_1_) {
            if (diff <= SLOPE_REACH_POS_1_) {
                buffer[offset ++] = (byte)(SLOPE_MIDDLE_ + diff);
            }
            else if (diff <= SLOPE_REACH_POS_2_) {
                buffer[offset ++] = (byte)(SLOPE_START_POS_2_
                                           + (diff / SLOPE_TAIL_COUNT_));
                buffer[offset ++] = (byte)(SLOPE_MIN_
                                           + (diff % SLOPE_TAIL_COUNT_));
            }
            else if (diff <= SLOPE_REACH_POS_3_) {
                buffer[offset + 2] = (byte)(SLOPE_MIN_
                                            + (diff % SLOPE_TAIL_COUNT_));
                diff /= SLOPE_TAIL_COUNT_;
                buffer[offset + 1] = (byte)(SLOPE_MIN_
                                            + (diff % SLOPE_TAIL_COUNT_));
                buffer[offset] = (byte)(SLOPE_START_POS_3_
                                        + (diff / SLOPE_TAIL_COUNT_));
                offset += 3;
            }
            else {
                buffer[offset + 3] = (byte)(SLOPE_MIN_
                                            + diff % SLOPE_TAIL_COUNT_);
                diff /= SLOPE_TAIL_COUNT_;
                // previous revision wrote buffer[offset] here, leaving
                // offset + 2 unwritten and clobbering the lead byte slot
                buffer[offset + 2] = (byte)(SLOPE_MIN_
                                            + diff % SLOPE_TAIL_COUNT_);
                diff /= SLOPE_TAIL_COUNT_;
                buffer[offset + 1] = (byte)(SLOPE_MIN_
                                            + diff % SLOPE_TAIL_COUNT_);
                buffer[offset] = (byte)SLOPE_MAX_;
                offset += 4;
            }
        }
        else {
            long division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
            int modulo = (int)division;
            if (diff >= SLOPE_REACH_NEG_2_) {
                diff = (int)(division >> 32);
                buffer[offset ++] = (byte)(SLOPE_START_NEG_2_ + diff);
                buffer[offset ++] = (byte)(SLOPE_MIN_ + modulo);
            }
            else if (diff >= SLOPE_REACH_NEG_3_) {
                buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
                diff = (int)(division >> 32);
                division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
                modulo = (int)division;
                diff = (int)(division >> 32);
                buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
                buffer[offset] = (byte)(SLOPE_START_NEG_3_ + diff);
                offset += 3;
            }
            else {
                buffer[offset + 3] = (byte)(SLOPE_MIN_ + modulo);
                diff = (int)(division >> 32);
                division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
                modulo = (int)division;
                diff = (int)(division >> 32);
                buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
                division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
                modulo = (int)division;
                buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
                // 4-byte negative sequences use the lowest lead byte
                buffer[offset] = SLOPE_MIN_;
                offset += 4;
            }
        }
        return offset;
    }

    /**
     * How many bytes would writeDiff() write for this difference?
     * @param diff difference value to measure
     * @return the number of bytes, 1..4
     */
    private static final int lengthOfDiff(int diff)
    {
        if (diff >= SLOPE_REACH_NEG_1_) {
            if (diff <= SLOPE_REACH_POS_1_) {
                return 1;
            }
            else if (diff <= SLOPE_REACH_POS_2_) {
                return 2;
            }
            else if (diff <= SLOPE_REACH_POS_3_) {
                return 3;
            }
            return 4;
        }
        if (diff >= SLOPE_REACH_NEG_2_) {
            return 2;
        }
        else if (diff >= SLOPE_REACH_NEG_3_) {
            return 3;
        }
        return 4;
    }
}

The CollationElementIterator class is used as an iterator + * to walk through each character of an international string. Use the iterator + * to return the ordering priority of the positioned character. The ordering + * priority of a character, which we refer to as a key, defines how a + * character is collated in the given collation object.

+ *

For example, consider the following in Spanish: + *

+ *
+ * "ca" -> the first key is key('c') and second key is key('a').
+ * "cha" -> the first key is key('ch') and second key is key('a').
+ * 
+ *
+ * And in German, + *
+ *
+ * "\u00e4b"-> the first key is key('a'), the second key is key('e'), and
+ * the third key is key('b').
+ * 
+ *
+ *

+ *

The key of a character is an integer composed of primary order(short), + * secondary order(byte), and tertiary order(byte). Java strictly defines + * the size and signedness of its primitive data types. Therefore, the static + * functions primaryOrder, secondaryOrder, and + * tertiaryOrder return int, short, + * and short respectively to ensure the correctness of the key + * value.

+ *

+ * Example of the iterator usage, + *

+ *
+ *  String testString = "This is a test";
+ *  RuleBasedCollator ruleBasedCollator = (RuleBasedCollator)Collator.getInstance();
+ *  CollationElementIterator collationElementIterator = ruleBasedCollator.getCollationElementIterator(testString);
+ *  int primaryOrder = CollationElementIterator.primaryOrder(collationElementIterator.next());
+ * 
+ *
+ *

+ *

+ * CollationElementIterator.next returns the collation order + * of the next character. A collation order consists of primary order, + * secondary order and tertiary order. The data type of the collation + * order is int. The first 16 bits of a collation order + * is its primary order; the next 8 bits is the secondary order and the + * last 8 bits is the tertiary order.

+ * @see Collator + * @see RuleBasedCollator + * @author Syn Wee Quek + * @since release 2.2, April 18 2002 + * @draft 2.2 + */ +public final class CollationElementIterator +{ + // public data members -------------------------------------------------- + + /** + * Null order which indicates the end of string is reached + * @draft 2.2 + */ + public final static int NULLORDER = 0xffffffff; + /** + * Ignorable collation element order. + */ + public static final int IGNORABLE = 0; + + // public methods ------------------------------------------------------- + + // public getters ------------------------------------------------------- + + /** + *

Returns the character offset in the original text corresponding to + * the next collation element. (That is, getOffset() returns the position + * in the text corresponding to the collation element that will be + * returned by the next call to next().) This value could be either + *

+ *

+ *

Note calling getOffset() immediately after setOffset(offset) may not + * return the value offset.

+ * @return The character offset in the original text corresponding to the + * collation element that will be returned by the next call to + * next(). + * @draft 2.2 + */ + public int getOffset() + { + return m_source_.getIndex(); + } + + + /** + * Return the maximum length of any expansion sequences that end with the + * specified collation element. + * @param ce a collation element returned by previous() or next(). + * @return the maximum length of any expansion sequences ending + * with the specified collation element. + * @draft 2.2 + */ + public int getMaxExpansion(int ce) + { + int start = 0; + int limit = m_collator_.m_expansionEndCE_.length; + while (start < limit - 1) { + int mid = start + ((limit - start) >> 1); + if (ce <= m_collator_.m_expansionEndCE_[mid]) { + limit = mid; + } + else { + start = mid; + } + } + int result = 1; + if (m_collator_.m_expansionEndCE_[start] == ce) { + result = m_collator_.m_expansionEndCEMaxSize_[start]; + } + else if (m_collator_.m_expansionEndCE_[limit] == ce) { + result = m_collator_.m_expansionEndCEMaxSize_[limit]; + } + else if ((ce & 0xFFFF) == 0x00C0) { + result = 2; + } + return result; + } + + // public other methods ------------------------------------------------- + + /** + *

Resets the cursor to the beginning of the string. The next call + * to next() will return the first collation element in the string.

+ * @draft 2.2 + */ + public synchronized void reset() + { + m_source_.setIndex(0); + updateInternalState(); + } + + /** + *

Get the next collation element in the string.

+ *

This iterator iterates over a sequence of collation elements that + * were built from the string. Because there isn't necessarily a + * one-to-one mapping from characters to collation elements, this doesn't + * mean the same thing as "return the collation element [or ordering + * priority] of the next character in the string".

+ *

This function returns the collation element that the iterator is + * currently pointing to and then updates the internal pointer to point to + * the next element. previous() updates the pointer first and then + * returns the element. This means that when you change direction while + * iterating (i.e., call next() and then call previous(), or call + * previous() and then call next()), you'll get back the same element + * twice.

+ * @return the next collation element + * @draft 2.2 + */ + public synchronized int next() + { + m_isForwards_ = true; + if (m_CEBufferSize_ > 0) { + if (m_CEBufferOffset_ < m_CEBufferSize_) { + // if there are expansions left in the buffer, we return it + return m_CEBuffer_[m_CEBufferOffset_ ++]; + } + m_CEBufferSize_ = 0; + m_CEBufferOffset_ = 0; + } + + char ch = nextChar(); + /* System.out.println("ch " + Integer.toHexString(ch) + " " + + Integer.toHexString(m_source_.current()));*/ + if (ch == CharacterIterator.DONE) { + return NULLORDER; + } + if (m_collator_.m_isHiragana4_) { + m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x3094) + || ch == 0x309d || ch == 0x309e; + } + + int result = NULLORDER; + if (ch <= 0xFF) { + // For latin-1 characters we never need to fall back to the UCA + // table because all of the UCA data is replicated in the + // latinOneMapping array + result = m_collator_.m_trie_.getLatin1LinearValue(ch); + if (RuleBasedCollator.isSpecial(result)) { + result = nextSpecial(m_collator_, result, ch); + } + } + else + { + result = m_collator_.m_trie_.getLeadValue(ch); + //System.out.println(Integer.toHexString(result)); + if (RuleBasedCollator.isSpecial(result)) { + // surrogate leads are handled as special ces + result = nextSpecial(m_collator_, result, ch); + } + if (result == CE_NOT_FOUND_) { + // couldn't find a good CE in the tailoring + // if we got here, the codepoint MUST be over 0xFF - so we look + // directly in the UCA + result = m_collator_.UCA_.m_trie_.getLeadValue(ch); + if (RuleBasedCollator.isSpecial(result)) { + // UCA also gives us a special CE + result = nextSpecial(m_collator_.UCA_, result, ch); + } + } + } + return result; + } + + /** + *

Get the previous collation element in the string.

+ *

This iterator iterates over a sequence of collation elements that + * were built from the string. Because there isn't necessarily a + * one-to-one mapping from characters to collation elements, this doesn't + * mean the same thing as "return the collation element [or ordering + * priority] of the previous character in the string".

+ *

This function updates the iterator's internal pointer to point to + * the collation element preceding the one it's currently pointing to and + * then returns that element, while next() returns the current element and + * then updates the pointer. This means that when you change direction + * while iterating (i.e., call next() and then call previous(), or call + * previous() and then call next()), you'll get back the same element + * twice.

+ * @return the previous collation element + * @draft 2.2 + */ + public synchronized int previous() + { + if (m_source_.getIndex() <= 0 && m_isForwards_) { + // if iterator is new or reset, we can immediate perform backwards + // iteration even when the offset is not right. + m_source_.setIndex(m_source_.getEndIndex()); + updateInternalState(); + } + m_isForwards_ = false; + int result = NULLORDER; + if (m_CEBufferSize_ > 0) { + if (m_CEBufferOffset_ > 0) { + return m_CEBuffer_[-- m_CEBufferOffset_]; + } + m_CEBufferSize_ = 0; + m_CEBufferOffset_ = 0; + } + char ch = previousChar(); + if (ch == CharacterIterator.DONE) { + return NULLORDER; + } + if (m_collator_.m_isHiragana4_) { + m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309f); + } + if (m_collator_.isContractionEnd(ch) && !isBackwardsStart()) { + result = previousSpecial(m_collator_, CE_CONTRACTION_, ch); + } + else { + if (ch <= 0xFF) { + result = m_collator_.m_trie_.getLatin1LinearValue(ch); + if (RuleBasedCollator.isSpecial(result)) { + result = previousSpecial(m_collator_, result, ch); + } + } + else { + if (m_bufferOffset_ < 0 && isThaiBaseConsonant(ch) + && m_source_.getIndex() != 0) { + if (isThaiPreVowel(m_source_.previous())) { + result = CE_THAI_; + } + else { + result = m_collator_.m_trie_.getLeadValue(ch); + } + m_source_.next(); + } + else { + result = m_collator_.m_trie_.getLeadValue(ch); + } + if (RuleBasedCollator.isSpecial(result)) { + result = previousSpecial(m_collator_, result, ch); + } + if (result == CE_NOT_FOUND_) { + if (!isBackwardsStart() + && m_collator_.isContractionEnd(ch)) { + result = CE_CONTRACTION_; + } + else { + result = m_collator_.m_trie_.getLeadValue(ch); + } + + if (RuleBasedCollator.isSpecial(result)) { + result = previousSpecial(m_collator_.UCA_, result, ch); + } + } + } + } + return result; + } + + /** + * Return the primary strength of a collation element. 
+ * @param ce the collation element + * @return the element's primary strength + * @draft 2.2 + */ + public final static int primaryOrder(int ce) + { + return (ce & RuleBasedCollator.CE_PRIMARY_MASK_) >> CE_PRIMARY_SHIFT_; + } + /** + * Return the secondary strength of a collation element. + * @param ce the collation element + * @return the element's secondary strength + * @draft 2.2 + */ + public final static short secondaryOrder(int ce) + { + return (short)((ce & RuleBasedCollator.CE_SECONDARY_MASK_) + >> CE_SECONDARY_SHIFT_); + } + + /** + * Return the tertiary strength of a collation element. + * @param colelem the collation element + * @return the element's tertiary strength + * @draft 2.2 + */ + public final static short tertiaryOrder(int ce) + { + return (short)(ce & RuleBasedCollator.CE_TERTIARY_MASK_); + } + + /** + *

Sets the iterator to point to the collation element corresponding to + * the specified character (the parameter is a CHARACTER offset in the + * original string, not an offset into its corresponding sequence of + * collation elements). The value returned by the next call to next() + * will be the collation element corresponding to the specified position + * in the text. If that position is in the middle of a contracting + * character sequence, the result of the next call to next() is the + * collation element for that sequence. This means that getOffset() + * is not guaranteed to return the same value as was passed to a preceding + * call to setOffset().

+ * @param offset new character offset into the original text to set. + * @draft 2.2 + */ + public void setOffset(int offset) + { + m_source_.setIndex(offset); + char ch = m_source_.current(); + if (m_collator_.isUnsafe(ch)) { + // if it is unsafe we need to check if it is part of a contraction + // or a surrogate character + if (UTF16.isTrailSurrogate(ch)) { + // if it is a surrogate pair we move up one character + char prevch = m_source_.previous(); + if (!UTF16.isLeadSurrogate(prevch)) { + m_source_.setIndex(offset); // go back to the same index + } + } + else { + // could be part of a contraction + // backup to a safe point and iterate till we pass offset + while (m_source_.getIndex() > 0) { + if (!m_collator_.isUnsafe(ch)) { + break; + } + ch = m_source_.previous(); + } + updateInternalState(); + int prevoffset = 0; + while (m_source_.getIndex() < offset) { + prevoffset = m_source_.getIndex(); + next(); + } + m_source_.setIndex(prevoffset); + } + } + updateInternalState(); + } + + /** + *

Set a new string over which to iterate.

+ *

Iteration will start from the start of source.

+ * @param source the new source text. + * @draft 2.2 + */ + public synchronized void setText(String source) + { + m_source_ = new StringCharacterIterator(source); + updateInternalState(); + } + + /** + *

Set a new string iterator over which to iterate.

+ *

Iteration will start from the start of source.

+ * @param source the new source text. + * @draft 2.2 + */ + public synchronized void setText(CharacterIterator source) + { + m_source_ = source; + m_source_.setIndex(0); + updateInternalState(); + } + + // protected data members ----------------------------------------------- + + /** + * true if current codepoint was Hiragana + */ + protected boolean m_isCodePointHiragana_; + + // protected constructors ----------------------------------------------- + + /** + *

CollationElementIterator constructor. This takes the source string
+ * and the Collator. The cursor will walk through the source string based
+ * on the predefined collation rules. If the source string is empty,
+ * NULLORDER will be returned on the calls to next().

+ * @param source the source string. + * @param collator the RuleBasedCollator + * @draft 2.2 + */ + CollationElementIterator(String source, RuleBasedCollator collator) + { + m_source_ = new StringCharacterIterator(source); + m_collator_ = collator; + m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_]; + m_buffer_ = new StringBuffer(); + m_backup_ = new Backup(); + updateInternalState(); + } + + /** + *

CollationElementIterator constructor. This takes the source string
+ * and the Collator. The cursor will walk through the source string based
+ * on the predefined collation rules. If the source string is empty,
+ * NULLORDER will be returned on the calls to next().

+ * @param source the source string iterator. + * @param collator the RuleBasedCollator + * @draft 2.2 + */ + CollationElementIterator(CharacterIterator source, + RuleBasedCollator collator) + { + m_source_ = source; + m_collator_ = collator; + m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_]; + m_buffer_ = new StringBuffer(); + m_backup_ = new Backup(); + updateInternalState(); + } + + // private data members ------------------------------------------------- + + // private inner class -------------------------------------------------- + + /** + * Backup data class + */ + private static class Backup + { + // protected data members ------------------------------------------- + + /** + * Backup non FCD sequence limit + */ + protected int m_FCDLimit_; + /** + * Backup non FCD sequence start + */ + protected int m_FCDStart_; + /** + * Backup if previous Codepoint is Hiragana quatenary + */ + protected boolean m_isCodePointHiragana_; + /** + * Backup buffer position + */ + protected int m_bufferOffset_; + /** + * Backup source iterator offset + */ + protected int m_offset_; + /** + * Backup buffer contents + */ + protected StringBuffer m_buffer_; + + // protected constructor -------------------------------------------- + + /** + * Empty constructor + */ + protected Backup() + { + m_buffer_ = new StringBuffer(); + } + } + // end inner class ------------------------------------------------------ + + /** + * Direction of travel + */ + private boolean m_isForwards_; + /** + * Source string iterator + */ + private CharacterIterator m_source_; + /** + * This is position to the m_buffer_, -1 if iterator is not in m_buffer_ + */ + private int m_bufferOffset_; + /** + * This is the CE from CEs buffer that should be returned + */ + private int m_CEBufferOffset_; + /** + * This is the position to which we have stored processed CEs + */ + private int m_CEBufferSize_; + /** + * Buffer for temporary storage of normalized characters, discontiguous + * characters and Thai characters + */ + 
private StringBuffer m_buffer_; + /** + * Position in the original string to continue forward FCD check from. + */ + private int m_FCDLimit_; + /** + * Position in the original string that starts with a non-FCD sequence + */ + private int m_FCDStart_; + /** + * The collator this iterator is based on + */ + private RuleBasedCollator m_collator_; + /** + * true if Hiragana quatenary is on + */ + private boolean m_isHiragana4_; + /** + * CE buffer + */ + private int m_CEBuffer_[]; + /** + * In reality we should not have to deal with expansion sequences longer + * then 16. However this value can be change if a bigger buffer is needed. + * Note, if the size is change to too small a number, BIG trouble. + * Reasonable small value is around 10, if there's no Arabic or other + * funky collations that have long expansion sequence. This is the longest + * expansion sequence this can handle without bombing out. + */ + private static final int CE_BUFFER_INIT_SIZE_ = 512; + /** + * Backup storage + */ + private Backup m_backup_; + /** + * One character before the first non-zero combining class character + */ + private static final int FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0xC0; + /** + * One character before the first character with leading non-zero combining + * class + */ + private static final int LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0x300; + /** + * Mask for the last byte + */ + private static final int LAST_BYTE_MASK_ = 0xFF; + /** + * Shift value for the second last byte + */ + private static final int SECOND_LAST_BYTE_SHIFT_ = 8; + + // special ce values and tags ------------------------------------------- + private static final int CE_NOT_FOUND_ = 0xF0000000; + private static final int CE_EXPANSION_ = 0xF1000000; + private static final int CE_CONTRACTION_ = 0xF2000000; + private static final int CE_THAI_ = 0xF3000000; + /** + * Indicates the last ce has been consumed. Compare with NULLORDER. + * NULLORDER is returned if error occurs. 
+ */ + private static final int CE_NO_MORE_CES_ = 0x00010101; + private static final int CE_NO_MORE_CES_PRIMARY_ = 0x00010000; + private static final int CE_NO_MORE_CES_SECONDARY_ = 0x00000100; + private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001; + + private static final int CE_NOT_FOUND_TAG_ = 0; + private static final int CE_EXPANSION_TAG_ = 1; + private static final int CE_CONTRACTION_TAG_ = 2; + private static final int CE_THAI_TAG_ = 3; + /** + * Charset processing, not yet implemented + */ + private static final int CE_CHARSET_TAG_ = 4; + /** + * AC00-D7AF + */ + private static final int CE_HANGUL_SYLLABLE_TAG_ = 6; + /** + * D800-DBFF + */ + private static final int CE_LEAD_SURROGATE_TAG_ = 7; + /** + * DC00-DFFF + */ + private static final int CE_TRAIL_SURROGATE_TAG_ = 8; + /** + * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D + */ + private static final int CE_CJK_IMPLICIT_TAG_ = 9; + private static final int CE_IMPLICIT_TAG_ = 10; + private static final int CE_SPEC_PROC_TAG_ = 11; + /** + * This is a 3 byte primary with starting secondaries and tertiaries. + * It fits in a single 32 bit CE and is used instead of expansion to save + * space without affecting the performance (hopefully). 
+ */ + private static final int CE_LONG_PRIMARY_TAG_ = 12; + private static final int CE_CE_TAGS_COUNT = 13; + private static final int CE_BYTE_COMMON_ = 0x05; + private static final int CE_PRIMARY_SHIFT_ = 16; + private static final int CE_SECONDARY_SHIFT_ = 8; + + // end special ce values and tags --------------------------------------- + + private static final int IMPLICIT_HAN_START_ = 0x3400; + private static final int IMPLICIT_HAN_LIMIT_ = 0xA000; + private static final int IMPLICIT_SUPPLEMENTARY_COUNT_ = 0x100000; + private static final int IMPLICIT_BYTES_TO_AVOID_ = 3; + private static final int IMPLICIT_OTHER_COUNT_ = + 256 - IMPLICIT_BYTES_TO_AVOID_; + private static final int IMPLICIT_LAST_COUNT_ = IMPLICIT_OTHER_COUNT_ >> 1; + private static final int IMPLICIT_LAST_COUNT2_ = + (IMPLICIT_SUPPLEMENTARY_COUNT_ - 1) / + (IMPLICIT_OTHER_COUNT_ * IMPLICIT_OTHER_COUNT_) + 1; + private static final int IMPLICIT_HAN_SHIFT_ = IMPLICIT_LAST_COUNT_ * + IMPLICIT_OTHER_COUNT_ - IMPLICIT_HAN_START_; + private static final int IMPLICIT_BOUNDARY_ = 2 * IMPLICIT_OTHER_COUNT_ * + IMPLICIT_LAST_COUNT_ + IMPLICIT_HAN_START_; + private static final int IMPLICIT_LAST2_MULTIPLIER_ = + IMPLICIT_OTHER_COUNT_ / IMPLICIT_LAST_COUNT2_; + private static final int HANGUL_SBASE_ = 0xAC00; + private static final int HANGUL_LBASE_ = 0x1100; + private static final int HANGUL_VBASE_ = 0x1161; + private static final int HANGUL_TBASE_ = 0x11A7; + private static final int HANGUL_VCOUNT_ = 21; + private static final int HANGUL_TCOUNT_ = 28; + // private methods ------------------------------------------------------ + + /** + * Reset the iterator internally + */ + private void updateInternalState() + { + m_isCodePointHiragana_ = false; + m_bufferOffset_ = -1; + m_CEBufferOffset_ = 0; + m_CEBufferSize_ = 0; + m_FCDLimit_ = -1; + m_FCDStart_ = m_source_.getEndIndex(); + m_isHiragana4_ = m_collator_.m_isHiragana4_; + m_isForwards_ = true; + } + + /** + * Backup the current internal state + * 
@param backup object to store the data + */ + private void backupInternalState(Backup backup) + { + backup.m_offset_ = m_source_.getIndex(); + backup.m_FCDLimit_ = m_FCDLimit_; + backup.m_FCDStart_ = m_FCDStart_; + backup.m_isCodePointHiragana_ = m_isCodePointHiragana_; + backup.m_bufferOffset_ = m_bufferOffset_; + if (m_bufferOffset_ >= 0) { + backup.m_buffer_.append(m_buffer_); + } + } + + /** + * Update the iterator internally with backed-up state + * @param backup object that stored the data + */ + private void updateInternalState(Backup backup) + { + m_source_.setIndex(backup.m_offset_); + m_isCodePointHiragana_ = backup.m_isCodePointHiragana_; + m_bufferOffset_ = backup.m_bufferOffset_; + m_FCDLimit_ = backup.m_FCDLimit_; + m_FCDStart_ = backup.m_FCDStart_; + m_buffer_.delete(0, m_buffer_.length()); + if (m_bufferOffset_ >= 0) { + m_buffer_.append(backup.m_buffer_); + } + } + + /** + * A fast combining class retrieval system. + * @param ch UTF16 character + * @return combining class of ch + */ + private int getCombiningClass(char ch) + { + if (ch >= LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ && + m_collator_.isUnsafe(ch)) { + return NormalizerImpl.getCombiningClass(ch); + } + return 0; + } + + /** + *

Incremental normalization, this is an essential optimization.
+ * Assuming FCD checks have been done, normalize the non-FCD characters into
+ * the buffer.
+ * The source offset points to the current processing character.
+ *

+ */ + private void normalize() + { + /* synwee todo normalize to 1 before fcd + try { + decompose(m_buffer_, m_source_, m_FCDStart_, m_FCDLimit_, + m_collator_.m_decomposition_); + } + catch (ArrayOutOfBoundsException e) { + // increase the size of the buffer + m_buffer_ = new char[m_buffer_.length << 1]; + decompose(m_buffer_, m_source_, m_FCDStart_, m_FCDLimit_, + m_collator_.m_decomposition_); + } + */ + m_bufferOffset_ = 0; + } + + /** + *

Incremental FCD check and normalization. Gets the next base character
+ * position and determines if the in-between characters need normalization.
+ *

+ *

When entering, the state is known to be this: + *

+ * The incoming source offset points to the next processing character.
+ * On return, the source offset points to the current processing character.
+ *

+ * @return true if FCDCheck passes, false otherwise + */ + private boolean FCDCheck() + { + boolean result = true; + + // srcP = collationSource->pos-1; + + // Get the trailing combining class of the current character. + // If it's zero, we are OK. + char ch = m_source_.previous(); + m_FCDStart_ = m_source_.getIndex(); + // trie access + char fcd = 0; // synwee todo: unorm_getFCD16(ch); + if (fcd != 0 && UTF16.isLeadSurrogate(ch)) { + ch = m_source_.next(); // CharacterIterator.DONE has 0 fcd + if (UTF16.isTrailSurrogate(ch)) { + fcd = 0xFFFF; // unorm_getFCD16FromSurrogatePair(fcd, ch); + } else { + fcd = 0; + } + } + + byte prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_); + + if (prevTrailCC != 0) { + // The current char has a non-zero trailing CC. Scan forward until + // we find a char with a leading cc of zero. + while (true) { + ch = m_source_.next(); + if (ch == CharacterIterator.DONE) { + break; + } + // trie access + fcd = 0; // unorm_getFCD16(ch); + if (fcd != 0 && UTF16.isLeadSurrogate(ch)) { + ch = m_source_.next(); + if (UTF16.isTrailSurrogate(ch)) { + fcd = 0xFFFF; // unorm_getFCD16FromSurrogatePair(fcd, ch); + } else { + fcd = 0; + } + } + byte leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_); + if (leadCC == 0) { + // this is a base character, we stop the FCD checks + break; + } + + if (leadCC < prevTrailCC) { + result = false; + } + + prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_); + } + } + m_source_.setIndex(m_FCDStart_); + m_FCDLimit_ = m_source_.getIndex(); + return result; + } + + /** + *

Method tries to fetch the next character that is in fcd form.

+ *

Normalization is done if required.

+ *

Offsets are returned at the next character.

+ * @return next fcd character + */ + private char nextChar() + { + char result; + // loop handles the next character whether it is in the buffer or not. + if (m_bufferOffset_ == -1) { + // we're working on the source and not normalizing. fast path. + // note Thai pre-vowel reordering uses buffer too + result = m_source_.current(); + } + else { + // we are in the buffer, buffer offset will never be 0 here + result = m_buffer_.charAt(m_bufferOffset_ ++); + if (result == 0) { + // Null marked end of buffer, revert to the source string and + // loop back to top to try again to get a character. + m_source_.setIndex(m_FCDLimit_); + m_bufferOffset_ = -1; + m_buffer_.delete(0, m_buffer_.length()); + return nextChar(); + } + } + + if (m_collator_.m_decomposition_ == Collator.NO_DECOMPOSITION + || m_bufferOffset_ != -1 || m_FCDLimit_ > m_source_.getIndex() + // skip the fcd checks + || result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ + // Fast fcd safe path. trail combining class == 0. + ) { + m_source_.next(); + return result; + } + + if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) { + // We need to peek at the next character in order to tell if we are + // FCD + char next = m_source_.next(); + if (next == CharacterIterator.DONE + || next == LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) { + return result; // end of source string and if next character + // starts with a base character is always fcd. + } + } + + // Need a more complete FCD check and possible normalization. + if (!FCDCheck()) { + normalize(); + result = m_buffer_.charAt(0); + m_bufferOffset_ = 1; + } + m_source_.next(); + return result; + } + + /** + *

Incremental normalization, this is an essential optimization.
+ * Assuming FCD checks have been done, normalize the non-FCD characters into
+ * the buffer.
+ * The source offset points to the current processing character.

+ */ + public void normalizeBackwards() + { + int start = m_FCDStart_; + int size = 0; + /* synwee todo normalize including fcd + try { + size = decompose(m_buffer_, m_source_, start, m_FCDLimit_); + } + catch (ArrayOutOfBoundsException .) { + m_buffer_ = new char[m_buffer_.length << 1]; + size = decompose(m_buffer_, m_source_, start, m_FCDLimit); + } + */ + m_bufferOffset_ = size - 1; + } + + /** + *

Incremental backwards FCD check and normalization. Gets the previous
+ * base character position and determines if the in-between characters
+ * need normalization.
+ *

+ *

When entering, the state is known to be this: + *

+ * The input source offset points to the previous character.
+ * On return, the source offset points to the current processing character.
+ *

+ * @return true if FCDCheck passes, false otherwise + */ + private boolean FCDCheckBackwards() + { + boolean result = true; + char ch = m_source_.next(); + char fcd = 0; + m_FCDLimit_ = m_source_.getIndex(); + if (!UTF16.isSurrogate(ch)) { + fcd = 0; // synwee todo unorm_getFCD16(fcdTrieIndex, c); + } + else if (UTF16.isTrailSurrogate(ch) && m_FCDLimit_ > 0) { + // note trail surrogate characters gets 0 fcd + ch = m_source_.previous(); + if (UTF16.isLeadSurrogate(ch)) { + fcd = 0; // unorm_getFCD16(fcdTrieIndex, c2); + if (fcd != 0) { + fcd = 0; // unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c); + } + } + else { + fcd = 0; // unpaired surrogate + } + } + + byte leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_); + if (leadCC != 0) { + // The current char has a non-zero leading combining class. + // Scan backward until we find a char with a trailing cc of zero. + while (true) { + if (m_source_.getIndex() == 0) { + break; + } + ch = m_source_.previous(); + if (!UTF16.isSurrogate(ch)) { + fcd = 0; //unorm_getFCD16(fcdTrieIndex, c); + } + else { + if (UTF16.isTrailSurrogate(ch) && m_source_.getIndex() > 0) + { + ch = m_source_.previous(); + if (UTF16.isLeadSurrogate(ch)) { + fcd = 0; // unorm_getFCD16(fcdTrieIndex, c2); + } + if (fcd != 0) { + fcd = 0; // unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c); + } + } else { + fcd = 0; // unpaired surrogate + } + byte prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_); + if (prevTrailCC == 0) { + break; + } + + if (leadCC < prevTrailCC) { + result = false; + } + leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_); + } + } + } + m_FCDStart_ = m_source_.getIndex(); // character with 0 lead/trail fcd + m_source_.setIndex(m_FCDLimit_); + return result; + } + + /** + *

Method tries to fetch the previous character that is in fcd form.

+ *

Normalization is done if required.

+ *

Offsets are returned at the current character.

+ * @return previous fcd character + */ + private char previousChar() + { + if (m_bufferOffset_ >= 0) { + m_bufferOffset_ --; + if (m_bufferOffset_ >= 0) { + return m_buffer_.charAt(m_bufferOffset_); + } + else { + // At the start of buffer, route back to string. + m_buffer_.delete(0, m_buffer_.length()); + if (m_FCDStart_ == 0) { + m_FCDStart_ = -1; + return CharacterIterator.DONE; + } + else { + m_FCDLimit_ = m_FCDStart_; + return previousChar(); + } + } + } + char result = m_source_.previous(); + if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ + || m_collator_.m_decomposition_ == Collator.NO_DECOMPOSITION + || m_FCDStart_ <= m_source_.getIndex() + || m_source_.getIndex() == 0) { + return result; + } + char ch = m_source_.previous(); + if (ch < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_) { + // if previous character is FCD + m_source_.next(); + return result; + } + // Need a more complete FCD check and possible normalization. + if (!FCDCheckBackwards()) { + normalizeBackwards(); + m_bufferOffset_ --; + result = m_buffer_.charAt(m_bufferOffset_); + } + return result; + } + + /** + * Determines if it is at the start of source iteration + * @return true if iterator at the start, false otherwise + */ + private boolean isBackwardsStart() + { + return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0) + || (m_bufferOffset_ == 0 && m_FCDStart_ <= 0); + } + + /** + * Determine if a character is a Thai vowel, which sorts after its base + * consonant. 
+ * @param ch character to test + * @return true if ch is a Thai prevowel, false otherwise + */ + private boolean isThaiPreVowel(char ch) + { + return (ch >= 0xe40 && ch <= 0xe44) || (ch >= 0xec0 && ch <= 0xec4); + } + + /** + * Determine if a character is a Thai base consonant, which sorts before + * its prevowel + * @param ch character to test + * @return true if ch is a Thai base consonant, false otherwise + */ + private boolean isThaiBaseConsonant(char ch) + { + return ch >= 0xe01 && ch <= 0xe2e; + } + + + /** + * Determine if a character is a Jamo + * @param ch character to test + * @return true if ch is a Jamo, false otherwise + */ + private boolean isJamo(char ch) + { + return (ch - 0x1100 <= 0x1112 - 0x1100) + || (ch - 0x1161 <= 0x1175 - 0x1161) + || (ch - 0x11A8 <= 0x11C2 - 0x11A8); + } + + /** + * Checks if iterator is at the end of its source string. + * @return true if it is at the end, false otherwise + */ + private boolean isEnd() + { + if (m_bufferOffset_ >= 0) { + if (m_bufferOffset_ != m_buffer_.length()) { + return false; + } + else { + // at end of buffer. check if fcd is at the end + return m_FCDLimit_ == m_source_.getEndIndex(); + } + } + return m_source_.getEndIndex() == m_source_.getIndex(); + } + + /** + *

Special CE management for surrogates

+ *

Lead surrogate is encountered. CE to be retrieved by using the + * following code unit. If next character is a trail surrogate, both + * characters will be combined to retrieve the CE, otherwise completely + * ignorable (UCA specification) is returned.

+ * @param collator collator to use + * @param ce current CE + * @param trail character + * @return next CE for the surrogate characters + */ + private int nextSurrogate(RuleBasedCollator collator, int ce, char trail) + { + if (!UTF16.isTrailSurrogate(trail)) { + updateInternalState(m_backup_); + return IGNORABLE; + } + // TODO: CE contain the data from the previous CE + the mask. + // It should at least be unmasked + int result = collator.m_trie_.getTrailValue(ce, trail); + if (result == CE_NOT_FOUND_) { + updateInternalState(m_backup_); + } + return result; + } + + /** + * Gets the CE expansion offset + * @param collator current collator + * @param ce ce to test + * @return expansion offset + */ + private int getExpansionOffset(RuleBasedCollator collator, int ce) + { + return ((ce & 0xFFFFF0) >> 4) - collator.m_expansionOffset_; + } + + /** + * Swaps the Thai and Laos characters and returns the CEs. + * @param collator collator to use + * @param ce current ce + * @param ch current character + * @return next CE for Thai characters + */ + private int nextThai(RuleBasedCollator collator, int ce, char ch) + { + if (m_bufferOffset_ != -1 // already swapped + || isEnd() || !isThaiBaseConsonant(m_source_.current())) { + // next character is also not a thai base consonant + // Treat Thai as a length one expansion + // find the offset to expansion table + return collator.m_expansion_[getExpansionOffset(collator, ce)]; + } + else { + // swap the prevowel and the following base consonant into the + // buffer with their order swapped + // buffer is always clean when we are in the source string + m_buffer_.append(nextChar()); + m_buffer_.append(ch); + m_FCDLimit_ = m_source_.getIndex(); + m_FCDStart_ = m_FCDLimit_ - 2; + m_bufferOffset_ = 0; + return IGNORABLE; + } + } + + /** + * Gets the contraction ce offset + * @param collator current collator + * @param ce current ce + * @return contraction offset + */ + private int getContractionOffset(RuleBasedCollator collator, int 
ce) + { + return (ce & 0xFFFFFF) - collator.m_contractionOffset_; + } + + /** + * Checks if CE is a special tag CE + * @param ce to check + * @return true if CE is a special tag CE, false otherwise + */ + private boolean isSpecialPrefixTag(int ce) + { + return RuleBasedCollator.isSpecial(ce) && + RuleBasedCollator.getTag(ce) == CE_SPEC_PROC_TAG_; + } + + /** + *

Special processing getting a CE that is preceded by a certain + * prefix.

+ *

Used for optimizing Japanese length and iteration marks. When a + * special processing tag is encountered, iterate backwards to see if + * there's a match.

+ *

Contraction tables are used, prefix data is stored backwards in the + * table.

+ * @param collator collator to use + * @param ce current ce + * @param entrybackup entry backup iterator status + * @return next collation element + */ + private int nextSpecialPrefix(RuleBasedCollator collator, int ce, + Backup entrybackup) + { + backupInternalState(m_backup_); + updateInternalState(entrybackup); + previousChar(); + // We want to look at the character where we entered + + while (true) { + // This loop will run once per source string character, for as + // long as we are matching a potential contraction sequence + // First we position ourselves at the begining of contraction + // sequence + int entryoffset = getContractionOffset(collator, ce); + int offset = entryoffset; + if (isBackwardsStart()) { + ce = collator.m_contractionCE_[offset]; + break; + } + int previous = previousChar(); + while (previous > collator.m_contractionIndex_[offset]) { + // contraction characters are ordered, skip smaller characters + offset ++; + } + + if (previous == collator.m_contractionIndex_[offset]) { + // Found the source string char in the table. + // Pick up the corresponding CE from the table. + ce = collator.m_contractionCE_[offset]; + } + else { + // Source string char was not in the table, prefix not found + ce = collator.m_contractionCE_[entryoffset]; + } + + if (!isSpecialPrefixTag(ce)) { + // The source string char was in the contraction table, and + // the corresponding CE is not a prefix CE. We found the + // prefix, break out of loop, this CE will end up being + // returned. This is the normal way out of prefix handling + // when the source actually contained the prefix. 
+ break; + } + } + if (ce != CE_NOT_FOUND_) { + // we found something and we can merilly continue + updateInternalState(m_backup_); + } + else { // prefix search was a failure, we have to backup all the way to + // the start + updateInternalState(entrybackup); + } + return ce; + } + + /** + * Checks if the ce is a contraction tag + * @param ce ce to check + * @return true if ce is a contraction tag, false otherwise + */ + private boolean isContractionTag(int ce) + { + return RuleBasedCollator.isSpecial(ce) && + RuleBasedCollator.getTag(ce) == CE_CONTRACTION_TAG_; + } + + /** + * Method to copy skipped characters into the buffer and sets the fcd + * position. To ensure that the skipped characters are considered later, + * we need to place it in the appropriate position in the buffer and + * reassign the source index. simple case if index reside in string, + * simply copy to buffer and fcdposition = pos, pos = start of buffer. + * if pos in normalization buffer, we'll insert the copy infront of pos + * and point pos to the start of the buffer. why am i doing these copies? + * well, so that the whole chunk of codes in the getNextCE, + * ucol_prv_getSpecialCE does not require any changes, which will be + * really painful. 
+ * @param skipped character buffer + */ + private void setDiscontiguous(StringBuffer skipped) + { + if (m_bufferOffset_ >= 0) { + skipped.append(m_buffer_.substring(m_bufferOffset_)); + } + else { + m_FCDLimit_ = m_source_.getIndex(); + } + + m_bufferOffset_ = 0; + m_buffer_ = skipped; + } + + /** + * Returns the current character for forward iteration + * @return current character + */ + private char currentChar() + { + if (m_bufferOffset_ < 0) { + char result = m_source_.previous(); + m_source_.next(); + return result; + } + + // m_bufferOffset_ is never 0 in normal circumstances except after a + // discontiguous contraction since it is always returned and moved + // by 1 when we do nextChar() + return m_buffer_.charAt(m_bufferOffset_ - 1); + } + + /** + * Method to get the discontiguous collation element within the source. + * Note this function will set the position to the appropriate places. + * Passed in character offset points to the second combining character + * after the start character. 
+ * @param collator current collator used + * @param entryoffset index to the start character in the contraction table + * @return discontiguous collation element offset + */ + private int nextDiscontiguous(RuleBasedCollator collator, int entryoffset) + { + int offset = entryoffset; + boolean multicontraction = false; + StringBuffer skipped = new StringBuffer(); + char ch = currentChar(); + skipped.append(currentChar()); // accent after the first character + Backup backup = new Backup(); + backupInternalState(backup); + char nextch = ch; + while (true) { + ch = nextch; + nextch = nextChar(); + if (nextch == CharacterIterator.DONE + || getCombiningClass(nextch) == 0) { + // if there are no more accents to move around + // we don't have to shift previousChar, since we are resetting + // the offset later + if (multicontraction) { + setDiscontiguous(skipped); + return collator.m_contractionCE_[offset]; + } + break; + } + + offset ++; // skip the combining class offset + while (nextch > collator.m_contractionIndex_[offset]) { + offset ++; + } + + int ce = CE_NOT_FOUND_; + if (nextch != collator.m_contractionIndex_[offset] + || getCombiningClass(nextch) == getCombiningClass(ch)) { + // unmatched or blocked character + skipped.append(nextch); + continue; + } + else { + ce = collator.m_contractionCE_[offset]; + } + + if (ce == CE_NOT_FOUND_) { + break; + } + else if (isContractionTag(ce)) { + // this is a multi-contraction + offset = getContractionOffset(collator, ce); + if (collator.m_contractionCE_[offset] != CE_NOT_FOUND_) { + multicontraction = true; + backupInternalState(backup); + } + } + else { + setDiscontiguous(skipped); + return ce; + } + } + updateInternalState(backup); + return collator.m_contractionCE_[entryoffset]; + } + + /** + * Gets the next contraction ce + * @param collator collator to use + * @param ce current ce + * @param entrybackup entry backup iterator status + */ + private int nextContraction(RuleBasedCollator collator, int ce) + { + Backup backup 
= new Backup(); + backupInternalState(backup); + int entryce = CE_NOT_FOUND_; + while (true) { + int entryoffset = getContractionOffset(collator, ce); + int offset = entryoffset; + + if (isEnd()) { + ce = collator.m_contractionCE_[offset]; + if (ce == CE_NOT_FOUND_) { + // back up the source over all the chars we scanned going + // into this contraction. + ce = entryce; + updateInternalState(backup); + } + break; + } + + // get the discontiguos maximum combining class + byte maxCC = (byte)(collator.m_contractionIndex_[offset] & 0xFF); + // checks if all characters have the same combining class + byte allSame = (byte)(collator.m_contractionIndex_[offset] >> 8); + char ch = nextChar(); + offset ++; + while(ch > collator.m_contractionIndex_[offset]) { + // contraction characters are ordered, skip all smaller + offset ++; + } + + if (ch == collator.m_contractionIndex_[offset]) { + // Found the source string char in the contraction table. + // Pick up the corresponding CE from the table. + ce = collator.m_contractionCE_[offset]; + } + else + { + // Source string char was not in contraction table. + // Unless it is a discontiguous contraction, we are done + byte sCC; + if (maxCC == 0 || (sCC = (byte)getCombiningClass(ch)) == 0 + || sCC > maxCC || (allSame != 0 && sCC == maxCC) || + isEnd()) { + // Contraction can not be discontiguous, back up by one + previousChar(); + ce = collator.m_contractionCE_[entryoffset]; + } + else { + // Contraction is possibly discontiguous. 
+ // find the next character if ch is not a base character + char nextch = nextChar(); + if (nextch != CharacterIterator.DONE) { + previousChar(); + } + if (getCombiningClass(nextch) == 0) { + previousChar(); + // base character not part of discontiguous contraction + ce = collator.m_contractionCE_[entryoffset]; + } + else { + ce = nextDiscontiguous(collator, entryoffset); + } + } + } + + if (ce == CE_NOT_FOUND_) { + // source did not match the contraction, revert back original + updateInternalState(backup); + ce = entryce; + break; + } + + // source was a contraction + if (!isContractionTag(ce)) { + break; + } + + // ccontinue looping to check for the remaining contraction. + if (collator.m_contractionCE_[entryoffset] != CE_NOT_FOUND_) { + // there are further contractions to be performed, so we store + // the so-far completed ce, so that if we fail in the next + // round we just return this one. + entryce = collator.m_contractionCE_[entryoffset]; + backupInternalState(backup); + if (backup.m_bufferOffset_ >= 0) { + backup.m_bufferOffset_ --; + } + else { + backup.m_offset_ --; + } + } + } + return ce; + } + + /** + * Gets the next ce for long primaries, stuffs the rest of the collation + * elements into the ce buffer + * @param ce current ce + * @return next ce + */ + private int nextLongPrimary(int ce) + { + m_CEBuffer_[1] = ((ce & 0xFF) << 24) + | RuleBasedCollator.CE_CONTINUATION_MARKER_; + m_CEBufferOffset_ = 1; + m_CEBufferSize_ = 2; + m_CEBuffer_[0] = ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) | + CE_BYTE_COMMON_; + return m_CEBuffer_[0]; + } + + /** + * Gets the number of expansion + * @param ce current ce + * @return number of expansion + */ + private int getExpansionCount(int ce) + { + return ce & 0xF; + } + + /** + * Gets the next expansion ce and stuffs the rest of the collation elements + * into the ce buffer + * @param collator current collator + * @param ce current ce + * @return next expansion ce + */ + private int 
nextExpansion(RuleBasedCollator collator, int ce) + { + // NOTE: we can encounter both continuations and expansions in an + // expansion! + // I have to decide where continuations are going to be dealt with + int offset = getExpansionOffset(collator, ce); + m_CEBufferSize_ = getExpansionCount(ce); + m_CEBufferOffset_ = 1; + m_CEBuffer_[0] = collator.m_expansion_[offset]; + if (m_CEBufferSize_ != 0) { + // if there are less than 16 elements in expansion + for (int i = 1; i < m_CEBufferSize_; i ++) { + m_CEBuffer_[i] = collator.m_expansion_[offset + i]; + } + } + else { + // ce are terminated + m_CEBufferSize_ = 1; + while (collator.m_expansion_[offset] != 0) { + m_CEBuffer_[m_CEBufferSize_ ++] = + collator.m_expansion_[++ offset]; + } + } + return m_CEBuffer_[0]; + } + + /** + * Gets the next implicit ce for codepoints + * @param codepoint current codepoint + * @param fixupoffset an offset to calculate the implicit ce + * @return implicit ce + */ + private int nextImplicit(int codepoint, int fixupoffset) + { + if ((codepoint & 0xFFFE) == 0xFFFE + || (0xD800 <= codepoint && codepoint <= 0xDC00)) { + // illegal code value, use completely ignoreable! + return IGNORABLE; + } + // we must skip all 00, 01, 02 bytes, so most bytes have 253 values + // we must leave a gap of 01 between all values of the last byte, so + // the last byte has 126 values (3 byte case) + // shift so that HAN all has the same first primary, for compression. + // for the 4 byte case, we make the gap as large as we can fit. 
+ // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1) + // Four byte forms (most supplementaries) are EF xx xx xx + // (with a gap of LAST2_MULTIPLIER == 14) + int last0 = codepoint - IMPLICIT_BOUNDARY_; + int result = 0; + if (last0 < 0) { + // shift so HAN shares single block + codepoint += IMPLICIT_HAN_SHIFT_; + int last1 = codepoint / IMPLICIT_LAST_COUNT_; + last0 = codepoint % IMPLICIT_LAST_COUNT_; + int last2 = last1 / IMPLICIT_OTHER_COUNT_; + last1 %= IMPLICIT_OTHER_COUNT_; + result = 0xEC030300 - fixupoffset + (last2 << 24) + (last1 << 16) + + (last0 << 9); + } + else { + int last1 = last0 / IMPLICIT_LAST_COUNT2_; + last0 %= IMPLICIT_LAST_COUNT2_; + int last2 = last1 / IMPLICIT_OTHER_COUNT_; + last1 %= IMPLICIT_OTHER_COUNT_; + result = 0xEF030303 - fixupoffset + (last2 << 16) + (last1 << 8) + + (last0 * IMPLICIT_LAST2_MULTIPLIER_); + } + m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_) + | 0x00000505; + m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0; + m_CEBufferOffset_ = 1; + m_CEBufferSize_ = 2; + return m_CEBuffer_[0]; + } + + /** + * Returns the next ce associated with the following surrogate characters + * @param ch current character + * @return ce + */ + private int nextSurrogate(char ch) + { + char nextch = nextChar(); + if (nextch != CharacterIterator.DONE && + UTF16.isTrailSurrogate(nextch)) { + int codepoint = UCharacterProperty.getRawSupplementary(ch, nextch); + if ((codepoint >= 0x20000 && codepoint <= 0x2a6d6) + || (codepoint >= 0x2F800 && codepoint <= 0x2FA1D)) { + // this might be a CJK supplementary cp + return nextImplicit(codepoint, 0x04000000); + } + // or a regular one + return nextImplicit(codepoint, 0); + } + if (nextch != CharacterIterator.DONE) { + previousChar(); // reverts back to the original position + } + return IGNORABLE; // completely ignorable + } + + /** + * Returns the next ce for a hangul character, this is an implicit + * calculation + * @param collator current collator + * 
@param ch current character + * @return hangul ce + */ + private int nextHangul(RuleBasedCollator collator, char ch) + { + char L = (char)(ch - HANGUL_SBASE_); + + // divide into pieces + // do it in this order since some compilers can do % and / in one + // operation + char T = (char)(L % HANGUL_TCOUNT_); + L /= HANGUL_TCOUNT_; + char V = (char)(L % HANGUL_VCOUNT_); + L /= HANGUL_VCOUNT_; + + // offset them + L += HANGUL_LBASE_; + V += HANGUL_VBASE_; + T += HANGUL_TBASE_; + + // return the first CE, but first put the rest into the expansion + // buffer + m_CEBufferSize_ = 0; + if (!collator.m_isJamoSpecial_) { // FAST PATH + m_CEBuffer_[m_CEBufferSize_ ++] = + collator.UCA_.m_trie_.getLeadValue(L); + m_CEBuffer_[m_CEBufferSize_ ++] = + collator.UCA_.m_trie_.getLeadValue(V); + + if (T != HANGUL_TBASE_) { + m_CEBuffer_[m_CEBufferSize_ ++] = + collator.UCA_.m_trie_.getLeadValue(T); + } + m_CEBufferOffset_ = 1; + return m_CEBuffer_[0]; + } + else { + // Jamo is Special + // Since Hanguls pass the FCD check, it is guaranteed that we + // won't be in the normalization buffer if something like this + // happens + // Move Jamos into normalization buffer + m_buffer_.append((char)L); + m_buffer_.append((char)V); + if (T != HANGUL_TBASE_) { + m_buffer_.append((char)T); + } + m_FCDLimit_ = m_source_.getIndex(); + m_FCDStart_ = m_FCDLimit_ - 1; + // Indicate where to continue in main input string after + // exhausting the buffer + return IGNORABLE; + } + } + + /** + *

Special CE management. Expansions, contractions etc...

+ * @param collator can be plain UCA + * @param ce current ce + * @param ch current character + * @return next special ce + */ + private int nextSpecial(RuleBasedCollator collator, int ce, char ch) + { + int codepoint = ch; + Backup entrybackup = new Backup(); + backupInternalState(entrybackup); + while (true) { + // This loop will repeat only in the case of contractions, + // surrogate + switch(RuleBasedCollator.getTag(ce)) { + case CE_NOT_FOUND_TAG_: + // impossible case for icu4j + return ce; + case RuleBasedCollator.CE_SURROGATE_TAG_: + if (isEnd()) { + return IGNORABLE; + } + backupInternalState(m_backup_); + char trail = nextChar(); + ce = nextSurrogate(collator, ce, trail); + // calculate the supplementary code point value, + // if surrogate was not tailored we go one more round + codepoint = + UCharacterProperty.getRawSupplementary(ch, trail); + break; + case CE_THAI_TAG_: + ce = nextThai(collator, ce, ch); + break; + case CE_SPEC_PROC_TAG_: + ce = nextSpecialPrefix(collator, ce, entrybackup); + break; + case CE_CONTRACTION_TAG_: + ce = nextContraction(collator, ce); + break; + case CE_LONG_PRIMARY_TAG_: + return nextLongPrimary(ce); + case CE_EXPANSION_TAG_: + return nextExpansion(collator, ce); + // various implicits optimization + case CE_CJK_IMPLICIT_TAG_: + // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D + return nextImplicit(codepoint, 0x04000000); + case CE_IMPLICIT_TAG_: // everything that is not defined + return nextImplicit(codepoint, 0); + case CE_TRAIL_SURROGATE_TAG_: + return IGNORABLE; // DC00-DFFF broken surrogate + case CE_LEAD_SURROGATE_TAG_: // D800-DBFF + return nextSurrogate(ch); + case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF + return nextHangul(collator, ch); + case CE_CHARSET_TAG_: + // not yet implemented probably after 1.8 + return CE_NOT_FOUND_; + default: + ce = IGNORABLE; + // synwee todo, throw exception or something here. 
+ } + if (!RuleBasedCollator.isSpecial(ce)) { + break; + } + } + return ce; + } + + /** + * Getting the previous Thai ce + * @param collator current collator + * @param ch current character + * @return previous Thai ce + */ + private int previousThai(RuleBasedCollator collator, int ce, char ch) + { + char prevch = previousChar(); + if (isBackwardsStart() || !isThaiBaseConsonant(ch) + || !isThaiPreVowel(prevch)) { + if (prevch != CharacterIterator.DONE) { + nextChar(); + } + // Treat Thai as a length one expansion + return collator.m_expansion_[getExpansionOffset(collator, ce)]; + } + else + { + // Move the prevowel and the following base Consonant into the + // normalization buffer with their order swapped + // buffer is always clean when we are in the source string + m_buffer_.append(ch); + m_buffer_.append(prevch); + m_bufferOffset_ = 2; + + if (m_source_.getIndex() == 0) { + m_FCDStart_ = 0; + m_FCDLimit_ = 2; + } + else { + m_FCDStart_ = m_source_.getIndex(); + m_FCDLimit_ = m_FCDStart_ + 2; + } + + return IGNORABLE; + } + } + + /** + * Special processing is getting a CE that is preceded by a certain prefix. + * Currently this is only needed for optimizing Japanese length and + * iteration marks. When we encouter a special processing tag, we go + * backwards and try to see if we have a match. Contraction tables are used + * - so the whole process is not unlike contraction. prefix data is stored + * backwards in the table. 
+ * @param collator current collator + * @param ce current ce + * @return previous ce + */ + private int previousSpecialPrefix(RuleBasedCollator collator, int ce) + { + Backup backup = new Backup(); + backupInternalState(backup); + while (true) { + // position ourselves at the begining of contraction sequence + int offset = getContractionOffset(collator, ce); + int entryoffset = offset; + if (isBackwardsStart()) { + ce = collator.m_contractionCE_[offset]; + break; + } + char prevch = previousChar(); + while (prevch > collator.m_contractionIndex_[offset]) { + // since contraction codepoints are ordered, we skip all that + // are smaller + offset ++; + } + if (prevch == collator.m_contractionIndex_[offset]) { + ce = collator.m_contractionCE_[offset]; + } + else { + // char was not in the table. prefix not found + ce = collator.m_contractionCE_[entryoffset]; + } + + if (!isSpecialPrefixTag(ce)) { + // char was in the contraction table, and the corresponding ce + // is not a prefix ce. We found the prefix, break out of loop, + // this ce will end up being returned. + break; + } + } + updateInternalState(backup); + return ce; + } + + /** + * Retrieves the previous contraction ce. To ensure that the backwards and + * forwards iteration matches, we take the current region of most possible + * match and pass it through the forward iteration. This will ensure that + * the obstinate problem of overlapping contractions will not occur. 
+ * @param collator current collator + * @param ce current ce + * @param ch current character + * @return previous contraction ce + */ + private int previousContraction(RuleBasedCollator collator, int ce, char ch) + { + int entryoffset = getContractionOffset(collator, ce); + if (isBackwardsStart()) { + // start of string or this is not the end of any contraction + return collator.m_contractionCE_[entryoffset]; + } + StringBuffer buffer = new StringBuffer(); + while (collator.isUnsafe(ch)) { + buffer.insert(0, ch); + ch = previousChar(); + if (isBackwardsStart()) { + break; + } + } + // adds the initial base character to the string + buffer.insert(0, ch); + // a new collation element iterator is used to simply things, since + // using the current collation element iterator will mean that the + // forward and backwards iteration will share and change the same + // buffers. it is going to be painful. + CollationElementIterator temp = + new CollationElementIterator(buffer.toString(), collator); + ce = temp.next(); + m_CEBufferSize_ = 0; + while (ce != NULLORDER) { + if (m_CEBufferSize_ == m_CEBuffer_.length) { + try { + int tempbuffer[] = new int[m_CEBuffer_.length + 50]; + System.arraycopy(m_CEBuffer_, 0, tempbuffer, 0, + m_CEBuffer_.length); + m_CEBuffer_ = tempbuffer; + } + catch (Exception e) + { + e.printStackTrace(); + return NULLORDER; + } + } + m_CEBuffer_[m_CEBufferSize_ ++] = ce; + ce = temp.next(); + } + + m_CEBufferOffset_ = m_CEBufferSize_ - 1; + return m_CEBuffer_[m_CEBufferOffset_]; + } + + /** + * Returns the previous long primary ces + * @param ce long primary ce + * @return previous long primary ces + */ + private int previousLongPrimary(int ce) + { + m_CEBufferSize_ = 0; + m_CEBuffer_[m_CEBufferSize_ ++] = + ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) | CE_BYTE_COMMON_; + m_CEBuffer_[m_CEBufferSize_ ++] = ((ce & 0xFF) << 24) + | RuleBasedCollator.CE_CONTINUATION_MARKER_; + m_CEBufferOffset_ = m_CEBufferSize_ - 1; + return 
m_CEBuffer_[m_CEBufferOffset_]; + } + + /** + * Returns the previous expansion ces + * @param collator current collator + * @param ce current ce + * @return previous expansion ce + */ + private int previousExpansion(RuleBasedCollator collator, int ce) + { + // find the offset to expansion table + int offset = getExpansionOffset(collator, ce); + m_CEBufferSize_ = getExpansionCount(ce); + if (m_CEBufferSize_ != 0) { + // less than 16 elements in expansion + for (int i = 0; i < m_CEBufferSize_; i ++) { + m_CEBuffer_[i] = collator.m_expansion_[offset + i]; + } + + } + else { + // null terminated ces + while (collator.m_expansion_[offset + m_CEBufferSize_] != 0) { + m_CEBuffer_[m_CEBufferSize_] = + collator.m_expansion_[offset + m_CEBufferSize_]; + m_CEBufferSize_ ++; + } + } + m_CEBufferOffset_ = m_CEBufferSize_ - 1; + return m_CEBuffer_[m_CEBufferOffset_]; + } + + /** + * Returns previous hangul ces + * @param collator current collator + * @param ch current character + * @return previous hangul ce + */ + private int previousHangul(RuleBasedCollator collator, char ch) + { + char L = (char)(ch - HANGUL_SBASE_); + // we do it in this order since some compilers can do % and / in one + // operation + char T = (char)(L % HANGUL_TCOUNT_); + L /= HANGUL_TCOUNT_; + char V = (char)(L % HANGUL_VCOUNT_); + L /= HANGUL_VCOUNT_; + + // offset them + L += HANGUL_LBASE_; + V += HANGUL_VBASE_; + T += HANGUL_TBASE_; + + m_CEBufferSize_ = 0; + if (!collator.m_isJamoSpecial_) { + m_CEBuffer_[m_CEBufferSize_ ++] = + collator.UCA_.m_trie_.getLeadValue(L); + m_CEBuffer_[m_CEBufferSize_ ++] = + collator.UCA_.m_trie_.getLeadValue(V); + if (T != HANGUL_TBASE_) { + m_CEBuffer_[m_CEBufferSize_ ++] = + collator.UCA_.m_trie_.getLeadValue(T); + } + m_CEBufferOffset_ = m_CEBufferSize_ - 1; + return m_CEBuffer_[m_CEBufferOffset_]; + } + else { + // Since Hanguls pass the FCD check, it is guaranteed that we won't + // be in the normalization buffer if something like this happens + // Move Jamos into 
normalization buffer + m_buffer_.append(L); + m_buffer_.append(V); + if (T != HANGUL_TBASE_) { + m_buffer_.append(T); + } + + m_FCDStart_ = m_source_.getIndex(); + m_FCDLimit_ = m_FCDStart_ + 1; + return IGNORABLE; + } + } + + /** + * Gets implicit codepoint ces + * @param codepoint current codepoint + * @param fixupoffset offset to shift ces for han + * @return implicit codepoint ces + */ + private int previousImplicit(int codepoint, int fixupoffset) + { + if ((codepoint & 0xFFFE) == 0xFFFE + || (0xD800 <= codepoint && codepoint <= 0xDC00)) { + return IGNORABLE; // illegal code value, completely ignoreable! + } + // we must skip all 00, 01, 02 bytes, so most bytes have 253 values + // we must leave a gap of 01 between all values of the last byte, so + // the last byte has 126 values (3 byte case) + // we shift so that HAN all has the same first primary, for + // compression. + // for the 4 byte case, we make the gap as large as we can fit. + // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1) + // Four byte forms (most supplementaries) are EF xx xx xx (with a gap + // of LAST2_MULTIPLIER == 14) + int last0 = codepoint - IMPLICIT_BOUNDARY_; + int result = 0; + + if (last0 < 0) { + // shift HAN to share single block + codepoint += IMPLICIT_HAN_SHIFT_; + int last1 = codepoint / IMPLICIT_LAST_COUNT_; + last0 = codepoint % IMPLICIT_LAST_COUNT_; + int last2 = last1 / IMPLICIT_OTHER_COUNT_; + last1 %= IMPLICIT_OTHER_COUNT_; + result = 0xEC030300 - fixupoffset + (last2 << 24) + (last1 << 16) + + (last0 << 9); + } + else { + int last1 = last0 / IMPLICIT_LAST_COUNT2_; + last0 %= IMPLICIT_LAST_COUNT2_; + int last2 = last1 / IMPLICIT_OTHER_COUNT_; + last1 %= IMPLICIT_OTHER_COUNT_; + result = 0xEF030303 - fixupoffset + (last2 << 16) + (last1 << 8) + + (last0 * IMPLICIT_LAST2_MULTIPLIER_); + } + m_CEBufferSize_ = 2; + m_CEBufferOffset_ = 1; + m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_) + | 0x00000505; + m_CEBuffer_[1] = ((result & 
0x0000FFFF) << 16) | 0x000000C0; + return m_CEBuffer_[1]; + } + + /** + * Gets the previous surrogate ce + * @param ch current character + * @return previous surrogate ce + */ + private int previousSurrogate(char ch) + { + if (isBackwardsStart()) { + // we are at the start of the string, wrong place to be at + return IGNORABLE; + } + char prevch = previousChar(); + // Handles Han and Supplementary characters here. + if (UTF16.isLeadSurrogate(prevch)) { + return previousImplicit( + UCharacterProperty.getRawSupplementary(prevch, ch), 0); + } + if (prevch != CharacterIterator.DONE) { + nextChar(); + } + return IGNORABLE; // completely ignorable + } + + /** + *

Special CE management. Expansions, contractions etc...

+ * @param collator can be plain UCA + * @param ce current ce + * @param ch current character + * @return previous special ce + */ + private int previousSpecial(RuleBasedCollator collator, int ce, char ch) + { + while(true) { + // the only ces that loops are thai, special prefix and + // contractions + switch (RuleBasedCollator.getTag(ce)) { + case CE_NOT_FOUND_TAG_: // this tag always returns + return ce; + case RuleBasedCollator.CE_SURROGATE_TAG_: + // essentialy a disengaged lead surrogate. a broken + // sequence was encountered and this is an error + return IGNORABLE; + case CE_THAI_TAG_: + ce = previousThai(collator, ce, ch); + break; + case CE_SPEC_PROC_TAG_: + ce = previousSpecialPrefix(collator, ce); + break; + case CE_CONTRACTION_TAG_: + return previousContraction(collator, ce, ch); + case CE_LONG_PRIMARY_TAG_: + return previousLongPrimary(ce); + case CE_EXPANSION_TAG_: // always returns + return previousExpansion(collator, ce); + case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF + return previousHangul(collator, ch); + case CE_LEAD_SURROGATE_TAG_: // D800-DBFF + return IGNORABLE; // broken surrogate sequence + case CE_TRAIL_SURROGATE_TAG_: // DC00-DFFF + return previousSurrogate(ch); + case CE_CJK_IMPLICIT_TAG_: + // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ + return previousImplicit(ch, 0x04000000); + case CE_IMPLICIT_TAG_: // everything that is not defined + // UCA is filled with these. Tailorings are NOT_FOUND + return previousImplicit(ch, 0); + case CE_CHARSET_TAG_: // this tag always returns + return CE_NOT_FOUND_; + default: + // this tag always returns + ce = IGNORABLE; + // synwee todo, throw exception or something here. 
+ } + if (!RuleBasedCollator.isSpecial(ce)) { + break; + } + } + return ce; + } +} diff --git a/icu4j/src/com/ibm/icu/text/CollationKey.java b/icu4j/src/com/ibm/icu/text/CollationKey.java new file mode 100755 index 00000000000..1385431f3d2 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/CollationKey.java @@ -0,0 +1,260 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2002, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollationKey.java,v $ +* $Date: 2002/05/14 16:48:49 $ +* $Revision: 1.4 $ +* +******************************************************************************* +*/ +package com.ibm.icu.text; + +import java.util.Arrays; + +/** + *

A CollationKey represents a String under the + * rules of a specific Collator object. Comparing two + * CollationKeys returns the relative order of the + * Strings they represent. Using CollationKeys to + * compare Strings is generally faster than using + * Collator.compare. Thus, when the Strings must be + * compared multiple times, for example when sorting a list of + * Strings, it is more efficient to use CollationKeys. + *

+ *

You cannot create CollationKeys directly. Rather, generate + * them by calling Collator.getCollationKey(String). You can only + * compare CollationKeys generated from the same + * Collator object.

+ *

Generating a CollationKey for a String + * involves examining the entire String and converting it to + * series of bits that can be compared bitwise. This allows fast comparisons + * once the keys are generated. The cost of generating keys is recouped in + * faster comparisons when Strings need to be compared many + * times. On the other hand, the result of a comparison is often determined by + * the first couple of characters of each String. + * Collator.compare(String, String) examines only as many characters as it needs + * which allows it to be faster when doing single comparisons.

+ *

The following example shows how CollationKeys might be used + * to sort a list of Strings.

+ *
+ *
+ * // Create an array of CollationKeys for the Strings to be sorted.
+ * Collator myCollator = Collator.getInstance();
+ * CollationKey[] keys = new CollationKey[3];
+ * keys[0] = myCollator.getCollationKey("Tom");
+ * keys[1] = myCollator.getCollationKey("Dick");
+ * keys[2] = myCollator.getCollationKey("Harry");
+ * sort( keys );
+ * 
+ * //... + *
+ * // Inside body of sort routine, compare keys this way + * if( keys[i].compareTo( keys[j] ) > 0 ) + * // swap keys[i] and keys[j] + *
+ * //... + *
+ * // Finally, when we've returned from sort. + * System.out.println( keys[0].getSourceString() ); + * System.out.println( keys[1].getSourceString() ); + * System.out.println( keys[2].getSourceString() ); + *
+ *
+ * + * @see Collator + * @see RuleBasedCollator + * @author Syn Wee Quek + * @since release 2.2, April 18 2002 + * @draft 2.2 + */ +public final class CollationKey implements Comparable +{ + // public methods ------------------------------------------------------- + + // public getters ------------------------------------------------------- + + /** + * Returns the String that this CollationKey represents. + * @return source string that this CollationKey represents + * @draft 2.2 + */ + public String getSourceString() + { + return m_source_; + } + + /** + *

Duplicates and returns the value of this CollationKey as a sequence + * of big-endian bytes.

+ *

If two CollationKeys could be legitimately compared, then one could + * compare the byte arrays of each to obtain the same result.

+ * @return CollationKey value in a sequence of big-endian byte bytes. + * @draft 2.2 + */ + public byte[] toByteArray() + { + int length = 0; + while (true) { + if (m_key_[length] == 0) { + break; + } + length ++; + } + length ++; + byte result[] = new byte[length]; + System.arraycopy(m_key_, 0, result, 0, length); + return result; + } + + // public other methods ------------------------------------------------- + + /** + *

Compare this CollationKey to the target CollationKey. The collation + * rules of the Collator object which created these keys are applied.

+ *

Note: CollationKeys created by different Collators + * cannot be compared.

+ * @param target target CollationKey + * @return an integer value, if value is less than zero this CollationKey + * is less than target, if value is zero they are equal + * and value is greater than zero if this CollationKey is greater + * than target. + * @see Collator#compare(String, String) + * @draft 2.2 + */ + public int compareTo(CollationKey target) + { + int i = 0; + while (m_key_[i] != 0 && target.m_key_[i] != 0) { + int key = m_key_[i] & 0xFF; + int targetkey = target.m_key_[i] & 0xFF; + if (key < targetkey) { + return -1; + } + if (targetkey < key) { + return 1; + } + i ++; + } + // last comparison if we encounter a 0 + int key = m_key_[i] & 0xFF; + int targetkey = target.m_key_[i] & 0xFF; + if (key < targetkey) { + return -1; + } + if (targetkey < key) { + return 1; + } + return 0; + } + + /** + *

Compares this CollationKey with the specified Object.

+ * @param obj the Object to be compared. + * @return Returns a negative integer, zero, or a positive integer + * respectively if this CollationKey is less than, equal to, or + * greater than the given Object. + * @exception ClassCastException thrown when the specified Object is not a + * CollationKey. + * @see #compareTo(CollationKey) + * @draft 2.2 + */ + public int compareTo(Object obj) + { + return compareTo((CollationKey)obj); + } + + /** + *

Compare this CollationKey and the target CollationKey for equality. + *

+ *

The collation rules of the Collator object which created these keys + * are applied.

+ *

Note: CollationKeys created by different Collators + * cannot be compared.

+ * @param target the CollationKey to compare to. + * @return true if two objects are equal, false otherwise. + * @draft 2.2 + */ + public boolean equals(Object target) + { + if (this == target) { + return true; + } + if (target == null || !(target instanceof CollationKey)) { + return false; + } + CollationKey other = (CollationKey)target; + int i = 0; + while (true) { + if (m_key_[i] != other.m_key_[i]) { + return false; + } + if (m_key_[i] == 0) { + break; + } + i ++; + } + return true; + } + + /** + *

Creates a hash code for this CollationKey. The hash value is + * calculated on the key itself, not the String from which the key was + * created. Thus if x and y are CollationKeys, then + * x.hashCode() == y.hashCode() if x.equals(y) is true. This allows + * language-sensitive comparison in a hash table.

+ *

See the CollationKey class description for an example.

+ * @return the hash value. + * @draft 2.2 + */ + public int hashCode() + { + if (m_hashCode_ == 0) { + int size = m_key_.length >> 1; + StringBuffer key = new StringBuffer(size); + int i = 0; + while (m_key_[i] != 0 && m_key_[i + 1] != 0) { + key.append((m_key_[i] << 8) | m_key_[i + 1]); + i += 2; + } + if (m_key_[i] != 0) { + key.append(m_key_[i] << 8); + } + m_hashCode_ = key.hashCode(); + } + return m_hashCode_; + } + + // protected constructor ------------------------------------------------ + + /** + * Protected CollationKey can only be generated by Collator objects + * @param source string the CollationKey represents + * @param key sort key array of bytes + * @param size of sort key + * @draft 2v2 + */ + CollationKey(String source, byte key[]) + { + m_source_ = source; + m_key_ = key; + m_hashCode_ = 0; + } + + // private data members ------------------------------------------------- + + /** + * Source string this CollationKey represents + */ + private String m_source_; + /** + * Sequence of bytes that represents the sort key + */ + private byte m_key_[]; + /** + * Hash code for the key + */ + private int m_hashCode_; +} \ No newline at end of file diff --git a/icu4j/src/com/ibm/icu/text/Collator.java b/icu4j/src/com/ibm/icu/text/Collator.java new file mode 100755 index 00000000000..993f0127c29 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/Collator.java @@ -0,0 +1,454 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2002, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Collator.java,v $ +* $Date: 2002/05/14 16:48:49 $ +* $Revision: 1.4 $ +* +******************************************************************************* +*/ +package com.ibm.icu.text; + +import java.util.Locale; + +/** +*

The Collator class performs locale-sensitive String comparison. +* You use this class to build searching and sorting routines for natural +* language text.

+*

Collator is an abstract base class. Subclasses implement specific +* collation strategies. One subclass, RuleBasedCollator, is currently +* provided and is applicable to a wide set of languages. Other subclasses +* may be created to handle more specialized needs.

+*

Like other locale-sensitive classes, you can use the static factory +* method, getInstance, to obtain the appropriate Collator object for a given +* locale. You will only need to look at the subclasses of Collator if you need +* to understand the details of a particular collation strategy or if you need +* to modify that strategy.

+*

The following example shows how to compare two strings using the Collator +* for the default locale. +*

+* // Compare two strings in the default locale
+* Collator myCollator = Collator.getInstance();
+* if (myCollator.compare("abc", "ABC") < 0) {
+*     System.out.println("abc is less than ABC");
+* }
+* else {
+*     System.out.println("abc is greater than or equal to ABC");
+* }
+* 
+*

You can set a Collator's strength property to +* determine the level of difference considered significant in comparisons. +* Four strengths are provided: PRIMARY, SECONDARY, +* TERTIARY, and IDENTICAL. The exact assignment of +* strengths to language features is locale dependant. For example, in Czech, +* "e" and "f" are considered primary differences, while "e" and "\u00EA" are +* secondary differences, "e" and "E" are tertiary differences and "e" and "e" +* are identical. The following shows how both case and accents could be +* ignored for US English.

+*
+* //Get the Collator for US English and set its strength to PRIMARY
+* Collator usCollator = Collator.getInstance(Locale.US);
+* usCollator.setStrength(Collator.PRIMARY);
+* if (usCollator.compare("abc", "ABC") == 0) {
+*     System.out.println("Strings are equivalent");
+* }
+* 
+*

For comparing Strings exactly once, the compare method provides the best +* performance. When sorting a list of Strings however, it is generally +* necessary to compare each String multiple times. In this case, +* CollationKeys provide better performance. The CollationKey class converts a +* String to a series of bits that can be compared bitwise against other +* CollationKeys. A CollationKey is created by a Collator object for a given +* String.

+*

Note: CollationKeys from different Collators can not be compared. See the +* class description for CollationKey for an example using CollationKeys. +*

+* @author Syn Wee Quek +* @since release 2.2, April 18 2002 +* @draft 2.2 +*/ + +public abstract class Collator +{ + // public data members --------------------------------------------------- + + /** + * Collator strength value. When set, only PRIMARY differences are + * considered significant during comparison. The assignment of strengths + * to language features is locale dependant. A common example is for + * different base letters ("a" vs "b") to be considered a PRIMARY + * difference. + * @see #setStrength + * @see #getStrength + * @draft 2.2 + */ + public final static int PRIMARY + = RuleBasedCollator.AttributeValue.PRIMARY_; + /** + * Collator strength value. When set, only SECONDARY and above + * differences are considered significant during comparison. The + * assignment of strengths to language features is locale dependant. A + * common example is for different accented forms of the same base letter + * ("a" vs "\u00E4") to be considered a SECONDARY difference. + * @see #setStrength + * @see #getStrength + * @draft 2.2 + */ + public final static int SECONDARY + = RuleBasedCollator.AttributeValue.SECONDARY_; + /** + * Collator strength value. When set, only TERTIARY and above differences + * are considered significant during comparison. The assignment of + * strengths to language features is locale dependant. A common example is + * for case differences ("a" vs "A") to be considered a TERTIARY + * difference. + * @see #setStrength + * @see #getStrength + * @draft 2.2 + */ + public final static int TERTIARY + = RuleBasedCollator.AttributeValue.TERTIARY_; + + /** + * Collator strength value. When set, only QUARTENARY and above differences + * are considered significant during comparison. The assignment of + * strengths to language features is locale dependant. + * difference. + * @see #setStrength + * @see #getStrength + * @draft 2.2 + */ + public final static int QUATERNARY + = RuleBasedCollator.AttributeValue.QUATERNARY_; + + /** + *

Collator strength value. When set, all differences are considered + * significant during comparison. The assignment of strengths to language + * features is locale dependant. A common example is for control + * characters ("\u0001" vs "\u0002") to be considered equal at + * the PRIMARY, SECONDARY, and TERTIARY levels but different at the + * IDENTICAL level. Additionally, differences between pre-composed + * accents such as "\u00C0" (A-grave) and combining accents such as + * "A\u0300" (A, combining-grave) will be considered significant at + * the tertiary level if decomposition is set to NO_DECOMPOSITION. + *

+ *

Note that this value is different from the JDK's value.

+ * @draft 2.2 + */ + public final static int IDENTICAL + = RuleBasedCollator.AttributeValue.IDENTICAL_; + + /** + *

Decomposition mode value. With NO_DECOMPOSITION set, accented + * characters will not be decomposed for collation. This is the default + * setting and provides the fastest collation but will only produce + * correct results for languages that do not use accents.

+ *

Note that this value is different from the JDK's value.

+ * @see #getDecomposition + * @see #setDecomposition + * @draft 2.2 + */ + public final static int NO_DECOMPOSITION + = RuleBasedCollator.AttributeValue.OFF_; + + /** + *

Decomposition mode value. With CANONICAL_DECOMPOSITION set, + * characters that are canonical variants according to Unicode 2.0 will be + * decomposed for collation. This should be used to get correct collation + * of accented characters.

+ *

CANONICAL_DECOMPOSITION corresponds to Normalization Form D as + * described in + * Unicode Technical Report #15.

+ * @see #getDecomposition + * @see #setDecomposition + * @draft 2.2 + */ + public final static int CANONICAL_DECOMPOSITION = 1; + + /** + *

Decomposition mode value. With FULL_DECOMPOSITION set, both Unicode + * canonical variants and Unicode compatibility variants will be + * decomposed for collation. This causes not only accented characters to + * be collated, but also characters that have special formats to be + * collated with their nominal form. For example, the half-width and + * full-width ASCII and Katakana characters are then collated together. + * FULL_DECOMPOSITION is the most complete and therefore the slowest + * decomposition mode.

+ *

+ * FULL_DECOMPOSITION corresponds to Normalization Form KD as described in + * Unicode + * Technical Report #15.

+ * @see #getDecomposition + * @see #setDecomposition + * @draft 2.2 + */ + public final static int FULL_DECOMPOSITION = 2; + + // public methods -------------------------------------------------------- + + // public setters -------------------------------------------------------- + + /** + *

Sets this Collator's strength property. The strength property + * determines the minimum level of difference considered significant + * during comparison.

+ *

See the Collator class description for an example of use.

+ * @param the new strength value. + * @see #getStrength + * @see #PRIMARY + * @see #SECONDARY + * @see #TERTIARY + * @see #IDENTICAL + * @exception IllegalArgumentException If the new strength value is not one of + * PRIMARY, SECONDARY, TERTIARY or IDENTICAL. + * @draft 2.2 + */ + public synchronized void setStrength(int newStrength) { + if ((newStrength != PRIMARY) && + (newStrength != SECONDARY) && + (newStrength != TERTIARY) && + (newStrength != QUATERNARY) && + (newStrength != IDENTICAL)) { + throw new IllegalArgumentException("Incorrect comparison level."); + } + m_strength_ = newStrength; + } + + /** + * Set the decomposition mode of this Collator. See getDecomposition + * for a description of decomposition mode. + * @param decomposition the new decomposition mode + * @see #getDecomposition + * @see #NO_DECOMPOSITION + * @see #CANONICAL_DECOMPOSITION + * @see #FULL_DECOMPOSITION + * @exception IllegalArgumentException If the given value is not a valid decomposition + * mode. + * @draft 2.2 + */ + public synchronized void setDecomposition(int decomposition) { + if ((decomposition != NO_DECOMPOSITION) && + (decomposition != CANONICAL_DECOMPOSITION) && + (decomposition != FULL_DECOMPOSITION)) { + throw new IllegalArgumentException("Wrong decomposition mode."); + } + if (decomposition != NO_DECOMPOSITION) { + m_decomposition_ = decomposition; + } + else { + m_decomposition_ = CANONICAL_DECOMPOSITION; + } + } + + // public getters -------------------------------------------------------- + + /** + * Gets the Collator for the current default locale. + * The default locale is determined by java.util.Locale.getDefault(). + * @return the Collator for the default locale (for example, en_US) if it + * is created successfully, otherwise if there is a failure, + * null will be returned. 
+ * @see java.util.Locale#getDefault + * @draft 2.2 + */ + public static final Collator getInstance() + { + return getInstance(Locale.getDefault()); + } + + /** + * Gets the Collator for the desired locale. + * @param locale the desired locale. + * @return Collator for the desired locale if it is created successfully, + * otherwise if there is a failure, the default UCA collator will + * be returned. + * @see java.util.Locale + * @see java.util.ResourceBundle + * @draft 2.2 + */ + public static final Collator getInstance(Locale locale) + { + try { + return new RuleBasedCollator(locale); + } + catch(Exception e) { + return RuleBasedCollator.UCA_; + } + } + + /** + *

Returns this Collator's strength property. The strength property + * determines the minimum level of difference considered significant + * during comparison.

+ *

See the Collator class description for an example of use.

+ * @return this Collator's current strength property. + * @see #setStrength + * @see #PRIMARY + * @see #SECONDARY + * @see #TERTIARY + * @see #IDENTICAL + * @draft 2.2 + */ + public int getStrength() + { + return m_strength_; + } + + /** + *

Get the decomposition mode of this Collator. Decomposition mode + * determines how Unicode composed characters are handled. Adjusting + * decomposition mode allows the user to select between faster and more + * complete collation behavior. + *

The three values for decomposition mode are: + *

+ * See the documentation for these three constants for a description + * of their meaning. + *

+ * @return the decomposition mode + * @see #setDecomposition + * @see #NO_DECOMPOSITION + * @see #CANONICAL_DECOMPOSITION + * @see #FULL_DECOMPOSITION + * @draft 2.2 + */ + public int getDecomposition() + { + return m_decomposition_; + } + + // public other methods ------------------------------------------------- + + /** + * Convenience method for comparing the equality of two strings based on + * this Collator's collation rules. + * @param source the source string to be compared with. + * @param target the target string to be compared with. + * @return true if the strings are equal according to the collation + * rules. false, otherwise. + * @see #compare + * @draft 2.2 + */ + public boolean equals(String source, String target) + { + return (compare(source, target) == 0); + } + + /** + * Cloning this Collator. + * @return a cloned Collator of this object + * @draft 2.2 + */ + public Object clone() + { + try { + return (Collator)super.clone(); + } catch (CloneNotSupportedException e) { + throw new InternalError(); + } + } + + /** + * Compares the equality of two Collators. + * @param that the Collator to be compared with this. + * @return true if this Collator is the same as that Collator; + * false otherwise. + * @draft 2.2 + */ + public boolean equals(Object that) + { + if (this == that) { + return true; + } + if (that == null || getClass() != that.getClass()) { + return false; + } + Collator other = (Collator) that; + return ((m_strength_ == other.m_strength_) && + (m_decomposition_ == other.m_decomposition_)); + } + + // public abstract methods ----------------------------------------------- + + /** + * Generates the hash code for this Collator. + * @draft 2.2 + */ + public abstract int hashCode(); + + /** + *

Compares the source string to the target string according to the + * collation rules for this Collator. Returns an integer less than, equal + * to or greater than zero depending on whether the source String is less + * than, equal to or greater than the target string. See the Collator + * class description for an example of use.

+ *

For a one time comparison, this method has the best performance. If + * a given String will be involved in multiple comparisons, + * CollationKey.compareTo() has the best performance. See the Collator + * class description for an example using CollationKeys.

+ * @param source the source string. + * @param target the target string. + * @return Returns an integer value. Value is less than zero if source is + * less than target, value is zero if source and target are equal, + * value is greater than zero if source is greater than target. + * @see CollationKey + * @see #getCollationKey + * @draft 2.2 + */ + public abstract int compare(String source, String target); + + /** + *

Transforms the String into a series of bits that can be compared + * bitwise to other CollationKeys. CollationKeys provide better + * performance than Collator.compare() when Strings are involved in + * multiple comparisons.

+ *

See the Collator class description for an example using + * CollationKeys.

+ * @param source the string to be transformed into a collation key. + * @return the CollationKey for the given String based on this Collator's + * collation rules. If the source String is null, a null + * CollationKey is returned. + * @see CollationKey + * @see #compare(String, String) + * @draft 2.2 + */ + public abstract CollationKey getCollationKey(String source); + + // protected data members ------------------------------------------------ + + /** + * Collation strength + */ + protected int m_strength_; + /** + * Decomposition mode + */ + protected int m_decomposition_; + + // protected constructor ------------------------------------------------- + + /** + *

Protected constructor for use by subclasses. + * Public access to creating Collators is handled by the API getInstance(). + *

+ * @draft 2.2 + */ + protected Collator() throws Exception + { + m_strength_ = TERTIARY; + m_decomposition_ = CANONICAL_DECOMPOSITION; + } + + // protected methods ----------------------------------------------------- + + // private variables ----------------------------------------------------- + + // private methods ------------------------------------------------------- +} + diff --git a/icu4j/src/com/ibm/icu/text/CollatorReader.java b/icu4j/src/com/ibm/icu/text/CollatorReader.java new file mode 100644 index 00000000000..110ba7bffcd --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/CollatorReader.java @@ -0,0 +1,284 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2002, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollatorReader.java,v $ +* $Date: 2002/05/14 16:48:49 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ +package com.ibm.icu.text; + +import java.io.InputStream; +import java.io.DataInputStream; +import java.io.IOException; +import com.ibm.icu.impl.ICUBinary; +import com.ibm.icu.impl.IntTrie; + +/** +*

Internal reader class for ICU data file uca.dat containing +* Unicode Collation Algorithm data.

+*

This class simply reads uca.dat, authenticates that it is a valid +* ICU data file and splits its contents up into blocks of data for use in +* com.ibm.icu.text.Collator. +*

+*

uca.dat, which is in big-endian format, is jarred together with this +* package.

+* @author Syn Wee Quek +* @since release 2.2, April 18 2002 +* @draft 2.2 +*/ + +final class CollatorReader +{ + // protected constructor --------------------------------------------- + + /** + *

Protected constructor.

+ * @param inputStream ICU uprop.dat file input stream + * @exception IOException throw if data file fails authentication + * @draft 2.1 + */ + protected CollatorReader(InputStream inputStream) throws IOException + { + ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_, + DATA_FORMAT_VERSION_, UNICODE_VERSION_); + m_dataInputStream_ = new DataInputStream(inputStream); + } + + /** + *

Protected constructor.

+ * @param inputStream ICU uprop.dat file input stream + * @param readICUHeader flag to indicate if the ICU header has to be read + * @exception IOException throw if data file fails authentication + * @draft 2.1 + */ + protected CollatorReader(InputStream inputStream, boolean readICUHeader) + throws IOException + { + if (readICUHeader) { + ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_, + DATA_FORMAT_VERSION_, UNICODE_VERSION_); + } + m_dataInputStream_ = new DataInputStream(inputStream); + } + + // protected methods ------------------------------------------------- + + /** + * Read and break up the header stream of data passed in as arguments into + * meaningful Collator data. + * @param rbc RuleBasedCollator to populate with header information + * @exception IOException thrown when there's a data error. + */ + protected void readHeader(RuleBasedCollator rbc) throws IOException + { + int size = m_dataInputStream_.readInt(); + // all the offsets are in bytes + // to get the address add to the header address and cast properly + // Default options int options + m_dataInputStream_.skipBytes(4); + // this one is needed only for UCA, to copy the appropriate + // contractions + m_dataInputStream_.skipBytes(4); + // reserved for future use + m_dataInputStream_.readInt(); + // const uint8_t *mappingPosition; + int mapping = m_dataInputStream_.readInt(); + // uint32_t *expansion; + rbc.m_expansionOffset_ = m_dataInputStream_.readInt(); + // UChar *contractionIndex; + rbc.m_contractionOffset_ = m_dataInputStream_.readInt(); + // uint32_t *contractionCEs; + int contractionCE = m_dataInputStream_.readInt(); + // needed for various closures int contractionSize + m_dataInputStream_.skipBytes(4); + // array of last collation element in expansion + int expansionEndCE = m_dataInputStream_.readInt(); + // array of maximum expansion size corresponding to the expansion + // collation elements with last element in expansionEndCE + int expansionEndCEMaxSize = 
m_dataInputStream_.readInt(); + // size of endExpansionCE int expansionEndCESize + m_dataInputStream_.skipBytes(4); + // hash table of unsafe code points + int unsafe = m_dataInputStream_.readInt(); + // hash table of final code points in contractions. + int contractionEnd = m_dataInputStream_.readInt(); + // int CEcount = m_dataInputStream_.readInt(); + m_dataInputStream_.skipBytes(4); + // is jamoSpecial + rbc.m_isJamoSpecial_ = m_dataInputStream_.readBoolean(); + m_dataInputStream_.skipBytes(3); + // byte version[] = new byte[4]; + m_dataInputStream_.skipBytes(4); + // byte charsetName[] = new byte[32]; // for charset CEs + m_dataInputStream_.skipBytes(32); + m_dataInputStream_.skipBytes(64); // for future use + if (rbc.m_contractionOffset_ == 0) { // contraction can be null + rbc.m_contractionOffset_ = mapping; + contractionCE = mapping; + } + m_expansionSize_ = rbc.m_contractionOffset_ - rbc.m_expansionOffset_; + m_contractionIndexSize_ = contractionCE - rbc.m_contractionOffset_; + m_contractionCESize_ = mapping - contractionCE; + m_trieSize_ = expansionEndCE - mapping; + m_expansionEndCESize_ = expansionEndCEMaxSize - expansionEndCE; + m_expansionEndCEMaxSizeSize_ = unsafe - expansionEndCEMaxSize; + m_unsafeSize_ = contractionEnd - unsafe; + m_contractionEndSize_ = size - contractionEnd; + rbc.m_contractionOffset_ >>= 1; // casting to ints + rbc.m_expansionOffset_ >>= 2; // casting to chars + } + + /** + * Read and break up the collation options passed in the stream of data + * and update the argument Collator with the results + * @param rbc RuleBasedCollator to populate + * @exception IOException thrown when there's a data error. 
+ * @draft 2.2 + */ + public void readOptions(RuleBasedCollator rbc) throws IOException + { + rbc.m_variableTopValue_ = m_dataInputStream_.readInt(); + rbc.setAttributeDefault(RuleBasedCollator.Attribute.FRENCH_COLLATION_, + m_dataInputStream_.readInt()); + rbc.setAttributeDefault( + RuleBasedCollator.Attribute.ALTERNATE_HANDLING_, + m_dataInputStream_.readInt()); + rbc.setAttributeDefault(RuleBasedCollator.Attribute.CASE_FIRST_, + m_dataInputStream_.readInt()); + rbc.setAttributeDefault(RuleBasedCollator.Attribute.CASE_LEVEL_, + m_dataInputStream_.readInt()); + rbc.setAttributeDefault( + RuleBasedCollator.Attribute.NORMALIZATION_MODE_, + m_dataInputStream_.readInt()); + rbc.setAttributeDefault(RuleBasedCollator.Attribute.STRENGTH_, + m_dataInputStream_.readInt()); + rbc.setAttributeDefault( + RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_, + m_dataInputStream_.readInt()); + } + + /** + * Read and break up the stream of data passed in as arguments into + * meaningful Collator data.b + * @param rbc RuleBasedCollator to populate + * @exception IOException thrown when there's a data error. 
+ * @draft 2.2 + */ + public void read(RuleBasedCollator rbc) throws IOException + { + readHeader(rbc); + readOptions(rbc); + m_expansionSize_ >>= 2; + rbc.m_expansion_ = new int[m_expansionSize_]; + for (int i = 0; i < m_expansionSize_; i ++) { + rbc.m_expansion_[i] = m_dataInputStream_.readInt(); + } + m_contractionIndexSize_ >>= 1; + rbc.m_contractionIndex_ = new char[m_contractionIndexSize_]; + for (int i = 0; i < m_contractionIndexSize_; i ++) { + rbc.m_contractionIndex_[i] = m_dataInputStream_.readChar(); + } + m_contractionCESize_ >>= 2; + rbc.m_contractionCE_ = new int[m_contractionCESize_]; + for (int i = 0; i < m_contractionCESize_; i ++) { + rbc.m_contractionCE_[i] = m_dataInputStream_.readInt(); + } + rbc.m_trie_ = new IntTrie(m_dataInputStream_, rbc); + if (!rbc.m_trie_.isLatin1Linear()) { + throw new IOException("Data corrupted, " + + "Collator Tries expected to have linear " + + "latin one data arrays"); + } + m_expansionEndCESize_ >>= 2; + rbc.m_expansionEndCE_ = new int[m_expansionEndCESize_]; + for (int i = 0; i < m_expansionEndCESize_; i ++) { + rbc.m_expansionEndCE_[i] = m_dataInputStream_.readInt(); + } + rbc.m_expansionEndCEMaxSize_ = new byte[m_expansionEndCEMaxSizeSize_]; + for (int i = 0; i < m_expansionEndCEMaxSizeSize_; i ++) { + rbc.m_expansionEndCEMaxSize_[i] = m_dataInputStream_.readByte(); + } + rbc.m_unsafe_ = new byte[m_unsafeSize_]; + for (int i = 0; i < m_unsafeSize_; i ++) { + rbc.m_unsafe_[i] = m_dataInputStream_.readByte(); + } + rbc.m_contractionEnd_ = new byte[m_contractionEndSize_]; + for (int i = 0; i < m_contractionEndSize_; i ++) { + rbc.m_contractionEnd_[i] = m_dataInputStream_.readByte(); + } + } + + // private variables ------------------------------------------------- + + /** + * Data input stream for uca.dat + */ + private DataInputStream m_dataInputStream_; + + /** + * File format version and id that this class understands. 
+ * No guarantees are made if a older version is used + */ + private static final byte DATA_FORMAT_VERSION_[] = + {(byte)0x2, (byte)0x0, (byte)0x0, (byte)0x0}; + private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x43, + (byte)0x6f, (byte)0x6c}; + private static final byte UNICODE_VERSION_[] = {(byte)0x3, (byte)0x0, + (byte)0x0, (byte)0x0}; + /** + * Corrupted error string + */ + private static final String CORRUPTED_DATA_ERROR_ = + "Data corrupted in Collation data file"; + + /** + * Size of expansion table in bytes + */ + private int m_expansionSize_; + /** + * Size of contraction index table in bytes + */ + private int m_contractionIndexSize_; + /** + * Size of contraction table in bytes + */ + private int m_contractionCESize_; + /** + * Size of the Trie in bytes + */ + private int m_trieSize_; + /** + * Size of the table that contains information about collation elements + * that end with an expansion + */ + private int m_expansionEndCESize_; + /** + * Size of the table that contains information about the maximum size of + * collation elements that end with a particular expansion CE corresponding + * to the ones in expansionEndCE + */ + private int m_expansionEndCEMaxSizeSize_; + /** + * Size of the table that contains information about the "Unsafe" + * codepoints + */ + private int m_unsafeSize_; + /** + * Size of the table that contains information about codepoints that ends + * with a contraction + */ + private int m_contractionEndSize_; + /** + * Size of the table that contains UCA contraction information + */ + private int m_UCAContractionSize_; + + // private methods --------------------------------------------------- + +} + diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java b/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java new file mode 100755 index 00000000000..d2737979a49 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java @@ -0,0 +1,2960 @@ +/** 
+******************************************************************************* +* Copyright (C) 1996-2002, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java,v $ +* $Date: 2002/05/14 16:48:49 $ +* $Revision: 1.4 $ +* +******************************************************************************* +*/ +package com.ibm.icu.text; + +import java.io.InputStream; +import java.io.DataInputStream; +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.ByteArrayInputStream; +import java.nio.IntBuffer; +import java.util.Locale; +import java.util.ResourceBundle; +import java.util.MissingResourceException; +import java.text.CharacterIterator; +import com.ibm.icu.impl.IntTrie; +import com.ibm.icu.impl.Trie; +import com.ibm.icu.impl.NormalizerImpl; +import com.ibm.icu.impl.ICULocaleData; +import com.ibm.icu.impl.UCharacterIterator; + +/** +*

The RuleBasedCollator class is a concrete subclass of Collator that +* provides a simple, data-driven, table collator. With this class you can +* create a customized table-based Collator. RuleBasedCollator maps characters +* to sort keys.

+*

RuleBasedCollator has the following restrictions for efficiency (other +* subclasses may be used for more complex languages) : +*

    +*
  1. If a special collation rule controlled by a <modifier> is +* specified it applies to the whole collator object. +*
  2. All non-mentioned characters are at the end of the collation order. +*
+*

+*

The collation table is composed of a list of collation rules, where each +* rule is of three forms: +*

+*    <modifier>
+*    <relation> <text-argument>
+*    <reset> <text-argument>
+* 
+*

+*

The definitions of the rule elements is as follows: +*

+*

+*

+* This sounds more complicated than it is in practice. For example, the +* following are equivalent ways of expressing the same thing: +*

+*
+* a < b < c
+* a < b & b < c
+* a < c & a < b
+* 
+*
+* Notice that the order is important, as the subsequent item goes immediately +* after the text-argument. The following are not equivalent: +*
+*
+* a < b & a < c
+* a < c & a < b
+* 
+*
+* Either the text-argument must already be present in the sequence, or some +* initial substring of the text-argument must be present. +* (e.g. "a < b & ae < e" is valid since "a" is present in the +* sequence before "ae" is reset). In this latter case, "ae" is not entered and +* treated as a single character; instead, "e" is sorted as if it were expanded +* to two characters: "a" followed by an "e". This difference appears in +* natural languages: in traditional Spanish "ch" is treated as though it +* contracts to a single character (expressed as "c < ch < d"), while in +* traditional German a-umlaut is treated as though it expanded to two +* characters (expressed as +* "a,A < b,B ... &ae;\u00e3&AE;\u00c3"). +* [\u00e3 and \u00c3 are, of course, the escape sequences for +* a-umlaut.] +*

+*

+* Ignorable Characters +*

+* For ignorable characters, the first rule must start with a relation (the +* examples we have used above are really fragments; "a < b" really should +* be "< a < b"). If, however, the first relation is not "<", then all +* the all text-arguments up to the first "<" are ignorable. For example, +* ", - < a < b" makes "-" an ignorable character, as we saw earlier in +* the word "black-birds". In the samples for different languages, you see that +* most accents are ignorable.

+*

Normalization and Accents +*

RuleBasedCollator automatically processes its rule table to +* include both pre-composed and combining-character versions of accented +* characters. Even if the provided rule string contains only base characters +* and separate combining accent characters, the pre-composed accented +* characters matching all canonical combinations of characters from the rule +* string will be entered in the table.

+*

This allows you to use a RuleBasedCollator to compare accented strings +* even when the collator is set to NO_DECOMPOSITION. There are two caveats, +* however. First, if the strings to be collated contain combining sequences +* that may not be in canonical order, you should set the collator to +* CANONICAL_DECOMPOSITION or FULL_DECOMPOSITION to enable sorting of combining +* sequences. Second, if the strings contain characters with compatibility +* decompositions (such as full-width and half-width forms), you must use +* FULL_DECOMPOSITION, since the rule tables only include canonical mappings. +*

+*

Errors

+*

The following are errors:

+* +*

If you produce one of these errors, a RuleBasedCollator +* throws a ParseException.

+*

Examples

+*

Simple: "< a < b < c < d"

+*

Norwegian: "< a,A< b,B< c,C< d,D< e,E< f,F< " + +* "g,G< h,H< i,I< j,J< k,K< l,L< m,M< " + +* "n,N< o,O< p,P< q,Q< r,R< s,S< t,T< " + +* "u,U< v,V< w,W< x,X< y,Y< z,Z< " + +* "\u00E5=a\ u030A,\u00C5=A\u030A;aa,AA< " + +* "\u00E6,\ u00C6< \u00F8,\u00D8"

+*

Normally, to create a rule-based Collator object, you will use +* Collator's factory method getInstance. However, to +* create a rule-based Collator object with specialized rules tailored to your +* needs, you construct the RuleBasedCollator with the rules +* contained in a String object. For example:

+*
+*
+* String Simple = "< a< b< c< d";
+* RuleBasedCollator mySimple = new RuleBasedCollator(Simple);
+* 
+*
+* Or: +*
+*
+* String Norwegian = "< a,A< b,B< c,C< d,D< e,E< f,F<" +  
+*                    "g,G< h,H< i,I< j,J < k,K< l,L< " +
+*                    "m,M< n,N< o,O< p,P< q,Q< r,R< " +
+*                    "s,S< t,T < u,U< v,V< w,W< x,X< " +
+*                    "y,Y< z,Z < \u00E5=a\u030A," +
+*                    "\u00C5=A\u030A;aa,AA< \u00E6," +
+*                    "\u00C6< \u00F8,\u00D8";
+* RuleBasedCollator myNorwegian = new RuleBasedCollator(Norwegian);
+* 
+*
+*

Combining Collators is as simple as concatenating strings. +* Here's an example that combines two Collators from two +* different locales:

+*
+*
+* // Create an en_US Collator object
+* RuleBasedCollator en_USCollator = (RuleBasedCollator)
+*     Collator.getInstance(new Locale("en", "US", ""));
+* // Create a da_DK Collator object
+* RuleBasedCollator da_DKCollator = (RuleBasedCollator)
+*     Collator.getInstance(new Locale("da", "DK", ""));
+* // Combine the two
+* // First, get the collation rules from en_USCollator
+* String en_USRules = en_USCollator.getRules();
+* // Second, get the collation rules from da_DKCollator
+* String da_DKRules = da_DKCollator.getRules();
+* RuleBasedCollator newCollator =
+*     new RuleBasedCollator(en_USRules + da_DKRules);
+* // newCollator has the combined rules
+* 
+*
+*

Another more interesting example would be to make changes on an existing +* table to create a new Collator object. For example, add +* "&C< ch, cH, Ch, CH" to the en_USCollator object to +* create your own:

+*
+*
+* // Create a new Collator object with additional rules
+* String addRules = "&C< ch, cH, Ch, CH";
+* RuleBasedCollator myCollator =
+*     new RuleBasedCollator(en_USCollator + addRules);
+* // myCollator contains the new rules
+* 
+*
+*

The following example demonstrates how to change the order of +* non-spacing accents, +*

+*
+* // old rule
+* String oldRules = 
+*     "=\u0301;\u0300;\u0302;\u0308"    // main accents
+*     + ";\u0327;\u0303;\u0304;\u0305"    // main accents
+*     + ";\u0306;\u0307;\u0309;\u030A"    // main accents
+*     + ";\u030B;\u030C;\u030D;\u030E"    // main accents
+*     + ";\u030F;\u0310;\u0311;\u0312"    // main accents
+*     + "< a , A ; ae, AE ; \u00e6 , \u00c6"
+*     + "< b , B < c, C < e, E & C < d, D";
+* // change the order of accent characters
+* String addOn = "& \u0300 ; \u0308 ; \u0302";
+* RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn);
+* 
+*
+*

The last example shows how to put new primary ordering in before the +* default setting. For example, in Japanese Collator, you +* can either sort English characters before or after Japanese characters, +*

+*
+* // get en_US Collator rules
+* RuleBasedCollator en_USCollator = (RuleBasedCollator)
+*                                             Collator.getInstance(Locale.US);
+* // add a few Japanese character to sort before English characters
+* // suppose the last character before the first base letter 'a' in
+* // the English collation rule is \u2212
+* String jaString = "& \u2212 < \u3041, \u3042 < \u3043, \u3044";
+* RuleBasedCollator myJapaneseCollator = new
+*     RuleBasedCollator(en_USCollator.getRules() + jaString);
+* 
+* @author Syn Wee Quek +* @since release 2.2, April 18 2002 +* @draft 2.2 +*/ +public class RuleBasedCollator extends Collator implements Trie.DataManipulate +{ + // public data members --------------------------------------------------- + + // public constructors --------------------------------------------------- + + /** + *

RuleBasedCollator constructor that takes the rules. + * Please see RuleBasedCollator class description for more details on the + * collation rule syntax.

+ *

Note different from Java, does not throw a ParseException

+ * @see java.util.Locale + * @param rules the collation rules to build the collation table from. + * @exception Exception thrown when there's an error creating the collator + * @draft 2.2 + */ + public RuleBasedCollator(String rules) throws Exception + { + setStrength(Collator.TERTIARY); + setDecomposition(Collator.CANONICAL_DECOMPOSITION); + m_rules_ = rules; + // tables = new RBCollationTables(rules, decomp); + // init(); + } + + // public methods -------------------------------------------------------- + + /** + * Return a CollationElementIterator for the given String. + * @see CollationElementIterator + * @draft 2.2 + */ + public CollationElementIterator getCollationElementIterator(String source) { + return new CollationElementIterator(source, this); + } + + /** + * Return a CollationElementIterator for the given String. + * @see CollationElementIterator + * @draft 2.2 + */ + public CollationElementIterator getCollationElementIterator( + CharacterIterator source) { + return new CollationElementIterator(source, this); + } + + // public setters -------------------------------------------------------- + + /** + * Sets the Hiragana Quartenary sort to be on or off + * @param flag true if Hiragana Quartenary sort is to be on, false + * otherwise + * @draft 2.2 + */ + public synchronized void setHiraganaQuartenary(boolean flag) + { + m_isHiragana4_ = flag; + } + + /** + * Sets the Hiragana Quartenary sort to be on or off depending on the + * Collator's locale specific default value. 
+ * @draft 2.2 + */ + public synchronized void setHiraganaQuartenaryDefault() + { + m_isHiragana4_ = m_defaultIsHiragana4_; + } + + /** + * Sets the Collator to sort with the indicated casing first + * @param upper true for sorting uppercased characters before lowercased + * characters, false for sorting lowercased characters before + * uppercased characters + * @draft 2.2 + */ + public synchronized void setCaseFirst(boolean upper) + { + if (upper) { + m_caseFirst_ = AttributeValue.UPPER_FIRST_; + } + else { + m_caseFirst_ = AttributeValue.LOWER_FIRST_; + } + updateInternalState(); + } + + /** + * Sets the Collator to ignore any previous setCaseFirst(boolean) calls. + * Ignores case preferences. + * @draft 2.2 + */ + public synchronized void setCaseFirstOff() + { + m_caseFirst_ = AttributeValue.OFF_; + updateInternalState(); + } + + /** + * Sets the case sorting preferences to the Collator's locale specific + * default value. + * @see #setCaseFirst(boolean) + * @see #setCaseFirstOff + * @draft 2.2 + */ + public synchronized final void setCaseFirstDefault() + { + m_caseFirst_ = m_defaultCaseFirst_; + updateInternalState(); + } + + /** + * Sets the alternate handling value for quartenary strength to the + * Collator's locale specific default value. + * @see #setAlternateHandling + * @draft 2.2 + */ + public synchronized void setAlternateHandlingDefault() + { + m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_; + } + + /** + * Sets case level sorting to the Collator's locale specific default value. + * @see #setCaseLevel + * @draft 2.2 + */ + public synchronized void setCaseLevelDefault() + { + m_isCaseLevel_ = m_defaultIsCaseLevel_; + updateInternalState(); + } + + /** + * Set the decomposition mode to the Collator's locale specific default + * value. 
+ * @see #getDecomposition + * @draft 2.2 + */ + public synchronized void setDecompositionDefault() + { + m_decomposition_ = m_defaultDecomposition_; + } + + /** + * Sets French collation to the Collator's locale specific default value. + * @see #getFrenchCollation + * @draft 2.2 + */ + public synchronized void setFrenchCollationDefault() + { + m_isFrenchCollation_ = m_defaultIsFrenchCollation_; + updateInternalState(); + } + + /** + *

+ * Sets strength to the Collator's locale specific default value.

+ * @see #setStrength + * @draft 2.2 + */ + public synchronized void setStrengthDefault() + { + m_strength_ = m_defaultStrength_; + updateInternalState(); + } + + /** + * Sets the French collation + * @param flag true to set the French collation on, false to set it off + * @draft 2.2 + */ + public synchronized void setFrenchCollation(boolean flag) + { + m_isFrenchCollation_ = flag; + updateInternalState(); + } + + /** + * Sets the alternate handling for quartenary strength to be either + * shifted or non-ignorable. This attribute will only be effective with + * a quartenary strength sort. + * @param shifted true if shifted for alternate handling is desired, false + * for the non-ignorable. + * @draft 2.2 + */ + public synchronized void setAlternateHandling(boolean shifted) + { + m_isAlternateHandlingShifted_ = shifted; + updateInternalState(); + } + + /** + * Sets if case level sorting is required. + * @param flag true if case level sorting is required, false otherwise + * @draft 2.2 + */ + public synchronized void setCaseLevel(boolean flag) + { + m_isCaseLevel_ = flag; + updateInternalState(); + } + + + // public getters -------------------------------------------------------- + + /** + * Internal method called to parse a lead surrogate's ce for the offset + * to the next trail surrogate data. + * @param ce collation element of the lead surrogate + * @return data offset or 0 for the next trail surrogate + * @draft 2.2 + */ + public int getFoldingOffset(int ce) + { + if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) { + return (ce & 0xFFFFFF); + } + return 0; + } + + /** + * Gets the collation rules for this RuleBasedCollator. * @return returns the collation rules + * @draft 2.2 + */ + public final String getRules() + { + return m_rules_; + } + + /** + *

+ * Transforms the String into a series of bits that can be compared + * bitwise to other CollationKeys. CollationKeys provide better + * performance than Collator.compare() when Strings are involved in + * multiple comparisons.

+ *

+ * Internally CollationKey stores its data in a null-terminated byte + * array.

+ *

+ * See the Collator class description for an example using + * CollationKeys.

+ * @param source the string to be transformed into a collation key. + * @return the CollationKey for the given String based on this Collator's + * collation rules. If the source String is null, a null + * CollationKey is returned. + * @see CollationKey + * @see compare(String, String) + * @draft 2.2 + */ + public CollationKey getCollationKey(String source) + { + boolean compare[] = {m_isCaseLevel_, + true, + m_strength_ >= SECONDARY, + m_strength_ >= TERTIARY, + m_strength_ >= QUATERNARY, + m_strength_ == IDENTICAL + }; + + byte bytes[][] = {new byte[SORT_BUFFER_INIT_SIZE_CASE_], // case + new byte[SORT_BUFFER_INIT_SIZE_1_], // primary + new byte[SORT_BUFFER_INIT_SIZE_2_], // secondary + new byte[SORT_BUFFER_INIT_SIZE_3_], // tertiary + new byte[SORT_BUFFER_INIT_SIZE_4_] // quartenary + }; + int bytescount[] = {0, 0, 0, 0, 0}; + int count[] = {0, 0, 0, 0, 0}; + boolean doFrench = m_isFrenchCollation_ && compare[2]; + // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. + // If we have no qShifted, we don't need to set UCOL_COMMON_BOT4 so + // high. + int commonBottom4 = ((m_variableTopValue_ >> 8) & LAST_BYTE_MASK_) + 1; + byte hiragana4 = 0; + if (m_isHiragana4_ && compare[4]) { + // allocate one more space for hiragana, value for hiragana + hiragana4 = (byte)commonBottom4; + commonBottom4 ++; + } + + int bottomCount4 = 0xFF - commonBottom4; + // If we need to normalize, we'll do it all at once at the beginning! 
+ if ((compare[5] || m_decomposition_ != NO_DECOMPOSITION) + /*&& UNORM_YES != unorm_quickCheck(source, len, normMode, status)*/ + ) { + /* + * len = unorm_internalNormalize(normSource, normSourceLen, + source, len, + normMode, FALSE, + status); + source = normSource;*/ + String norm = source; + getSortKeyBytes(norm, compare, bytes, bytescount, count, + doFrench, hiragana4, commonBottom4, bottomCount4); + } + else { + getSortKeyBytes(source, compare, bytes, bytescount, count, doFrench, + hiragana4, commonBottom4, bottomCount4); + } + byte sortkey[] = getSortKey(source, compare, bytes, bytescount, count, + doFrench, commonBottom4, bottomCount4); + return new CollationKey(source, sortkey); + } + + /** + * Checks if uppercase is sorted before lowercase + * @return true if Collator sorts uppercase before lower, false otherwise + * @draft 2.2 + */ + public boolean isUpperCaseFirst() + { + return (m_caseFirst_ == AttributeValue.UPPER_FIRST_); + } + + /** + * Checks if lowercase is sorted before uppercase + * @return true if Collator sorts lowercase before upper, false otherwise + * @draft 2.2 + */ + public boolean isLowerCaseFirst() + { + return (m_caseFirst_ == AttributeValue.LOWER_FIRST_); + } + + /** + * Checks if case sorting is off. + * @return true if case sorting is off, false otherwise + * @draft 2.2 + */ + public boolean isCaseFirstOff() + { + return (m_caseFirst_ == AttributeValue.OFF_); + } + + /** + * Checks if the alternate handling attribute is shifted or non-ignorable. 
+ * + * @param shifted true if checks are to be done on shifted, false if + * checks are to be done on non-ignorable + * @return true or false + * @draft 2.2 + */ + public boolean isAlternateHandling(boolean shifted) + { + if (shifted) { + return m_isAlternateHandlingShifted_; + } + return !m_isAlternateHandlingShifted_; + } + + /** + * Checks if case level sorting is on + * @return true if case level sorting is on + * @draft 2.2 + */ + public boolean isCaseLevel() + { + return m_isCaseLevel_; + } + + /** + * Checks if French Collation sorting is on + * @return true if French Collation sorting is on + * @draft 2.2 + */ + public boolean isFrenchCollation() + { + return m_isFrenchCollation_; + } + + // public other methods ------------------------------------------------- + + /** + * Compares the equality of two RuleBasedCollators. + * @param obj the RuleBasedCollator to be compared with. + * @return true if this RuleBasedCollator has exactly the same behaviour + * as obj, false otherwise. + * @draft 2.2 + */ + public boolean equals(Object obj) { + if (obj == null || !super.equals(obj)) { + return false; // super does class check + } + RuleBasedCollator other = (RuleBasedCollator)obj; + // all other non-transient information is also contained in rules. + return (m_rules_.equals(other.m_rules_)); + } + + /** + * Standard override; no change in semantics. + * @draft 2.2 + */ + public Object clone() { + // synwee todo: do after all implementation done + return null; + } + + /** + * Generates the hash code for this RuleBasedCollator. + * @return the unique hash code for this Collator + * @draft 2.2 + */ + public final int hashCode() + { + return getRules().hashCode(); + } + + /** + *

+ * Compares the source string to the target string according to the + * collation rules for this Collator. Returns an integer less than, equal + * to or greater than zero depending on whether the source String is less + * than, equal to or greater than the target string. See the Collator + * class description for an example of use.

+ *

+ * For a one time comparison, this method has the best performance. If + * a given String will be involved in multiple comparisons, + * CollationKey.compareTo() has the best performance. See the Collator + * class description for an example using CollationKeys.

+ * @param source the source string. + * @param target the target string. + * @return Returns an integer value. Value is less than zero if source is + * less than target, value is zero if source and target are equal, + * value is greater than zero if source is greater than target. + * @see CollationKey + * @see Collator#getCollationKey + * @draft 2.2 + */ + public final int compare(String source, String target) + { + if (source == target) { + return 0; + } + + // Find the length of any leading portion that is equal + int offset = getFirstUnmatchedOffset(source, target); + if (source.charAt(offset) == 0) { + if (target.charAt(offset) == 0) { + return 0; + } + return 1; + } + else if (target.charAt(offset) == 0) { + return -1; + } + + // setting up the collator parameters + boolean compare[] = {m_isCaseLevel_, + true, + m_strength_ >= SECONDARY, + m_strength_ >= TERTIARY, + m_strength_ >= QUATERNARY, + m_strength_ == IDENTICAL + }; + boolean doFrench = m_isFrenchCollation_ && compare[2]; + boolean doShift4 = m_isAlternateHandlingShifted_ && compare[4]; + boolean doHiragana4 = m_isHiragana4_ && compare[4]; + + if (doHiragana4 && doShift4) { + String sourcesub = source.substring(offset); + String targetsub = target.substring(offset); + return compareBySortKeys(sourcesub, targetsub); + } + + // Preparing the CE buffers. will be filled during the primary phase + int cebuffer[][] = {new int[CE_BUFFER_SIZE_], new int[CE_BUFFER_SIZE_]}; + int cebuffersize[] = {0, 0}; + // This is the lowest primary value that will not be ignored if shifted + int lowestpvalue = m_isAlternateHandlingShifted_ + ? 
m_variableTopValue_ << 16 : 0; + int result = doPrimaryCompare(doHiragana4, lowestpvalue, source, + target, offset, cebuffer, cebuffersize); + if (cebuffer[0] == null && cebuffer[1] == null) { + // since the cebuffer is cleared when we have determined that + // either source is greater than target or vice versa, the return + // result is the comparison result and not the hiragana result + return result; + } + + int hiraganaresult = result; + + if (compare[2]) { + result = doSecondaryCompare(cebuffer, cebuffersize, doFrench); + if (result != 0) { + return result; + } + } + // doing the case bit + if (compare[0]) { + result = doCaseCompare(cebuffer); + if (result != 0) { + return result; + } + } + // Tertiary level + if (compare[3]) { + result = doTertiaryCompare(cebuffer); + if (result != 0) { + return result; + } + } + + if (compare[4]) { // checkQuad + result = doQuaternaryCompare(cebuffer, lowestpvalue); + if (result != 0) { + return result; + } + } + else if (doHiragana4 && hiraganaresult != 0) { + // If we're fine on quaternaries, we might be different + // on Hiragana. This, however, might fail us in shifted. + return hiraganaresult; + } + + // For IDENTICAL comparisons, we use a bitwise character comparison + // as a tiebreaker if all else is equal. + // Getting here should be quite rare - strings are not identical - + // that is checked first, but compared == through all other checks. + if (compare[5]) { + return doIdenticalCompare(source, target, offset, true); + } + return 0; + } + + // public abstract methods ----------------------------------------------- + + // protected inner interfaces -------------------------------------------- + + /** + * Attribute values to be used when setting the Collator options + */ + protected static interface AttributeValue + { + /** + * Indicates that the default attribute value will be used. + * See individual attribute for details on its default value. 
+ */ + static final int DEFAULT_ = -1; + /** + * Primary collation strength + */ + static final int PRIMARY_ = 0; + /** + * Secondary collation strength + */ + static final int SECONDARY_ = 1; + /** + * Tertiary collation strength + */ + static final int TERTIARY_ = 2; + /** + * Default collation strength + */ + static final int DEFAULT_STRENGTH_ = TERTIARY; + /** + * Internal use for strength checks in Collation elements + */ + static final int CE_STRENGTH_LIMIT_ = TERTIARY + 1; + /** + * Quaternary collation strength + */ + static final int QUATERNARY_ = 3; + /** + * Identical collation strength + */ + static final int IDENTICAL_ = 15; + /** + * Internal use for strength checks + */ + static final int STRENGTH_LIMIT_ = IDENTICAL + 1; + /** + * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL, + * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE + */ + static final int OFF_ = 16; + /** + * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL, + * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE + */ + static final int ON_ = 17; + /** + * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted + */ + static final int SHIFTED_ = 20; + /** + * Valid for ALTERNATE_HANDLING. Alternate handling will be non + * ignorable + */ + static final int NON_IGNORABLE_ = 21; + /** + * Valid for CASE_FIRST - lower case sorts before upper case + */ + static final int LOWER_FIRST_ = 24; + /** + * Upper case sorts before lower case + */ + static final int UPPER_FIRST_ = 25; + /** + * Valid for NORMALIZATION_MODE ON and OFF are also allowed for this + * attribute + */ + static final int ON_WITHOUT_HANGUL_ = 28; + /** + * Number of attribute values + */ + static final int LIMIT_ = 29; + } + + /** + * Attributes that collation service understands. All the attributes can + * take DEFAULT value, as well as the values specific to each one. + */ + protected static interface Attribute { + /** + * Attribute for direction of secondary weights - used in French. 
+ * Acceptable values are ON, which results in secondary weights being + * considered backwards and OFF which treats secondary weights in the + * order they appear. + */ + static final int FRENCH_COLLATION_ = 0; + /** + * Attribute for handling variable elements. Acceptable values are + * NON_IGNORABLE (default) which treats all the codepoints with + * non-ignorable primary weights in the same way, and SHIFTED which + * causes codepoints with primary weights that are equal or below the + * variable top value to be ignored on primary level and moved to the + * quaternary level. + */ + static final int ALTERNATE_HANDLING_ = 1; + /** + * Controls the ordering of upper and lower case letters. Acceptable + * values are OFF (default), which orders upper and lower case letters + * in accordance to their tertiary weights, UPPER_FIRST which forces + * upper case letters to sort before lower case letters, and + * LOWER_FIRST which does the opposite. + */ + static final int CASE_FIRST_ = 2; + /** + * Controls whether an extra case level (positioned before the third + * level) is generated or not. Acceptable values are OFF (default), + * when case level is not generated, and ON which causes the case + * level to be generated. Contents of the case level are affected by + * the value of CASE_FIRST attribute. A simple way to ignore accent + * differences in a string is to set the strength to PRIMARY and + * enable case level. + */ + static final int CASE_LEVEL_ = 3; + /** + * Controls whether the normalization check and necessary + * normalizations are performed. When set to OFF (default) no + * normalization check is performed. The correctness of the result is + * guaranteed only if the input data is in so-called FCD form (see + * users manual for more info). When set to ON, an incremental check + * is performed to see whether the input data is in the FCD form. If + * the data is not in the FCD form, incremental NFD normalization is + * performed. 
+ */ + static final int NORMALIZATION_MODE_ = 4; + /** + * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY, + * QUATERNARY or IDENTICAL. The usual strength for most locales + * (except Japanese) is tertiary. Quaternary strength is useful when + * combined with shifted setting for alternate handling attribute and + * for JIS x 4061 collation, when it is used to distinguish between + * Katakana and Hiragana (this is achieved by setting the + * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is + * affected only by the number of non ignorable code points in the + * string. Identical strength is rarely useful, as it amounts to + * codepoints of the NFD form of the string. + */ + static final int STRENGTH_ = 5; + /** + * When turned on, this attribute positions Hiragana before all + * non-ignorables on quaternary level. This is a sneaky way to produce + * JIS sort order. + */ + static final int HIRAGANA_QUATERNARY_MODE_ = 6; + /** + * Attribute count + */ + static final int LIMIT_ = 7; + } + + // protected data members ------------------------------------------------ + + /** + * Size of collator raw data headers and options before the expansion + * data. This is used when expansion ces are to be retrieved. ICU4C uses + * the expansion offset starting from UCollator.UColHeader, hence ICU4J + * will have to minus that off to get the right expansion ce offset. In + * number of ints. + */ + protected int m_expansionOffset_; + /** + * Size of collator raw data headers, options and expansions before + * contraction data. This is used when contraction ces are to be retrieved. + * ICU4C uses contraction offset starting from UCollator.UColHeader, hence + * ICU4J will have to minus that off to get the right contraction ce + * offset. In number of chars. 
+ */ + protected int m_contractionOffset_; + /** + * Flag indicator if Jamo is special + */ + protected boolean m_isJamoSpecial_; + + // Collator options ------------------------------------------------------ + protected int m_defaultVariableTopValue_; + protected boolean m_defaultIsFrenchCollation_; + protected boolean m_defaultIsAlternateHandlingShifted_; + protected int m_defaultCaseFirst_; + protected boolean m_defaultIsCaseLevel_; + protected int m_defaultDecomposition_; + protected int m_defaultStrength_; + protected boolean m_defaultIsHiragana4_; + /** + * Value of the variable top + */ + protected int m_variableTopValue_; + /** + * Attribute for special Hiragana + */ + protected boolean m_isHiragana4_; + /** + * Case sorting customization + */ + protected int m_caseFirst_; + + // end Collator options -------------------------------------------------- + + /** + * Expansion table + */ + protected int m_expansion_[]; + /** + * Contraction index table + */ + protected char m_contractionIndex_[]; + /** + * Contraction CE table + */ + protected int m_contractionCE_[]; + /** + * Data trie + */ + protected IntTrie m_trie_; + /** + * Table to store all collation elements that are the last element of an + * expansion. This is for use in StringSearch. + */ + protected int m_expansionEndCE_[]; + /** + * Table to store the maximum size of any expansions that end with the + * corresponding collation element in m_expansionEndCE_. For use in + * StringSearch too + */ + protected byte m_expansionEndCEMaxSize_[]; + /** + * Heuristic table to store information on whether a char character is + * considered "unsafe". "Unsafe" character are combining marks or those + * belonging to some contraction sequence from the offset 1 onwards. + * E.g. if "ABC" is the only contraction, then 'B' and 'C' are considered + * unsafe. If we have another contraction "ZA" with the one above, then + * 'A', 'B', 'C' are "unsafe" but 'Z' is not. 
+ */ + protected byte m_unsafe_[]; + /** + * Table to store information on whether a codepoint can occur as the last + * character in a contraction + */ + protected byte m_contractionEnd_[]; + /** + * Table for UCA use, may be removed + */ + protected char m_UCAContraction_[]; + /** + * Original collation rules + */ + protected String m_rules_; + /** + * The smallest "unsafe" codepoint + */ + protected char m_minUnsafe_; + /** + * The smallest codepoint that could be the end of a contraction + */ + protected char m_minContractionEnd_; + + /** + * UnicodeData.txt property object + */ + protected static final RuleBasedCollator UCA_; + + // block to initialise character property database + static + { + try + { + UCA_ = new RuleBasedCollator(); + InputStream i = UCA_.getClass().getResourceAsStream( + "/com/ibm/icu/impl/data/ucadata.dat"); + + BufferedInputStream b = new BufferedInputStream(i, 90000); + CollatorReader reader = new CollatorReader(b); + reader.read(UCA_); + b.close(); + i.close(); + ResourceBundle rb = + ICULocaleData.getLocaleElements(Locale.ENGLISH); + UCA_.m_rules_ = rb.getString("%%UCARULES"); + UCA_.init(); + } + catch (Exception e) + { + e.printStackTrace(); + throw new RuntimeException(e.getMessage()); + } + } + + // protected constants --------------------------------------------------- + + protected static final int CE_SPECIAL_FLAG_ = 0xF0000000; + /** + * Lead surrogate that is tailored and doesn't start a contraction + */ + protected static final int CE_SURROGATE_TAG_ = 5; + + /** + * Minimum size required for the binary collation data in bytes. 
+ * Size of UCA header + size of options to 4 bytes + */ + private static final int MIN_BINARY_DATA_SIZE_ = (41 + 8) << 2; + /** + * Mask to get the primary strength of the collation element + */ + protected static final int CE_PRIMARY_MASK_ = 0xFFFF0000; + /** + * Mask to get the secondary strength of the collation element + */ + protected static final int CE_SECONDARY_MASK_ = 0xFF00; + /** + * Mask to get the tertiary strength of the collation element + */ + protected static final int CE_TERTIARY_MASK_ = 0xFF; + /** + * Primary strength shift + */ + protected static final int CE_PRIMARY_SHIFT_ = 16; + /** + * Secondary strength shift + */ + protected static final int CE_SECONDARY_SHIFT_ = 8; + + /** + * Continuation marker + */ + protected static final int CE_CONTINUATION_MARKER_ = 0xC0; + + // end protected constants ----------------------------------------------- + + // protected constructor ------------------------------------------------- + + /** + * Constructors a RuleBasedCollator from the argument locale. + * If no resource bundle is associated with the locale, UCA is used + * instead. 
+ * @param locale + * @exception Exception thrown when there's an error creating the Collator + */ + protected RuleBasedCollator(Locale locale) throws Exception + { + ResourceBundle rb = ICULocaleData.getLocaleElements(locale); + + if (rb != null) { + byte map[] = (byte [])rb.getObject("%%CollationBin"); + // synwee todo: problem, data in little endian and + // ICUListResourceBundle should not calculate size by + // using .available() that only gives the buffer size + BufferedInputStream input = + new BufferedInputStream(new ByteArrayInputStream(map)); + CollatorReader reader = new CollatorReader(input, false); + if (map.length > MIN_BINARY_DATA_SIZE_) { + // synwee todo: undo when problem solved + reader.read(this); + } + else { + reader.readHeader(this); + reader.readOptions(this); + // duplicating UCA_'s data + m_expansion_ = UCA_.m_expansion_; + m_contractionIndex_ = UCA_.m_contractionIndex_; + m_contractionCE_ = UCA_.m_contractionCE_; + m_trie_ = UCA_.m_trie_; + m_expansionEndCE_ = UCA_.m_expansionEndCE_; + m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_; + m_unsafe_ = UCA_.m_unsafe_; + m_contractionEnd_ = UCA_.m_contractionEnd_; + m_minUnsafe_ = UCA_.m_minUnsafe_; + m_minContractionEnd_ = UCA_.m_minContractionEnd_; + setStrengthDefault(); + setDecompositionDefault(); + setFrenchCollationDefault(); + setAlternateHandlingDefault(); + setCaseLevelDefault(); + setCaseFirstDefault(); + setHiraganaQuartenaryDefault(); + updateInternalState(); + } + Object rules = rb.getObject("CollationElements"); + if (rules != null) { + m_rules_ = (String)((Object[][])rules)[0][1]; + } + init(); + } + } + + /** + *

+ * Protected constructor for use by subclasses. + * Public access to creating Collators is handled by the API + * Collator.getInstance() or RuleBasedCollator(String rules). + *

+ *

+ * This constructor constructs the UCA collator internally + *

+ * @draft 2.2 + */ + protected RuleBasedCollator() throws Exception + { + } + + // protected methods ----------------------------------------------------- + + /** + * Initializes the RuleBasedCollator + */ + protected synchronized final void init() + { + for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; + m_minUnsafe_ ++) { + // Find the smallest unsafe char. + if (isUnsafe(m_minUnsafe_)) { + break; + } + } + + for (m_minContractionEnd_ = 0; + m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; + m_minContractionEnd_ ++) { + // Find the smallest contraction-ending char. + if (isContractionEnd(m_minContractionEnd_)) { + break; + } + } + setStrengthDefault(); + setDecompositionDefault(); + setFrenchCollationDefault(); + setAlternateHandlingDefault(); + setCaseLevelDefault(); + setCaseFirstDefault(); + setHiraganaQuartenaryDefault(); + updateInternalState(); + } + + /** + * Test whether a char character is potentially "unsafe" for use as a + * collation starting point. "Unsafe" characters are combining marks or + * those belonging to some contraction sequence from the offset 1 onwards. + * E.g. if "ABC" is the only contraction, then 'B' and + * 'C' are considered unsafe. If we have another contraction "ZA" with + * the one above, then 'A', 'B', 'C' are "unsafe" but 'Z' is not. + * @param ch character to determin + * @return true if ch is unsafe, false otherwise + */ + protected final boolean isUnsafe(char ch) + { + if (ch < m_minUnsafe_) { + return false; + } + + if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) { + if (UTF16.isTrailSurrogate(ch)) { + // Trail surrogate are always considered unsafe. + return true; + } + ch &= HEURISTIC_OVERFLOW_MASK_; + ch += HEURISTIC_OVERFLOW_OFFSET_; + } + int value = m_unsafe_[ch >> HEURISTIC_SHIFT_]; + return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0; + } + + /** + * Approximate determination if a char character is at a contraction end. 
+ * Guaranteed to be true if a character is at the end of a contraction, + * otherwise it is not deterministic. + * @param ch character to be determined + */ + protected final boolean isContractionEnd(char ch) + { + if (UTF16.isTrailSurrogate(ch)) { + return true; + } + + if (ch < m_minContractionEnd_) { + return false; + } + + if (ch >= (HEURISTIC_SIZE_ << HEURISTIC_SHIFT_)) { + ch &= HEURISTIC_OVERFLOW_MASK_; + ch += HEURISTIC_OVERFLOW_OFFSET_; + } + int value = m_contractionEnd_[ch >> HEURISTIC_SHIFT_]; + return ((value >> (ch & HEURISTIC_MASK_)) & 1) != 0; + } + + /** + * Resets the internal case data members and compression values. + */ + protected synchronized void updateInternalState() + { + if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { + m_caseSwitch_ = CASE_SWITCH_; + } + else { + m_caseSwitch_ = NO_CASE_SWITCH_; + } + + if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) { + m_mask3_ = CE_REMOVE_CASE_; + m_common3_ = COMMON_NORMAL_3_; + m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_; + m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_; + m_bottom3_ = COMMON_BOTTOM_3_; + } + else { + m_mask3_ = CE_KEEP_CASE_; + m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_; + if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { + m_common3_ = COMMON_UPPER_FIRST_3_; + m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_; + m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_; + } else { + m_common3_ = COMMON_NORMAL_3_; + m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_; + m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_; + } + } + + // Set the compression values + int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1; + // we multilply double with int, but need only int + m_topCount3_ = (int)(PROPORTION_3_ * total3); + m_bottomCount3_ = total3 - m_topCount3_; + + if (!m_isCaseLevel_ && m_strength_ == AttributeValue.TERTIARY_ + && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) { + m_isSimple3_ = true; + } + else { + m_isSimple3_ = false; + } + } + + /** + *

+ * Converts the C attribute index and values for use and stores it into + * the relevant default attribute variable.

+ *

+ * Note internal use, no sanity checks are done on the arguments.

+ */ + protected void setAttributeDefault(int attribute, int value) + { + switch (attribute) { + case Attribute.FRENCH_COLLATION_: + m_defaultIsFrenchCollation_ = (value == AttributeValue.ON_); + break; + case Attribute.ALTERNATE_HANDLING_: + m_defaultIsAlternateHandlingShifted_ = + (value == AttributeValue.SHIFTED_); + break; + case Attribute.CASE_FIRST_: + m_defaultCaseFirst_ = value; + break; + case Attribute.CASE_LEVEL_: + m_defaultIsCaseLevel_ = (value == AttributeValue.ON_); + break; + case Attribute.NORMALIZATION_MODE_: + m_defaultDecomposition_ = value; + break; + case Attribute.STRENGTH_: + m_defaultStrength_ = value; + case Attribute.HIRAGANA_QUATERNARY_MODE_: + m_defaultIsHiragana4_ = (value == AttributeValue.ON_); + } + } + + /** + * Retrieve the tag of a special ce + * @param ce ce to test + * @return tag of ce + */ + protected static int getTag(int ce) + { + return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_; + } + + /** + * Checking if ce is special + * @param ce to check + * @return true if ce is special + */ + protected static boolean isSpecial(int ce) + { + return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_; + } + + /** + * Getting the mask for collation strength + * @param strength collation strength + * @return collation element mask + */ + protected static final int getMask(int strength) + { + switch (strength) + { + case Collator.PRIMARY: + return CE_PRIMARY_MASK_; + case Collator.SECONDARY: + return CE_SECONDARY_MASK_ | CE_PRIMARY_MASK_; + default: + return CE_TERTIARY_MASK_ | CE_SECONDARY_MASK_ + | CE_PRIMARY_MASK_; + } + } + + /** + * Gets the primary weights from a CE + * @param ce collation element + * @return the primary weight of the collation element + */ + protected static final int getPrimaryWeight(int ce) + { + return ((ce) & CE_PRIMARY_MASK_) >> CE_PRIMARY_SHIFT_; + } + + /** + * Gets the secondary weights from a CE + * @param ce collation element + * @return the secondary weight of the collation element + */ + protected static final int 
getSecondaryWeight(int ce) + { + return (ce & CE_SECONDARY_MASK_) >> CE_SECONDARY_SHIFT_; + } + + /** + * Gets the tertiary weights from a CE + * @param ce collation element + * @return the tertiary weight of the collation element + */ + protected static final int getTertiaryWeight(int ce) + { + return ce & CE_TERTIARY_MASK_; + } + + // private variables ----------------------------------------------------- + + /** + * The smallest natural unsafe or contraction end char character before + * tailoring. + * This is a combining mark. + */ + private static final int DEFAULT_MIN_HEURISTIC_ = 0x300; + /** + * Heuristic table table size. Size is 32 bytes, 1 bit for each + * latin 1 char, and some power of two for hashing the rest of the chars. + * Size in bytes. + */ + private static final char HEURISTIC_SIZE_ = 1056; + /** + * Mask value down to "some power of two" - 1, + * number of bits, not num of bytes. + */ + private static final char HEURISTIC_OVERFLOW_MASK_ = 0x1fff; + /** + * Unsafe character shift + */ + private static final int HEURISTIC_SHIFT_ = 3; + /** + * Unsafe character addition for character too large, it has to be folded + * then incremented. + */ + private static final char HEURISTIC_OVERFLOW_OFFSET_ = 256; + /** + * Mask value to get offset in heuristic table. + */ + private static final char HEURISTIC_MASK_ = 7; + + private byte m_caseSwitch_; + private int m_common3_; + private byte m_mask3_; + /** + * When switching case, we need to add or subtract different values. 
+ */ + private int m_addition3_; + /** + * Upper range when compressing + */ + private int m_top3_; + /** + * Upper range when compressing + */ + private int m_bottom3_; + private int m_topCount3_; + private int m_bottomCount3_; + /** + * Case first constants + */ + private static final byte CASE_SWITCH_ = (byte)0xC0; + private static final byte NO_CASE_SWITCH_ = 0; + /** + * Case level constants + */ + private static final byte CE_REMOVE_CASE_ = (byte)0x3F; + private static final byte CE_KEEP_CASE_ = (byte)0xFF; + /** + * Case strength mask + */ + private static final byte CE_CASE_BIT_MASK_ = (byte)0xC0; + private static final byte CE_CASE_MASK_3_ = (byte)0xFF; + /** + * Sortkey size factor. Values can be changed. + */ + private static final double PROPORTION_2_ = 0.5; + private static final double PROPORTION_3_ = 0.667; + + // These values come from the UCA ---------------------------------------- + + /** + * This is an enum that lists magic special byte values from the + * fractional UCA + */ + private static final byte BYTE_ZERO_ = 0x0; + private static final byte BYTE_LEVEL_SEPARATOR_ = (byte)0x01; + private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02; + private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03; + private static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_; + private static final byte BYTE_FIRST_TAILORED_ = (byte)0x04; + private static final byte BYTE_COMMON_ = (byte)0x05; + private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_; + private static final byte BYTE_LAST_LATIN_PRIMARY_ = (byte)0x4C; + private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x4D; + private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF; + private static final int COMMON_BOTTOM_2_ = BYTE_COMMON_; + private static final int COMMON_TOP_2_ = 0x86; // int for unsigness + private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1; + private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80; + private static final int 
FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40; + private static final int COMMON_TOP_CASE_SWITCH_OFF_3_ = 0x85; + private static final int COMMON_TOP_CASE_SWITCH_LOWER_3_ = 0x45; + private static final int COMMON_TOP_CASE_SWITCH_UPPER_3_ = 0xC5; + private static final int COMMON_BOTTOM_3_ = 0x05; + private static final int COMMON_BOTTOM_CASE_SWITCH_UPPER_3_ = 0x86; + private static final int COMMON_BOTTOM_CASE_SWITCH_LOWER_3_ = + COMMON_BOTTOM_3_; + private static final int TOP_COUNT_2_ = (int)(PROPORTION_2_ * TOTAL_2_); + private static final int BOTTOM_COUNT_2_ = TOTAL_2_ - TOP_COUNT_2_; + private static final int COMMON_2_ = COMMON_BOTTOM_2_; + private static final int COMMON_UPPER_FIRST_3_ = 0xC5; + private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_; + private static final int COMMON_4_ = (byte)0xFF; + + /** + * If this collator is to generate only simple tertiaries for fast path + */ + private boolean m_isSimple3_; + + /** + * French collation sorting flag + */ + private boolean m_isFrenchCollation_; + /** + * Flag indicating if shifted is requested for quartenary alternate + * handling. If this is not true, the default for alternate handling will + * be non-ignorable. 
+ */ + private boolean m_isAlternateHandlingShifted_; + /** + * Extra case level for sorting + */ + private boolean m_isCaseLevel_; + + private static final int CE_TAG_SHIFT_ = 24; + private static final int CE_TAG_MASK_ = 0x0F000000; + + private static final int SORT_BUFFER_INIT_SIZE_ = 128; + private static final int SORT_BUFFER_INIT_SIZE_1_ = + SORT_BUFFER_INIT_SIZE_ << 3; + private static final int SORT_BUFFER_INIT_SIZE_2_ = SORT_BUFFER_INIT_SIZE_; + private static final int SORT_BUFFER_INIT_SIZE_3_ = SORT_BUFFER_INIT_SIZE_; + private static final int SORT_BUFFER_INIT_SIZE_CASE_ = + SORT_BUFFER_INIT_SIZE_ >> 2; + private static final int SORT_BUFFER_INIT_SIZE_4_ = SORT_BUFFER_INIT_SIZE_; + + private static final int CE_CONTINUATION_TAG_ = 0xC0; + private static final int CE_REMOVE_CONTINUATION_MASK_ = 0xFFFFFF3F; + + private static final int LAST_BYTE_MASK_ = 0xFF; + + private static final int CE_RESET_TOP_VALUE_ = 0x9F000303; + private static final int CE_NEXT_TOP_VALUE_ = 0xE8960303; + + private static final byte SORT_CASE_BYTE_START_ = (byte)0x80; + private static final byte SORT_CASE_SHIFT_START_ = (byte)7; + + private static final byte SORT_LEVEL_TERMINATOR_ = 1; + + /** + * CE buffer size + */ + private static final int CE_BUFFER_SIZE_ = 512; + + // private methods ------------------------------------------------------- + + /** + * Checks if the argument ce is a continuation + * @param ce collation element to test + * @return true if ce is a continuation + */ + private static final boolean isContinuation(int ce) + { + return (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_; + } + + /** + * Gets the 2 bytes of primary order and adds it to the primary byte array + * @param ce current ce + * @param bytes array of byte arrays for each strength + * @param bytescount array of the size of each strength byte arrays + * @param count array of counters for each of the strength + * @param notIsContinuation flag indicating if the current bytes belong to + * a 
continuation ce + * @param doShift flag indicating if ce is to be shifted + * @param leadPrimary lead primary used for compression + * @param commonBottom4 common byte value for quartenary + * @param bottomCount4 smallest byte value for quartenary + * @return the new lead primary for compression + */ + private final int doPrimaryBytes(int ce, byte bytes[][], int bytescount[], + int count[], boolean notIsContinuation, + boolean doShift, int leadPrimary, + int commonBottom4, int bottomCount4) + { + + int p2 = (ce >>= 16) & LAST_BYTE_MASK_; // in ints for unsigned + int p1 = (ce >> 8) & LAST_BYTE_MASK_; // comparison + if (doShift) { + if (count[4] > 0) { + while (count[4] > bottomCount4) { + append(bytes, bytescount, 4, + (byte)(commonBottom4 + bottomCount4)); + count[4] -= bottomCount4; + } + append(bytes, bytescount, 4, + (byte)(commonBottom4 + (count[4] - 1))); + count[4] = 0; + } + // dealing with a variable and we're treating them as shifted + // This is a shifted ignorable + if (p1 != 0) { + // we need to check this since we could be in continuation + append(bytes, bytescount, 4, (byte)p1); + } + if (p2 != 0) { + append(bytes, bytescount, 4, (byte)p2); + } + } + else { + // Note: This code assumes that the table is well built + // i.e. not having 0 bytes where they are not supposed to be. + // Usually, we'll have non-zero primary1 & primary2, except + // in cases of LatinOne and friends, when primary2 will be + // regular and simple sortkey calc + if (p1 != CollationElementIterator.IGNORABLE) { + if (notIsContinuation) { + if (leadPrimary == p1) { + append(bytes, bytescount, 1, (byte)p2); + } + else { + if (leadPrimary != 0) { + append(bytes, bytescount, 1, + (byte)((p1 > leadPrimary) + ? 
BYTE_UNSHIFTED_MAX_ + : BYTE_UNSHIFTED_MIN_)); + } + if (p2 == CollationElementIterator.IGNORABLE) { + // one byter, not compressed + append(bytes, bytescount, 1, (byte)p1); + leadPrimary = 0; + } + else if (p1 < BYTE_FIRST_NON_LATIN_PRIMARY_ + || (p1 > ((CE_RESET_TOP_VALUE_ >> 24) & 0xFF) + && p1 < ((CE_NEXT_TOP_VALUE_ >> 24) & 0xFF))) { + // not compressible + leadPrimary = 0; + append(bytes, bytescount, 1, (byte)p1); + append(bytes, bytescount, 1, (byte)p2); + } + else { // compress + leadPrimary = p1; + append(bytes, bytescount, 1, (byte)p1); + append(bytes, bytescount, 1, (byte)p2); + } + } + } + else { + // continuation, add primary to the key, no compression + append(bytes, bytescount, 1, (byte)p1); + if (p2 != CollationElementIterator.IGNORABLE) { + append(bytes, bytescount, 1, (byte)p2); // second part + } + } + } + } + return leadPrimary; + } + + /** + * Gets the secondary byte and adds it to the secondary byte array + * @param ce current ce + * @param bytes array of byte arrays for each strength + * @param bytescount array of the size of each strength byte arrays + * @param count array of counters for each of the strength + * @param notIsContinuation flag indicating if the current bytes belong to + * a continuation ce + * @param doFrench flag indicator if french sort is to be performed + * @param frenchOffset start and end offsets to source string for reversing + */ + private final void doSecondaryBytes(int ce, byte bytes[][], + int bytescount[], int count[], + boolean notIsContinuation, + boolean doFrench, int frenchOffset[]) + { + int s = (ce >>= 8) & LAST_BYTE_MASK_; // int for comparison + if (s != 0) { + if (!doFrench) { + // This is compression code. + if (s == COMMON_2_ && notIsContinuation) { + count[2] ++; + } + else { + if (count[2] > 0) { + if (s > COMMON_2_) { // not necessary for 4th level. 
+ while (count[2] > TOP_COUNT_2_) { + append(bytes, bytescount, 2, + (byte)(COMMON_TOP_2_ - TOP_COUNT_2_)); + count[2] -= TOP_COUNT_2_; + } + append(bytes, bytescount, 2, + (byte)(COMMON_TOP_2_ - (count[2] - 1))); + } + else { + while (count[2] > BOTTOM_COUNT_2_) { + append(bytes, bytescount, 2, + (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); + count[2] -= BOTTOM_COUNT_2_; + } + append(bytes, bytescount, 2, + (byte)(COMMON_BOTTOM_2_ + (count[2] - 1))); + } + count[2] = 0; + } + append(bytes, bytescount, 2, (byte)s); + } + } + else { + append(bytes, bytescount, 2, (byte)s); + // Do the special handling for French secondaries + // We need to get continuation elements and do intermediate + // restore + // abc1c2c3de with french secondaries need to be edc1c2c3ba + // NOT edc3c2c1ba + if (notIsContinuation) { + if (frenchOffset[0] != -1) { + // reverse secondaries from frenchStartPtr up to + // frenchEndPtr + reverseBuffer(bytes[2], frenchOffset); + frenchOffset[0] = -1; + } + } + else { + if (frenchOffset[0] == -1) { + frenchOffset[0] = bytescount[2] - 2; + } + frenchOffset[1] = bytescount[2] - 1; + } + } + } + } + + /** + * Reverse the argument buffer + * @param buffer to reverse + * @param offset start and end offsets to reverse + */ + private void reverseBuffer(byte buffer[], int offset[]) + { + int start = offset[0]; + int end = offset[1]; + while (start < end) { + byte b = buffer[start]; + buffer[start ++] = buffer[end]; + buffer[end --] = b; + } + } + + /** + * Insert the case shifting byte if required + * @param bytes array of byte arrays corresponding to each strength + * @param bytescount array of the size of the byte arrays + * @param caseshift value + * @return new caseshift value + */ + private static final int doCaseShift(byte bytes[][], int bytescount[], + int caseshift) + { + if (caseshift == 0) { + append(bytes, bytescount, 0, SORT_CASE_BYTE_START_); + caseshift = SORT_CASE_SHIFT_START_; + } + return caseshift; + } + + /** + * Performs the casing sort + 
* @param tertiary byte in ints for easy comparison + * @param bytes of byte arrays for each strength + * @param bytescount array of the size of each strength byte arrays + * @param notIsContinuation flag indicating if the current bytes belong to + * a continuation ce + * @param caseshift + * @return the new value of case shift + */ + private final int doCaseBytes(int tertiary, byte bytes[][], + int bytescount[], boolean notIsContinuation, + int caseshift) + { + caseshift = doCaseShift(bytes, bytescount, caseshift); + + if (notIsContinuation && tertiary != 0) { + byte casebits = (byte)(tertiary & 0xC0); + if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { + if (casebits == 0) { + bytes[0][bytescount[0] - 1] |= (1 << (-- caseshift)); + } + else { + // second bit + caseshift = doCaseShift(bytes, bytescount, caseshift); + bytes[0][bytescount[0] - 1] |= ((casebits >> 6) & 1) + << (-- caseshift); + } + } + else { + if (casebits != 0) { + bytes[0][bytescount[0] - 1] |= 1 << (-- caseshift); + // second bit + caseshift = doCaseShift(bytes, bytescount, caseshift); + bytes[0][bytescount[0] - 1] |= ((casebits >> 7) & 1) + << (-- caseshift); + } + } + } + + return caseshift; + } + + /** + * Gets the tertiary byte and adds it to the tertiary byte array + * @param tertiary byte in int for easy comparison + * @param bytes array of byte arrays for each strength + * @param bytescount array of the size of each strength byte arrays + * @param count array of counters for each of the strength + * @param notIsContinuation flag indicating if the current bytes belong to + * a continuation ce + */ + private final void doTertiaryBytes(int tertiary, byte bytes[][], + int bytescount[], int count[], + boolean notIsContinuation) + { + if (tertiary != 0) { + // This is compression code. 
+ // sequence size check is included in the if clause + if (tertiary == m_common3_ && notIsContinuation) { + count[3] ++; + } + else { + int common3 = m_common3_ & LAST_BYTE_MASK_; + if ((tertiary > common3 + && m_common3_ == COMMON_NORMAL_3_) + || (tertiary <= common3 + && m_common3_ == COMMON_UPPER_FIRST_3_)) { + tertiary += m_addition3_; + } + if (count[3] > 0) { + if (tertiary > common3) { + while (count[3] > m_topCount3_) { + append(bytes, bytescount, 3, + (byte)(m_top3_ - m_topCount3_)); + count[3] -= m_topCount3_; + } + append(bytes, bytescount, 3, + (byte)(m_top3_ - (count[3] - 1))); + } + else { + while (count[3] > m_bottomCount3_) { + append(bytes, bytescount, 3, + (byte)(m_bottom3_ + m_bottomCount3_)); + count[3] -= m_bottomCount3_; + } + append(bytes, bytescount, 3, + (byte)(m_bottom3_ + (count[3] - 1))); + } + count[3] = 0; + } + append(bytes, bytescount, 3, (byte)tertiary); + } + } + } + + /** + * Gets the quartenary byte and adds it to the quartenary byte array + * @param bytes array of byte arrays for each strength + * @param bytescount array of the size of each strength byte arrays + * @param count array of counters for each of the strength + * @param isCodePointHiragana flag indicator if the previous codepoint + * we dealt with was Hiragana + * @param commonBottom4 smallest common quartenary byte + * @param bottomCount4 smallest quartenary byte + * @param hiragana4 hiragana quartenary byte + */ + private final void doQuartenaryBytes(byte bytes[][], int bytescount[], + int count[], + boolean isCodePointHiragana, + int commonBottom4, int bottomCount4, + byte hiragana4) + { + if (isCodePointHiragana) { // This was Hiragana, need to note it + if (count[4] > 0) { // Close this part + while (count[4] > bottomCount4) { + append(bytes, bytescount, 4, (byte)(commonBottom4 + + bottomCount4)); + count[4] -= bottomCount4; + } + append(bytes, bytescount, 4, (byte)(commonBottom4 + + (count[4] - 1))); + count[4] = 0; + } + append(bytes, bytescount, 4, 
hiragana4); // Add the Hiragana + } + else { // This wasn't Hiragana, so we can continue adding stuff + count[4] ++; + } + } + + /** + * Iterates through the argument string for all ces. + * Split the ces into their relevant primaries, secondaries etc. + * @param source normalized string + * @param compare array of flags indicating if a particular strength is + * to be processed + * @param bytes an array of byte arrays corresponding to the strengths + * @param bytescount an array of the size of the byte arrays + * @param count array of compression counters for each strength + * @param doFrench flag indicator if special handling of French has to be + * done + * @param hiragana4 offset for Hiragana quaternary + * @param commonBottom4 smallest common quaternary byte + * @param bottomCount4 smallest quaternary byte + */ + private synchronized final void getSortKeyBytes(String source, + boolean compare[], + byte bytes[][], + int bytescount[], + int count[], + boolean doFrench, + byte hiragana4, + int commonBottom4, + int bottomCount4) + { + int backupDecomposition = m_decomposition_; + m_decomposition_ = NO_DECOMPOSITION; // have to revert to backup later + CollationElementIterator coleiter = + new CollationElementIterator(source, this); + + int frenchOffset[] = {-1, -1}; + + // scriptorder not implemented yet + // const uint8_t *scriptOrder = coll->scriptOrder; + + boolean doShift = false; + boolean notIsContinuation = false; + + int leadPrimary = 0; // int for easier comparison + int caseShift = 0; + + while (true) { + int ce = coleiter.next(); + if (ce == CollationElementIterator.NULLORDER) { + break; + } + + if (ce == CollationElementIterator.IGNORABLE) { + continue; + } + + notIsContinuation = !isContinuation(ce); + + /* + * if (notIsContinuation) { + if (scriptOrder != NULL) { + primary1 = scriptOrder[primary1]; + } + }*/ + doShift = (m_isAlternateHandlingShifted_ + && ((notIsContinuation && ce <= m_variableTopValue_ + && (ce >> 24) != 0)) // primary byte not 0 + 
|| (!notIsContinuation && doShift)); + leadPrimary = doPrimaryBytes(ce, bytes, bytescount, count, + notIsContinuation, doShift, leadPrimary, + commonBottom4, bottomCount4); + if (compare[2]) { + doSecondaryBytes(ce, bytes, bytescount, count, + notIsContinuation, doFrench, frenchOffset); + } + + int t = ce & LAST_BYTE_MASK_; + if (!notIsContinuation) { + t = ce & CE_REMOVE_CONTINUATION_MASK_; + } + + if (compare[0]) { + caseShift = doCaseBytes(t, bytes, bytescount, + notIsContinuation, caseShift); + } + else if (notIsContinuation) { + t ^= m_caseSwitch_; + } + + t &= m_mask3_; + + if (compare[3]) { + doTertiaryBytes(t, bytes, bytescount, count, + notIsContinuation); + } + + if (compare[4] && notIsContinuation) { // compare quad + doQuartenaryBytes(bytes, bytescount, count, + coleiter.m_isCodePointHiragana_, + commonBottom4, bottomCount4, hiragana4); + } + } + m_decomposition_ = backupDecomposition; // reverts to original + if (frenchOffset[0] != -1) { + // one last round of checks + reverseBuffer(bytes[2], frenchOffset); + } + } + + /** + * From the individual strength byte results the final compact sortkey + * will be calculated. 
+ * @param source text string + * @param compare array of flags indicating if a particular strength is + * to be processed + * @param bytes an array of byte arrays corresponding to the strengths + * @param bytescount an array of the size of the byte arrays + * @param count array of compression counters for each strength + * @param doFrench flag indicating that special handling of French has to + * be done + * @param commonBottom4 smallest common quaternary byte + * @param bottomCount4 smallest quaternary byte + * @return the compact sortkey + */ + private final byte[] getSortKey(String source, boolean compare[], + byte bytes[][], int bytescount[], + int count[], boolean doFrench, + int commonBottom4, int bottomCount4) + { + // we have done all the CE's, now let's put them together to form + // a key + if (compare[2]) { + doSecondary(bytes, bytescount, count, doFrench); + if (compare[0]) { + doCase(bytes, bytescount); + } + if (compare[3]) { + doTertiary(bytes, bytescount, count); + if (compare[4]) { + doQuaternary(bytes, bytescount, count, commonBottom4, + bottomCount4); + if (compare[5]) { + doIdentical(source, bytes, bytescount); + } + + } + } + } + append(bytes, bytescount, 1, (byte)0); + return bytes[1]; + } + + /** + * Packs the French bytes + * @param bytes array of byte arrays corresponding to strenghts + * @param bytescount array of the size of byte arrays + * @param count array of compression counts + */ + private final void doFrench(byte bytes[][], int bytescount[], int count[]) + { + for (int i = 0; i < bytescount[2]; i ++) { + byte s = bytes[2][bytescount[2] - i - 1]; + // This is compression code. + if (s == COMMON_2_) { + ++ count[2]; + } + else { + if (count[2] > 0) { + if (s > COMMON_2_) { // not necessary for 4th level. 
+ while (count[2] > TOP_COUNT_2_) { + append(bytes, bytescount, 1, + (byte)(COMMON_TOP_2_ - TOP_COUNT_2_)); + count[2] -= TOP_COUNT_2_; + } + append(bytes, bytescount, 1, (byte)(COMMON_TOP_2_ + - (count[2] - 1))); + } + else { + while (count[2] > BOTTOM_COUNT_2_) { + append(bytes, bytescount, 1, + (byte)(COMMON_BOTTOM_2_ + BOTTOM_COUNT_2_)); + count[2] -= BOTTOM_COUNT_2_; + } + append(bytes, bytescount, 1, (byte)(COMMON_BOTTOM_2_ + + (count[2] - 1))); + } + count[2] = 0; + } + append(bytes, bytescount, 1, s); + } + } + if (count[2] > 0) { + while (count[2] > BOTTOM_COUNT_2_) { + append(bytes, bytescount, 1, (byte)(COMMON_BOTTOM_2_ + + BOTTOM_COUNT_2_)); + count[2] -= BOTTOM_COUNT_2_; + } + append(bytes, bytescount, 1, (byte)(COMMON_BOTTOM_2_ + + (count[2] - 1))); + } + } + + /** + * Compacts the secondary bytes and stores them into the primary array + * @param bytes array of byte arrays corresponding to the strengths + * @param bytecount array of the size of the byte arrays + * @param count array of the number of compression counts + * @param doFrench flag indicator that French has to be handled specially + */ + private final void doSecondary(byte bytes[][], int bytescount[], + int count[], boolean doFrench) + { + if (count[2] > 0) { + while (count[2] > BOTTOM_COUNT_2_) { + append(bytes, bytescount, 2, (byte)(COMMON_BOTTOM_2_ + + BOTTOM_COUNT_2_)); + count[2] -= BOTTOM_COUNT_2_; + } + append(bytes, bytescount, 2, (byte)(COMMON_BOTTOM_2_ + + (count[2] - 1))); + } + + append(bytes, bytescount, 1, SORT_LEVEL_TERMINATOR_); + + if (doFrench) { // do the reverse copy + doFrench(bytes, bytescount, count); + } + else { + if (bytes[1].length <= bytescount[1] + bytescount[2]) { + bytes[1] = increase(bytes[1], bytescount[1], bytescount[2]); + } + System.arraycopy(bytes[2], 0, bytes[1], bytescount[1], + bytescount[2]); + bytescount[1] += bytescount[2]; + } + } + + /** + * Increase buffer size + * @param array array of bytes + * @param size of the byte array + * @param 
incrementsize size to increase + * @return the new buffer + */ + private static final byte[] increase(byte buffer[], int size, + int incrementsize) + { + byte result[] = new byte[buffer.length + incrementsize]; + System.arraycopy(buffer, 0, result, 0, size); + return result; + } + + /** + * Increase buffer size + * @param array array of bytes + * @param size of the byte array + * @param incrementsize size to increase + * @return the new buffer + */ + private static final int[] increase(int buffer[], int size, + int incrementsize) + { + int result[] = new int[buffer.length + incrementsize]; + System.arraycopy(buffer, 0, result, 0, size); + return result; + } + + /** + * Compacts the case bytes and stores them into the primary array + * @param bytes array of byte arrays corresponding to the strengths + * @param bytecount array of the size of the byte arrays + */ + private final void doCase(byte bytes[][], int bytescount[]) + { + append(bytes, bytescount, 1, SORT_LEVEL_TERMINATOR_); + if (bytes[1].length <= bytescount[1] + bytescount[0]) { + bytes[1] = increase(bytes[1], bytescount[1], bytescount[0]); + } + if (bytes[1].length <= bytescount[1] + bytescount[0]) { + bytes[1] = increase(bytes[1], bytescount[1], bytescount[0]); + } + System.arraycopy(bytes[0], 0, bytes[1], bytescount[1], bytescount[0]); + bytescount[1] += bytescount[0]; + } + + /** + * Compacts the tertiary bytes and stores them into the primary array + * @param bytes array of byte arrays corresponding to the strengths + * @param bytecount array of the size of the byte arrays + * @param count array of the number of compression counts + */ + private final void doTertiary(byte bytes[][], int bytescount[], + int count[]) + { + if (count[3] > 0) { + if (m_common3_ != COMMON_BOTTOM_3_) { + while (count[3] >= m_topCount3_) { + append(bytes, bytescount, 3, (byte)(m_top3_ + - m_topCount3_)); + count[3] -= m_topCount3_; + } + append(bytes, bytescount, 3, (byte)(m_top3_ - count[3])); + } + else { + while (count[3] 
> m_bottomCount3_) { + append(bytes, bytescount, 3, (byte)(m_bottom3_ + + m_bottomCount3_)); + count[3] -= m_bottomCount3_; + } + append(bytes, bytescount, 3, (byte)(m_bottom3_ + + (count[3] - 1))); + } + } + append(bytes, bytescount, 1, SORT_LEVEL_TERMINATOR_); + if (bytes[1].length <= bytescount[1] + bytescount[3]) { + bytes[1] = increase(bytes[1], bytescount[1], bytescount[3]); + } + System.arraycopy(bytes[3], 0, bytes[1], bytescount[1], bytescount[3]); + bytescount[1] += bytescount[3]; + } + + /** + * Compacts the quaternary bytes and stores them into the primary array + * @param bytes array of byte arrays corresponding to the strengths + * @param bytecount array of the size of the byte arrays + * @param count array of compression counts + */ + private final void doQuaternary(byte bytes[][], int bytescount[], + int count[], int commonbottom4, + int bottomcount4) + { + if (count[4] > 0) { + while (count[4] > bottomcount4) { + append(bytes, bytescount, 4, (byte)(commonbottom4 + + bottomcount4)); + count[4] -= bottomcount4; + } + append(bytes, bytescount, 4, (byte)(commonbottom4 + + (count[4] - 1))); + } + append(bytes, bytescount, 1, SORT_LEVEL_TERMINATOR_); + if (bytes[1].length <= bytescount[1] + bytescount[4]) { + bytes[1] = increase(bytes[1], bytescount[1], bytescount[4]); + } + System.arraycopy(bytes[4], 0, bytes[1], bytescount[1], bytescount[4]); + bytescount[1] += bytescount[4]; + } + + /** + * Deals with the identical sort. + * Appends the BOCSU version of the source string to the ends of the + * byte buffer. 
+ * @param source text string + * @param bytes array of a byte array corresponding to the strengths + * @param bytescount array of the byte array size + */ + private final void doIdentical(String source, byte bytes[][], + int bytescount[]) + { + int isize = BOSCU.lengthOfIdenticalLevelRun(source); + append(bytes, bytescount, 1, SORT_LEVEL_TERMINATOR_); + if (bytes[1].length <= bytescount[1] + isize) { + bytes[1] = increase(bytes[1], bytescount[1], 1 + isize); + } + BOSCU.writeIdenticalLevelRun(source, bytes[1], bytescount[1]); + } + + /** + * Gets the offset of the first unmatched characters in source and target. + * This method returns the offset of the start of a contraction or a + * combining sequence, if the first difference is in the middle of such a + * sequence. + * @param source string + * @param target string + * @return offset of the first unmatched characters in source and target. + */ + private final int getFirstUnmatchedOffset(String source, String target) + { + int result = 0; + while (source.charAt(result) == target.charAt(result) + && source.charAt(result) != 0) { + result ++; + } + if (result > 0) { + // There is an identical portion at the beginning of the two + // strings. If the identical portion ends within a contraction or a + // combining character sequence, back up to the start of that + // sequence. + char schar = source.charAt(result); // first differing chars + char tchar = target.charAt(result); + if (schar != 0 && isUnsafe(schar) || tchar != 0 && isUnsafe(tchar)) + { + // We are stopped in the middle of a contraction or combining + // sequence. + // Look backwards for the part of the string for the start of + // the sequence + // It doesn't matter which string we scan, since they are the + // same in this region. 
+ do { + result --; + } + while (result > 0 && isUnsafe(source.charAt(result))); + } + } + return result; + } + + /** + * Appending an byte to an array of bytes and increases it if we run out of + * space + * @param array of byte arrays + * @param array of the end offsets corresponding to array + * @param appendarrayindex of the int array to append + * @param value to append + */ + private static final void append(byte array[][], int arrayoffset[], + int appendarrayindex, byte value) + { + if (arrayoffset[appendarrayindex] + 1 + >= array[appendarrayindex].length) { + array[appendarrayindex] = increase(array[appendarrayindex], + arrayoffset[appendarrayindex], + SORT_BUFFER_INIT_SIZE_); + } + array[appendarrayindex][arrayoffset[appendarrayindex]] = value; + arrayoffset[appendarrayindex] ++; + } + + /** + * This is a trick string compare function that goes in and uses sortkeys + * to compare. It is used when compare gets in trouble and needs to bail + * out. + * @param source text string + * @param target text string + */ + private final int compareBySortKeys(String source, String target) + { + CollationKey sourcekey = getCollationKey(source); + CollationKey targetkey = getCollationKey(target); + return sourcekey.compareTo(targetkey); + } + + /** + * Performs the primary comparisons, and fills up the CE buffer at the + * same time. + * The return value toggles between the comparison result and the hiragana + * result. If either the source is greater than target or vice versa, the + * return result is the comparison result, ie 1 or -1, furthermore the + * cebuffers will be cleared when that happens. If the primary comparisons + * are equal, we'll have to continue with secondary comparison. In this case + * the cebuffer will not be cleared and the return result will be the + * hiragana result. 
+ * @param doHiragana4 flag indicator that Hiragana Quaternary has to be + * observed + * @param lowestpvalue the lowest primary value that will not be ignored if + * alternate handling is shifted + * @param source text string + * @param target text string + * @param textoffset offset in text to start the comparison + * @param cebuffer array of CE buffers to populate, offset 0 for source, + * 1 for target, cleared when a primary difference is + * found. + * @param cebuffersize array of CE buffer size corresponding to the + * cebuffer, 0 when a primary difference is found. + * @return comparion result if a primary difference is found, otherwise + * hiragana result + */ + private final int doPrimaryCompare(boolean doHiragana4, int lowestpvalue, + String source, String target, + int textoffset, int cebuffer[][], + int cebuffersize[]) + { + // Preparing the context objects for iterating over strings + UCharacterIterator siter = new UCharacterIterator(source, textoffset, + source.length()); + CollationElementIterator scoleiter = new CollationElementIterator( + siter, this); + UCharacterIterator titer = new UCharacterIterator(target, textoffset, + target.length()); + CollationElementIterator tcoleiter = new CollationElementIterator( + titer, this); + + // Non shifted primary processing is quite simple + if (!m_isAlternateHandlingShifted_) { + int hiraganaresult = 0; + while (true) { + int sorder = 0; + // We fetch CEs until we hit a non ignorable primary or end. 
+ do { + sorder = scoleiter.next(); + append(cebuffer, cebuffersize, 0, sorder); + sorder &= CE_PRIMARY_MASK_; + } while (sorder == CollationElementIterator.IGNORABLE); + + int torder = 0; + do { + torder = tcoleiter.next(); + append(cebuffer, cebuffersize, 1, torder); + torder &= CE_PRIMARY_MASK_; + } while (torder == CollationElementIterator.IGNORABLE); + + // if both primaries are the same + if (sorder == torder) { + // and there are no more CEs, we advance to the next level + if (cebuffer[0][cebuffersize[0] - 1] + == CollationElementIterator.NULLORDER) { + break; + } + if (doHiragana4 && hiraganaresult == 0 + && scoleiter.m_isCodePointHiragana_ != + tcoleiter.m_isCodePointHiragana_) { + if (scoleiter.m_isCodePointHiragana_) { + hiraganaresult = -1; + } + else { + hiraganaresult = 1; + } + } + } + else { + // if two primaries are different, we are done + return endCompare(sorder, torder, cebuffer, cebuffersize); + } + } + // no primary difference... do the rest from the buffers + return hiraganaresult; + } + else { // shifted - do a slightly more complicated processing :) + while (true) { + int sorder = getPrimaryShiftedCompareCE(scoleiter, lowestpvalue, + cebuffer, cebuffersize, 0); + int torder = getPrimaryShiftedCompareCE(tcoleiter, lowestpvalue, + cebuffer, cebuffersize, 1); + if (sorder == torder) { + if (cebuffer[0][cebuffersize[0] - 1] + == CollationElementIterator.NULLORDER) { + break; + } + else { + continue; + } + } + else { + return endCompare(sorder, torder, cebuffer, cebuffersize); + } + } // no primary difference... do the rest from the buffers + } + return 0; + } + + /** + * This is used only when we know that sorder is already different from + * torder. + * Compares sorder and torder, returns -1 if sorder is less than torder. + * Clears the cebuffer at the same time. 
+ * @param sorder source strength order + * @param torder target strength order + * @param cebuffer array of buffers containing the ce values + * @param cebuffersize array of cebuffer offsets + * @return the comparison result of sorder and torder + */ + private static final int endCompare(int sorder, int torder, + int cebuffer[][], int cebuffersize[]) + { + cebuffer[0] = null; + cebuffer[1] = null; + cebuffersize[0] = 0; + cebuffersize[1] = 0; + if (sorder < torder) { + return -1; + } + return 1; + } + + /** + * Calculates the next primary shifted value and fills up cebuffer with the + * next non-ignorable ce. + * @param coleiter collation element iterator + * @param doHiragana4 flag indicator if hiragana quaternary is to be + * handled + * @param lowestpvalue lowest primary shifted value that will not be + * ignored + * @param cebuffer array of buffers to append with the next ce + * @param cebuffersize array of offsets corresponding to the cebuffer + * @param cebufferindex index of the buffer to append to + * @return result next modified ce + */ + private final static int getPrimaryShiftedCompareCE( + CollationElementIterator coleiter, + int lowestpvalue, int cebuffer[][], + int cebuffersize[], int cebufferindex) + { + boolean shifted = false; + int result = CollationElementIterator.IGNORABLE; + while (true) { + result = coleiter.next(); + if (result == CollationElementIterator.NULLORDER) { + append(cebuffer, cebuffersize, cebufferindex, result); + break; + } + else if (result == CollationElementIterator.IGNORABLE) { + continue; + } + else if (isContinuation(result)) { + if ((result & CE_PRIMARY_MASK_) + != CollationElementIterator.IGNORABLE) { + // There is primary value + if (shifted) { + result = (result & CE_PRIMARY_MASK_) + | CE_CONTINUATION_MARKER_; + // preserve interesting continuation + append(cebuffer, cebuffersize, cebufferindex, result); + continue; + } + else { + append(cebuffer, cebuffersize, cebufferindex, result); + break; + } + } + else { // Just 
lower level values + if (!shifted) { + append(cebuffer, cebuffersize, cebufferindex, result); + } + } + } + else { // regular + if ((result & CE_PRIMARY_MASK_) > lowestpvalue) { + append(cebuffer, cebuffersize, cebufferindex, result); + break; + } + else { + if ((result & CE_PRIMARY_MASK_) > 0) { + shifted = true; + result &= CE_PRIMARY_MASK_; + append(cebuffer, cebuffersize, cebufferindex, result); + continue; + } + else { + append(cebuffer, cebuffersize, cebufferindex, result); + shifted = false; + continue; + } + } + } + } + result &= CE_PRIMARY_MASK_; + return result; + } + + /** + * Appending an int to an array of ints and increases it if we run out of + * space + * @param array of int arrays + * @param array of the end offsets corresponding to array + * @param appendarrayindex of the int array to append + * @param value to append + */ + private static final void append(int array[][], int arrayoffset[], + int appendarrayindex, int value) + { + if (arrayoffset[appendarrayindex] + 1 + >= array[appendarrayindex].length) { + array[appendarrayindex] = increase(array[appendarrayindex], + arrayoffset[appendarrayindex], + CE_BUFFER_SIZE_); + } + array[appendarrayindex][arrayoffset[appendarrayindex]] = value; + arrayoffset[appendarrayindex] ++; + } + + /** + * Does secondary strength comparison based on the collected ces. 
+ * @param cebuffer array of int arrays that contains the collected ces + * @param cebuffersize array of offsets corresponding to the cebuffer, + * indicates the offset of the last ce in buffer + * @param doFrench flag indicates if French ordering is to be done + * @return the secondary strength comparison result + */ + private static final int doSecondaryCompare(int cebuffer[][], + int cebuffersize[], + boolean doFrench) + { + // now, we're gonna reexamine collected CEs + if (!doFrench) { // normal + int offset = 0; + while (true) { + int sorder = CollationElementIterator.IGNORABLE; + while (sorder == CollationElementIterator.IGNORABLE) { + sorder = cebuffer[0][offset ++] & CE_SECONDARY_MASK_; + } + int torder = CollationElementIterator.IGNORABLE; + while (torder == CollationElementIterator.IGNORABLE) { + torder = cebuffer[1][offset ++] & CE_SECONDARY_MASK_; + } + + if (sorder == torder) { + if (cebuffer[0][offset - 1] + == CollationElementIterator.NULLORDER) { + break; + } + } + else { + return (sorder < torder) ? -1 : 1; + } + } + } + else { // do the French + int continuationoffset[] = {0, 0}; + int offset[] = {cebuffersize[0] - 2, cebuffersize[1] - 2} ; + while (true) { + int sorder = getSecondaryFrenchCE(cebuffer, offset, + continuationoffset, 0); + int torder = getSecondaryFrenchCE(cebuffer, offset, + continuationoffset,1); + if (sorder == torder) { + if (cebuffer[0][offset[0] - 1] + == CollationElementIterator.NULLORDER + || (offset[0] < 0 && offset[1] < 0)) { + break; + } + } + else { + return (sorder < torder) ? -1 : 1; + } + } + } + return 0; + } + + /** + * Calculates the next secondary french CE. 
+ * @param cebuffer array of buffers to append with the next ce + * @param offset array of offsets corresponding to the cebuffer + * @param continuationoffset index of the start of a continuation + * @param index of cebuffer to use + * @return result next modified ce + */ + private static final int getSecondaryFrenchCE(int cebuffer[][], + int offset[], + int continuationoffset[], + int index) + { + int result = CollationElementIterator.IGNORABLE; + while (result == CollationElementIterator.IGNORABLE + && offset[index] >= 0) { + if (continuationoffset[index] == 0) { + while (isContinuation(cebuffer[0][offset[index] --])); + // after this, sorder is at the start of continuation, + // and offset points before that + if (isContinuation(cebuffer[0][offset[index] + 1])) { + // save offset for later + continuationoffset[index] = offset[index]; + offset[index] += 2; + } + } + else { + result = cebuffer[0][offset[index] ++]; + if (!isContinuation(result)) { + // we have finished with this continuation + offset[index] = continuationoffset[index]; + // reset the pointer to before continuation + continuationoffset[index] = 0; + continue; + } + } + result &= CE_SECONDARY_MASK_; // remove continuation bit + } + return result; + } + + /** + * Does case strength comparison based on the collected ces. 
+ * @param cebuffer array of int arrays that contains the collected ces + * @return the case strength comparison result + */ + private final int doCaseCompare(int cebuffer[][]) + { + int sorder = CollationElementIterator.IGNORABLE; + int torder = CollationElementIterator.IGNORABLE; + int soffset = 0; + int toffset = 0; + while (true) { + while ((sorder & CE_REMOVE_CASE_) + == CollationElementIterator.IGNORABLE) { + sorder = cebuffer[0][soffset ++]; + if (!isContinuation(sorder)) { + sorder &= CE_CASE_MASK_3_; + sorder ^= m_caseSwitch_; + } + else { + sorder = CollationElementIterator.IGNORABLE; + } + } + + while ((torder & CE_REMOVE_CASE_) + == CollationElementIterator.IGNORABLE) { + torder = cebuffer[1][toffset ++]; + if (!isContinuation(sorder)) { + torder &= CE_CASE_MASK_3_; + torder ^= m_caseSwitch_; + } + else { + torder = CollationElementIterator.IGNORABLE; + } + } + + if ((sorder & CE_CASE_BIT_MASK_) < (torder & CE_CASE_BIT_MASK_)) { + return -1; + } + else if ((sorder & CE_CASE_BIT_MASK_) + > (torder & CE_CASE_BIT_MASK_)) { + return 1; + } + + if (cebuffer[0][soffset - 1] == CollationElementIterator.NULLORDER) + { + break; + } + else { + sorder = CollationElementIterator.IGNORABLE; + torder = CollationElementIterator.IGNORABLE; + } + } + return 0; + } + + /** + * Does tertiary strength comparison based on the collected ces. 
     * @param cebuffer array of int arrays that contains the collected ces
     * @return the tertiary strength comparison result
     */
    private final int doTertiaryCompare(int cebuffer[][])
    {
        int soffset = 0;
        int toffset = 0;
        while (true) {
            int sorder = CollationElementIterator.IGNORABLE;
            int torder = CollationElementIterator.IGNORABLE;
            // get the next non-ignorable tertiary weight from the source;
            // m_mask3_ selects the tertiary bits relevant under the current
            // case settings, m_caseSwitch_ flips the case ordering
            while ((sorder & CE_REMOVE_CASE_)
                   == CollationElementIterator.IGNORABLE) {
                sorder = cebuffer[0][soffset ++] & m_mask3_;
                if (!isContinuation(sorder)) {
                    sorder ^= m_caseSwitch_;
                }
                else {
                    sorder &= CE_REMOVE_CASE_;
                }
            }

            // get the next non-ignorable tertiary weight from the target
            while ((torder & CE_REMOVE_CASE_)
                   == CollationElementIterator.IGNORABLE) {
                torder = cebuffer[1][toffset ++] & m_mask3_;
                if (!isContinuation(torder)) {
                    torder ^= m_caseSwitch_;
                }
                else {
                    torder &= CE_REMOVE_CASE_;
                }
            }

            if (sorder == torder) {
                // NOTE(review): this compares the raw buffered ce against a
                // masked NULLORDER; if CE_REMOVE_CASE_ clears any bit, the
                // raw NULLORDER sentinel can never equal the masked constant
                // and this loop may not terminate - confirm against the
                // values of NULLORDER and CE_REMOVE_CASE_ (the C original
                // compares the masked weight to its NO_MORE_CES_TERTIARY
                // constant)
                if (cebuffer[0][soffset - 1]
                    == (CollationElementIterator.NULLORDER & CE_REMOVE_CASE_)) {
                    break;
                }
            }
            else {
                return (sorder < torder) ? -1 : 1;
            }
        }
        return 0;
    }

    /**
     * Does quaternary strength comparison based on the collected ces.
+ * @param cebuffer array of int arrays that contains the collected ces + * @param lowestpvalue the lowest primary value that will not be ignored if + * alternate handling is shifted + * @return the quaternary strength comparison result + */ + private final int doQuaternaryCompare(int cebuffer[][], int lowestpvalue) + { + boolean sShifted = true; + boolean tShifted = true; + int soffset = 0; + int toffset = 0; + while (true) { + int sorder = CollationElementIterator.IGNORABLE; + int torder = CollationElementIterator.IGNORABLE; + while (sorder == CollationElementIterator.IGNORABLE + && sorder != CollationElementIterator.NULLORDER + || (isContinuation(sorder) && !sShifted)) { + sorder = cebuffer[0][soffset ++]; + if (isContinuation(sorder)) { + if (!sShifted) { + continue; + } + } + else if (sorder > lowestpvalue + || (sorder & CE_PRIMARY_MASK_) + == CollationElementIterator.IGNORABLE) { + // non continuation + sorder = CE_PRIMARY_MASK_; + sShifted = false; + } + else { + sShifted = true; + } + } + sorder &= CE_PRIMARY_MASK_; + while (torder == CollationElementIterator.IGNORABLE + && torder != CollationElementIterator.NULLORDER + || (isContinuation(torder) && !tShifted)) { + torder = cebuffer[0][toffset ++]; + if (isContinuation(torder)) { + if (!tShifted) { + continue; + } + } + else if (torder > lowestpvalue + || (torder & CE_PRIMARY_MASK_) + == CollationElementIterator.IGNORABLE) { + // non continuation + torder = CE_PRIMARY_MASK_; + tShifted = false; + } + else { + tShifted = true; + } + } + torder &= CE_PRIMARY_MASK_; + + if (sorder == torder) { + if (cebuffer[0][soffset -1] + == CollationElementIterator.NULLORDER) { + break; + } + } + else { + return (sorder < torder) ? -1 : 1; + } + } + return 0; + } + + /** + * Internal function. Does byte level string compare. Used by strcoll if + * strength == identical and strings are otherwise equal. This is a rare + * case. Comparison must be done on NFD normalized strings. FCD is not good + * enough. 
+ * @param source text + * @param target text + * @param offset of the first difference in the text strings + * @param normalize flag indicating if we are to normalize the text before + * comparison + * @return 1 if source is greater than target, -1 less than and 0 if equals + */ + private static final int doIdenticalCompare(String source, String target, + int offset, boolean normalize) + { + if (normalize) { + /* + if (unorm_quickCheck(sColl->string, sLen, UNORM_NFD) != UNORM_YES) { + source = unorm_decompose(sColl->writableBuffer, + sColl->writableBufSize, + sBuf, sLen, FALSE, FALSE); + } + + if (unorm_quickCheck(tColl->string, tLen, UNORM_NFD) != UNORM_YES) { + target = unorm_decompose(tColl->writableBuffer, + tColl->writableBufSize, + tBuf, tLen, FALSE, FALSE); + } + */ + offset = 0; + } + + return doStringCompare(source, target, offset); + } + + /** + * Compares string for their codepoint order. + * This comparison handles surrogate characters and place them after the + * all non surrogate characters. + * @param source text + * @param target text + * @param offset start offset for comparison + * @return 1 if source is greater than target, -1 less than and 0 if equals + */ + private static final int doStringCompare(String source, + String target, + int offset) + { + // compare identical prefixes - they do not need to be fixed up + char schar = 0; + char tchar = 0; + while (true) { + schar = source.charAt(offset); + tchar = target.charAt(offset ++); + if (schar != tchar) { + break; + } + if (schar == 0) { + return 0; + } + } + + // if both values are in or above the surrogate range, Fix them up. + if (schar >= UTF16.LEAD_SURROGATE_MIN_VALUE + && tchar >= UTF16.LEAD_SURROGATE_MIN_VALUE) { + schar = fixupUTF16(schar); + tchar = fixupUTF16(tchar); + } + + // now c1 and c2 are in UTF-32-compatible order + return (schar < tchar) ? 
-1 : 1; // schar and tchar has to be different + } + + /** + * Rotate surrogates to the top to get code point order + */ + private static final char fixupUTF16(char ch) + { + if (ch >= 0xe000) { + ch -= 0x800; + } + else { + ch += 0x2000; + } + return ch; + } +}